Merge branch 'release/dirac-ITT-2020'

Merge pull request #312 from i-kanamori/debug_512
add reordering of random number generators in IO
2026-07-29 15:03:28 +01:00 · 2020-10-13 13:38:29 -04:00 · 2020-10-13 11:42:12 -04:00 · 2020-10-13 11:41:38 -04:00 · 2020-10-12 12:33:13 +01:00 · 2020-10-10 16:52:56 +01:00
161 changed files with 1290 additions and 496 deletions
@@ -9,11 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
-      env: PREC=single
-    - os:        osx
-      osx_image: xcode8.3
-      compiler: clang
-      env: PREC=double
      
 before_install:
    - export GRIDDIR=`pwd`
@@ -55,7 +50,7 @@ script:
    - make -j4
    - make install
    - cd $CWD/build
-    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
+    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
@@ -34,6 +34,12 @@
 #define __SYCL__REDEFINE__
 #endif

+/* HIP save and restore compile environment*/
+#ifdef GRID_HIP
+#pragma push
+#pragma push_macro("__HIP_DEVICE_COMPILE__")
+#endif
+#define EIGEN_NO_HIP

 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -52,6 +58,12 @@
 #pragma pop
 #endif

+/* HIP restore compile environment; guard matches the GRID_HIP push above */
+#ifdef GRID_HIP
+#pragma pop_macro("__HIP_DEVICE_COMPILE__")
+#pragma pop
+#endif
+
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -138,21 +138,6 @@ public:
 		      int recv_from_rank,
 		      int bytes);
  
-  void SendRecvPacket(void *xmit,
-		      void *recv,
-		      int xmit_to_rank,
-		      int recv_from_rank,
-		      int bytes);
-  
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-			   void *xmit,
-			   int xmit_to_rank,
-			   void *recv,
-			   int recv_from_rank,
-			   int bytes);
-  
-  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
@@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}

-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int xmit_to_rank,
-					   int recv_from_rank,
-					   int bytes)
-{
-  assert(0);
-}
-

 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0);
-}
-
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  assert(0);
-}
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
@@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int recv_from_rank,
 						     int bytes, int dir)
 {
-  std::vector<CommsRequest_t> list;
-  // Discard the "dir"
-  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
-  SendToRecvFromComplete(list);
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
-  // Discard the "dir"
-  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
-  SendToRecvFromComplete(waitall);
 }

 void CartesianCommunicator::StencilBarrier(void){};
@@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
+#ifdef GRID_HIP
+#include <hip/hip_runtime_api.h>
+#endif

 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryMpi: "
@@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_CUDA
+#if defined(GRID_CUDA) ||defined(GRID_HIP)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifndef GRID_MPI3_SHM_NONE
-  auto err =  cudaMalloc(&ShmCommBuf, bytes);
-#else
-  auto err =  cudaMallocManaged(&ShmCommBuf, bytes);
-#endif
-  if ( err !=  cudaSuccess) {
-    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
-    exit(EXIT_FAILURE);  
-  }
+  ShmCommBuf = acceleratorAllocDevice(bytes);
+
  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);

@@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
+#ifdef GRID_CUDA
    cudaIpcMemHandle_t handle;
-    
    if ( r==WorldShmRank ) { 
-      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
+#endif
+#ifdef GRID_HIP
+    hipIpcMemHandle_t handle;    
+    if ( r==WorldShmRank ) { 
+      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
+      if ( err !=  hipSuccess) {
+	std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+#endif
    //////////////////////////////////////////////////
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
@@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
    void * thisBuf = ShmCommBuf;
+#ifdef GRID_CUDA
    if ( r!=WorldShmRank ) { 
-      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
+#endif
+#ifdef GRID_HIP
+    if ( r!=WorldShmRank ) { 
+      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
+      if ( err !=  hipSuccess) {
+	std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+#endif
    ///////////////////////////////////////////////////////////////
    // Save a copy of the device buffers
    ///////////////////////////////////////////////////////////////
@@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  autoView( lhs_v , lhs, AcceleratorRead);
  autoView( rhs_v , rhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    auto rhs_t=rhs_v(ss);
+    auto tmp  =ret_v(ss);
    mac(&tmp,&lhs_t,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
@@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
+    auto tmp  =ret_v(ss);
    auto lhs_t=lhs_v(ss);
    mac(&tmp,&lhs_t,&rhs);
    coalescedWrite(ret_v[ss],tmp);
@@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
+    auto tmp  =ret_v(ss);
    auto rhs_t=rhs_v(ss);
    mac(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
@@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);

 #ifdef GRID_HIP
 extern hipDeviceProp_t *gpu_props;
+#define WARP_SIZE 64
 #endif
 #ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
+#define WARP_SIZE 32
 #endif

-#define WARP_SIZE 32
 __device__ unsigned int retirementCount = 0;

 template <class Iterator>
@@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
  
  // cannot use overloaded operators for sobj as they are not volatile-qualified
  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
-  __syncwarp();
+  acceleratorSynchronise();
  
  const Iterator VEC = WARP_SIZE;
  const Iterator vid = tid & (VEC-1);
@@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
      beta += temp;
      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
    }
-    __syncwarp();
+    acceleratorSynchronise();
  }
-  __syncthreads();
+  acceleratorSynchroniseAll();
  
  if (threadIdx.x == 0) {
    beta  = Zero();
@@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
    }
    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
  }
-  __syncthreads();
+  acceleratorSynchroniseAll();
 }


@@ -130,6 +130,8 @@ public:
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){

    if ( log.active ) {
+      std::ios_base::fmtflags f(stream.flags());
+
      stream << log.background()<<  std::left;
      if (log.topWidth > 0)
      {
@@ -152,6 +154,8 @@ public:
 	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
+      stream.flags(f);
+
      return stream;
    } else { 
      return devnull;
@@ -1,3 +1,4 @@
 #include <Grid/GridCore.h>

-int Grid::BinaryIO::latticeWriteMaxRetry = -1;
+int                    Grid::BinaryIO::latticeWriteMaxRetry = -1;
+Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
@@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
+  struct IoPerf
+  {
+    uint64_t size{0},time{0};
+    double   mbytesPerSecond{0.};
+  };
+
+  static IoPerf lastPerf;
  static int latticeWriteMaxRetry;

  /////////////////////////////////////////////////////////////////////////////
@@ -502,12 +509,15 @@ class BinaryIO {
      timer.Stop();
    }
    
+    lastPerf.size            = sizeof(fobj)*iodata.size()*nrank;
+    lastPerf.time            = timer.useconds();
+    lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
+    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
+	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;

    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;

@@ -663,10 +673,15 @@ class BinaryIO {
 	     nersc_csum,scidac_csuma,scidac_csumb);

    timer.Start();
-    thread_for(lidx,lsites,{
+    thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
-      parallel_rng.SetState(tmp,lidx);
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(lidx, lcoor);
+      int o_idx=grid->oIndex(lcoor);
+      int i_idx=grid->iIndex(lcoor);
+      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
+      parallel_rng.SetState(tmp,gidx);
      });
    timer.Stop();

@@ -723,7 +738,12 @@ class BinaryIO {
    std::vector<RNGstate> iodata(lsites);
    thread_for(lidx,lsites,{
      std::vector<RngStateType> tmp(RngStateCount);
-      parallel_rng.GetState(tmp,lidx);
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(lidx, lcoor);
+      int o_idx=grid->oIndex(lcoor);
+      int i_idx=grid->iIndex(lcoor);
+      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
+      parallel_rng.GetState(tmp,gidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
    });
    timer.Stop();
@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
 static constexpr int Zm = 6;
 static constexpr int Tm = 7;

-static constexpr int Nc=3;
+static constexpr int Nc=Config_Nc;
 static constexpr int Ns=4;
 static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
@@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   template<int Naik> accelerator_inline
+   template<int Naik> 
+   static accelerator_inline
   void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
   void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
   void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
@@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
   void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
   void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
   void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
@@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
+   
   void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
-  LatticeInteger zz (UGrid);   zz=0.0;
+  PropagatorField zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for (int s=0;s<Ls;s++) {

@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);

-  LatticeInteger zz (UGrid);   zz=0.0;
+  PropagatorField  zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);

  for(int s=0;s<Ls;s++){
@@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);


 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int sF, int sU, 
@@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,


 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,


 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
@@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
@@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> 
 void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 {
@@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
      ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
  });

-template <class Impl>
+template <class Impl> 
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
-template <class Impl>
+template <class Impl> 
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
  HAND_RESULT_EXT(ss,F)

 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void							\
+  template<> accelerator_inline void							\
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
    nmu = 0;								\
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Impl> void 
+template<class Impl> accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_RESULT(ss);
 }

-template<class Impl>
+template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_RESULT(ss);
 }

-template<class Impl> void 
+template<class Impl>  accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_RESULT(ss);
 }

-template<class Impl>
+template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_RESULT(ss);
 }

-template<class Impl> void 
+template<class Impl>  accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_RESULT_EXT(ss);
 }

-template<class Impl>
+template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  ////////////////////////////////////////////////////////////////////
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
  coalescedWrite(out[sF],result,lane);
 };

-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
 					  SiteHalfSpinor *buf, int sF,
 					  int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
  ////////////////////////////////////////////////////////////////////
  // Interior kernels
  ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
  coalescedWrite(out[sF], result,lane);
 };

-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
 ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
  }
 };

-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
 };

 #define DhopDirMacro(Dir,spProj,spRecon)	\
-  template <class Impl>							\
+  template <class Impl> accelerator_inline				\
  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
 					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
  {									\
@@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
 DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);

-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
 {
@@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+typedef struct { uint16_t x;} half;
+#endif
+typedef struct Half2_t { half x; half y; } Half2;
+
 #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )

 template<class pair>
@@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
 }

 constexpr int NSIMD_RealH    = COALESCE_GRANULARITY / sizeof(half);
-constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
+constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
 constexpr int NSIMD_RealF    = COALESCE_GRANULARITY / sizeof(float);
 constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
 constexpr int NSIMD_RealD    = COALESCE_GRANULARITY / sizeof(double);
 constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
 constexpr int NSIMD_Integer  = COALESCE_GRANULARITY / sizeof(Integer);

-typedef GpuComplex<half2  > GpuComplexH;
+typedef GpuComplex<Half2  > GpuComplexH;
 typedef GpuComplex<float2 > GpuComplexF;
 typedef GpuComplex<double2> GpuComplexD;

@@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;
 accelerator_inline float half2float(half h)
 {
  float f;
-#ifdef GRID_SIMT
+#if defined(GRID_CUDA) || defined(GRID_HIP)
  f = __half2float(h);
 #else 
-  //f = __half2float(h);
-  __half_raw hr(h);
  Grid_half hh; 
  hh.x = hr.x;
  f=  sfw_half_to_float(hh);
@@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
 accelerator_inline half float2half(float f)
 {
  half h;
-#ifdef GRID_SIMT
+#if defined(GRID_CUDA) || defined(GRID_HIP)
  h = __float2half(f);
 #else
  Grid_half hh = sfw_float_to_half(f);
-  __half_raw hr;  
-  hr.x = hh.x;
-  h = __half(hr);
+  h.x = hh.x;
 #endif
  return h;
 }
@@ -523,7 +524,7 @@ namespace Optimization {
    ////////////////////////////////////////////////////////////////////////////////////
    // Single / Half
    ////////////////////////////////////////////////////////////////////////////////////
-    static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
+     static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
      int N = GpuVectorCF::N;
      GpuVectorCH h;
      for(int i=0;i<N;i++) {
@@ -55,6 +55,7 @@ void acceleratorInit(void)
 	printf("AcceleratorCudaInit[%d]: ========================\n",rank);
 	printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);

+
 	GPU_PROP_FMT(totalGlobalMem,"%lld");
 	GPU_PROP(managedMemory);
 	GPU_PROP(isMultiGpuBoard);
@@ -109,20 +110,24 @@ void acceleratorInit(void)
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}

+  printf("world_rank %d has %d devices\n",world_rank,nDevices);
+  size_t totalDeviceMem=0;
  for (int i = 0; i < nDevices; i++) {

 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
    hipGetDeviceProperties(&gpu_props[i], i);
+    hipDeviceProp_t prop; 
+    prop = gpu_props[i];
+    totalDeviceMem = prop.totalGlobalMem;
    if ( world_rank == 0) {
-      hipDeviceProp_t prop; 
-      prop = gpu_props[i];
      printf("AcceleratorHipInit: ========================\n");
      printf("AcceleratorHipInit: Device Number    : %d\n", i);
      printf("AcceleratorHipInit: ========================\n");
      printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);

+      GPU_PROP_FMT(totalGlobalMem,"%lu");
      //      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
      GPU_PROP(warpSize);
@@ -131,6 +136,7 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
+  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_IBM_SUMMIT
@@ -307,17 +307,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 inline void *acceleratorAllocShared(size_t bytes)
 {
-#if 0
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
-    printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
+    printf(" hipMallocManaged failed for %zu %s \n",bytes,hipGetErrorString(err)); // bytes is size_t, so %zu is the matching specifier; the failure is only reported and NULL is returned to the caller
  }
  return ptr;
-#else
-  return malloc(bytes);
-#endif
 };
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }

@@ -327,7 +323,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
-    printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
+    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
 };
@@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu

 ``` bash
 mkdir build; cd build
-../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
+../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```

-where `--enable-precision=` set the default precision,
-`--enable-simd=` set the SIMD type, `--enable-
+where `--enable-simd=` sets the SIMD type, `--enable-
 comms=`, and `<path>` should be replaced by the prefix path where you want to
 install Grid. Other options are detailed in the next section, you can also use `configure
 --help` to display them. Like with any other program using GNU autotool, the
@@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
+- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
+- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
@@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used:
 The following configuration is recommended for the Intel Knights Landing platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto  \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi       \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation.
 We recommend four ranks per node for best performance, but optimum is local volume dependent.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
@@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
 The following configuration is recommended for the Intel Haswell platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -262,8 +257,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -280,8 +274,7 @@ This is the default.
 The following configuration is recommended for the Intel Skylake platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=mpiicpc
@@ -298,8 +291,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -330,8 +322,7 @@ and 8 threads per rank.
 The following configuration is recommended for the AMD EPYC platform.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3 \
             CXX=mpicxx 
 ```
@@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu

 ``` bash
 mkdir build; cd build
-../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
+../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```

-where `--enable-precision=` set the default precision,
-`--enable-simd=` set the SIMD type, `--enable-
+where `--enable-simd=` sets the SIMD type, `--enable-
 comms=`, and `<path>` should be replaced by the prefix path where you want to
 install Grid. Other options are detailed in the next section, you can also use `configure
 --help` to display them. Like with any other program using GNU autotool, the
@@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
+- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
+- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
@@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used:
 The following configuration is recommended for the Intel Knights Landing platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto  \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi       \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation.
 We recommend four ranks per node for best performance, but optimum is local volume dependent.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
@@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
 The following configuration is recommended for the Intel Haswell platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -266,8 +261,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -284,8 +278,7 @@ This is the default.
 The following configuration is recommended for the Intel Skylake platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=mpiicpc
@@ -302,8 +295,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -334,8 +326,7 @@ and 8 threads per rank.
 The following configuration is recommended for the AMD EPYC platform.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3 \
             CXX=mpicxx 
 ```
@@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64

 scl enable gcc-toolset-10 bash

-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"

 * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login

 scl enable gcc-toolset-10 bash
 module load mpi/openmpi-aarch64

-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"

 ------------------------------------------------------------------------------

 * armclang 20.2 (qp4)

-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"

 ------------------------------------------------------------------------------

 * gcc 10.0.1 VLA (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static


 * gcc 10.0.1 fixed-size ACLE (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"


 * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI
@@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1
 export MPICH_CC=gcc-10.0.1
 export MPICH_CXX=g++-10.0.1

-$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"
+$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"

 --------------------------------------------------------

 * armclang 20.0 VLA (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static

 TODO check ARMCLANGCOMPAT


 * armclang 20.1 VLA (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static

 TODO check ARMCLANGCOMPAT


 * armclang 20.1 VLA (fjt cluster)

-../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
+../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"

 TODO check ARMCLANGCOMPAT


 * armclang 20.1 VLA w/MPI (fjt cluster)

-../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"
+../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"

 No ARMCLANGCOMPAT -> still correct ?

@@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ?

 * Fujitsu fcc

-../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"
+../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"


 * Fujitsu fcc w/ MPI

-../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
+../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
@@ -1,8 +1,16 @@

 #include "Benchmark_IO.hpp"

+#ifndef BENCH_IO_LMIN
+#define BENCH_IO_LMIN 8
+#endif
+
 #ifndef BENCH_IO_LMAX
-#define BENCH_IO_LMAX 40
+#define BENCH_IO_LMAX 32
+#endif
+
+#ifndef BENCH_IO_NPASS
+#define BENCH_IO_NPASS 10
 #endif

 using namespace Grid;
@@ -12,37 +20,179 @@ std::string filestem(const int l)
  return "iobench_l" + std::to_string(l);
 }

+int vol(const int i) // maps index i to the value BENCH_IO_LMIN + 2*i (steps of 2 starting at BENCH_IO_LMIN)
+{
+  return BENCH_IO_LMIN + 2*i;
+}
+
+int volInd(const int l) // maps l back to an index: (l - BENCH_IO_LMIN)/2, using integer division
+{
+  return (l - BENCH_IO_LMIN)/2;
+}
+
+template <typename Mat> // Mat: an Eigen type providing Zero(rows, cols), cwiseProduct and +=, /= (used below)
+void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data) // writes the element-wise mean and sample standard deviation of data into mean / stdDev
+{
+  auto            nr = data[0].rows(), nc = data[0].cols(); // every entry is combined element-wise, so all are taken to share the shape of data[0]
+  Eigen::MatrixXd sqSum(nr, nc);
+  double          n = static_cast<double>(data.size());
+
+  assert(n > 1.); // the (n - 1) normalisation below needs at least two samples
+  mean  = Mat::Zero(nr, nc);
+  sqSum = Mat::Zero(nr, nc);
+  for (auto &d: data)
+  {
+    mean  += d; // until the final division, mean holds the running sum
+    sqSum += d.cwiseProduct(d);
+  }
+  stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseMax(0.).cwiseSqrt(); // clamp at 0: rounding in the one-pass formula can give a tiny negative variance, whose sqrt would be NaN
+  mean  /= n;
+}
+
+#define grid_printf(...) \
+{ /* printf-style formatting into a fixed stack buffer, then streamed through MSG; output longer than the buffer is truncated */ \
+  char _buf[1024];\
+  snprintf(_buf, sizeof(_buf), __VA_ARGS__);\
+  MSG << _buf;\
+}
+
+enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3};
+
 int main (int argc, char ** argv)
 {
-#ifdef HAVE_LIME
   Grid_init(&argc,&argv);
 
-  int64_t threads = GridThread::GetThreads();
+  int64_t                      threads = GridThread::GetThreads();
+  auto                         mpi     = GridDefaultMpi();
+  unsigned int                 nVol    = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1; // number of local extents BENCH_IO_LMIN, +2, ..., BENCH_IO_LMAX
+  unsigned int                 nRelVol = (BENCH_IO_LMAX - 24)/2 + 1; // number of extents 24, 26, ..., BENCH_IO_LMAX entering the volume average
+  std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4)); // perf[pass](volInd(l), op): MB/s for each pass, extent and operation
+  std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4)); // avPerf[pass](op): perf averaged over the extents 24..BENCH_IO_LMAX
+  std::vector<int>             latt;
+
   MSG << "Grid is setup to use " << threads << " threads" << std::endl;
-  MSG << SEP << std::endl;
-  MSG << "Benchmark Lime write" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
+  MSG << "MPI partition " << mpi << std::endl;
+  for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) // each pass runs std write/read, plus C-Lime write/read when HAVE_LIME is defined, over all extents
   {
-    auto             mpi  = GridDefaultMpi();
-    std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << BIGSEP << std::endl;
+    MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
+    MSG << BIGSEP << std::endl;
+    MSG << SEP << std::endl;
+    MSG << "Benchmark std write" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; // global lattice = local extent l times the MPI partition in each direction
 
-    std::cout << "-- Local volume " << l << "^4" << std::endl;
-    writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
+      perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
+
+    MSG << SEP << std::endl;
+    MSG << "Benchmark std read" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
+      perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
+
+  #ifdef HAVE_LIME // when HAVE_LIME is undefined the gRead/gWrite columns keep their initial zeros
+    MSG << SEP << std::endl;
+    MSG << "Benchmark Grid C-Lime write" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
+      perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
+
+    MSG << SEP << std::endl;
+    MSG << "Benchmark Grid C-Lime read" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
+      perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
+#endif
+    avPerf[i].fill(0.);
+    for (int f = 0; f < 4; ++f)
+    for (int l = 24; l <= BENCH_IO_LMAX; l += 2) // only extents from 24 upwards enter the per-pass average
+    {
+      avPerf[i](f) += perf[i](volInd(l), f);
+    }
+    avPerf[i] /= nRelVol;
   }
 
-  MSG << "Benchmark Lime read" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    auto             mpi  = GridDefaultMpi();
-    std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
+  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
+  double          n = BENCH_IO_NPASS;
 
-    std::cout << "-- Local volume " << l << "^4" << std::endl;
-    readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
+  stats(mean, stdDev, perf);
+  stats(avMean, avStdDev, avPerf);
+  rob.fill(100.);
+  rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs()); // rob = 100 - 100*stdDev/|mean|, element-wise
+  avRob.fill(100.);
+  avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs());
+
+  MSG << BIGSEP << std::endl;
+  MSG << "SUMMARY" << std::endl;
+  MSG << BIGSEP << std::endl;
+  MSG << "Summary of individual results (all results in MB/s)." << std::endl;
+  MSG << "Every second column gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "L", "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
+                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
+                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
+                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
   }
+  MSG << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl; // streamed, not printf-formatted: a plain % is correct and \% is not a valid escape sequence
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s\n",
+              "L", "std read", "std write", "Grid read", "Grid write");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
+                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
+                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second column gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
+              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  MSG << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s\n",
+              "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
+              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
 
   Grid_finalize();
-#endif
+
   return EXIT_SUCCESS;
 }
@@ -5,6 +5,8 @@
 #ifdef HAVE_LIME
 #define MSG std::cout << GridLogMessage
 #define SEP \
+"-----------------------------------------------------------------------------"
+#define BIGSEP \
 "============================================================================="

 namespace Grid {
@@ -14,13 +16,152 @@ using WriterFn = std::function<void(const std::string, Field &)> ;
 template <typename Field>
 using ReaderFn = std::function<void(Field &, const std::string)>;

+// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
+// 
+// template <typename Field>
+// void stdWrite(const std::string filestem, Field &vec)
+// {
+//   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
+//   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
+//   size_t        size;
+//   uint32_t      crc;
+//   GridStopWatch ioWatch, crcWatch;
+
+//   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+//   autoView(vec_v, vec, CpuRead);
+//   crcWatch.Start();
+//   crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
+//   std::fwrite(&crc, sizeof(uint32_t), 1, file);
+//   crcWatch.Stop();
+//   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
+//   ioWatch.Start();
+//   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+//   ioWatch.Stop();
+//   std::fclose(file);
+//   size *= vec.Grid()->ProcessorCount();
+//   auto &p = BinaryIO::lastPerf;
+//   p.size            = size;
+//   p.time            = ioWatch.useconds();
+//   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+//   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() 
+//       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+//   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+// }
+//
+// template <typename Field>
+// void stdRead(Field &vec, const std::string filestem)
+// {
+//   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
+//   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
+//   size_t        size;
+//   uint32_t      crcRead, crcData;
+//   GridStopWatch ioWatch, crcWatch;
+
+//   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+//   crcWatch.Start();
+//   std::fread(&crcRead, sizeof(uint32_t), 1, file);
+//   crcWatch.Stop();
+//   {
+//     autoView(vec_v, vec, CpuWrite);
+//     ioWatch.Start();
+//     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+//     ioWatch.Stop();
+//     std::fclose(file);
+//   }
+//   {
+//     autoView(vec_v, vec, CpuRead);
+//     crcWatch.Start();
+//     crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
+//     crcWatch.Stop();
+//   }
+//   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
+//   assert(crcData == crcRead);
+//   size *= vec.Grid()->ProcessorCount();
+//   auto &p = BinaryIO::lastPerf;
+//   p.size            = size;
+//   p.time            = ioWatch.useconds();
+//   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+//   MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed() 
+//       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+//   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+// }
+
+/// Write the rank-local data of `vec` to "<filestem>.<rank>.bin" through a
+/// std::ofstream: first the CRC32 of the data (one uint32_t), then the raw
+/// data itself (lSites()*sizeof(scalar_object) bytes).
+///
+/// The time spent in the data write (flush included) and the resulting
+/// bandwidth are stored in BinaryIO::lastPerf; the byte count stored there is
+/// the local size multiplied by ProcessorCount().
+///
+/// \param filestem path stem of the output file (rank and ".bin" are appended)
+/// \param vec      field to write
+template <typename Field>
+void stdWrite(const std::string filestem, Field &vec)
+{
+  const std::string rankStr  = std::to_string(vec.Grid()->ThisRank());
+  const std::string filename = filestem + "." + rankStr + ".bin";
+  std::ofstream     file(filename, std::ios::out | std::ios::binary);
+  size_t            size;
+  uint32_t          crc;
+  GridStopWatch     ioWatch, crcWatch;
+
+  // A stream that failed to open silently ignores every write, which would
+  // make the timed section a no-op and the reported bandwidth meaningless.
+  assert(file.is_open() && "stdWrite: cannot open output file");
+
+  size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  autoView(vec_v, vec, CpuRead);
+  crcWatch.Start();
+  crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
+  file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t));
+  crcWatch.Stop();
+  MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
+  ioWatch.Start();
+  file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), static_cast<std::streamsize>(size));
+  // flush inside the timed region so buffered data is pushed out of the stream
+  file.flush();
+  ioWatch.Stop();
+  // Catch short or failed writes (e.g. disk full) before reporting numbers.
+  assert(file.good() && "stdWrite: write to output file failed");
+  size *= vec.Grid()->ProcessorCount();
+  auto &p = BinaryIO::lastPerf;
+  p.size            = size;
+  p.time            = ioWatch.useconds();
+  p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() 
+      << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
+}
+
+/// Read the rank-local data of `vec` back from "<filestem>.<rank>.bin"
+/// through a std::ifstream. The file is expected to hold a CRC32 (one
+/// uint32_t) followed by lSites()*sizeof(scalar_object) bytes of data, which
+/// is the layout stdWrite produces.
+///
+/// The CRC32 of the data read is recomputed and asserted equal to the stored
+/// one. The time spent in the data read and the resulting bandwidth are
+/// stored in BinaryIO::lastPerf; the byte count stored there is the local
+/// size multiplied by ProcessorCount().
+///
+/// \param vec      field to fill
+/// \param filestem path stem of the input file (rank and ".bin" are appended)
+template <typename Field>
+void stdRead(Field &vec, const std::string filestem)
+{
+  const std::string rankStr  = std::to_string(vec.Grid()->ThisRank());
+  const std::string filename = filestem + "." + rankStr + ".bin";
+  std::ifstream     file(filename, std::ios::in | std::ios::binary);
+  size_t            size;
+  uint32_t          crcRead = 0, crcData;
+  GridStopWatch     ioWatch, crcWatch;
+
+  // Without this check a missing file would leave crcRead and the field
+  // untouched and the benchmark would time reads that never happened.
+  assert(file.is_open() && "stdRead: cannot open input file");
+
+  size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+  crcWatch.Start();
+  file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t));
+  crcWatch.Stop();
+  {
+    autoView(vec_v, vec, CpuWrite);
+    ioWatch.Start();
+    file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), static_cast<std::streamsize>(size));
+    ioWatch.Stop();
+  }
+  // failbit is set when fewer bytes than requested could be extracted
+  assert(!file.fail() && "stdRead: short or failed read");
+  {
+    autoView(vec_v, vec, CpuRead);
+    crcWatch.Start();
+    crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
+    crcWatch.Stop();
+  }
+  MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
+  assert(crcData == crcRead);
+  size *= vec.Grid()->ProcessorCount();
+  auto &p = BinaryIO::lastPerf;
+  p.size            = size;
+  p.time            = ioWatch.useconds();
+  p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed() 
+      << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+  MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
+}
+
+// Write `vec` as a single SciDAC field record, with an empty user record,
+// using ScidacWriter. The writer is constructed with this rank's IsBoss()
+// flag and the output goes to "<filestem>.lime.bin".
 template <typename Field>
 void limeWrite(const std::string filestem, Field &vec)
 {
   emptyUserRecord   record;
   ScidacWriter binWriter(vec.Grid()->IsBoss());

-  binWriter.open(filestem + ".bin");
+  binWriter.open(filestem + ".lime.bin");
   binWriter.writeScidacFieldRecord(vec, record);
   binWriter.close();
 }
@@ -31,7 +172,7 @@ void limeRead(Field &vec, const std::string filestem)
  emptyUserRecord   record;
  ScidacReader binReader;

-  binReader.open(filestem + ".bin");
+  binReader.open(filestem + ".lime.bin");
  binReader.readScidacFieldRecord(vec, record);
  binReader.close();
 }
@@ -73,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
  auto                           simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
  std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
  std::shared_ptr<GridBase>      gPt;
+  std::random_device             rd;

  makeGrid(gPt, gBasePt, Ls, rb);

-  GridBase                       *g = gPt.get();
-  GridParallelRNG                rng(g);
-  Field                          vec(g);
+  GridBase         *g = gPt.get();
+  GridParallelRNG  rng(g);
+  Field            vec(g);
+
+  rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd())});

  random(rng, vec);
  write(filename, vec);
@@ -96,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename,

  makeGrid(gPt, gBasePt, Ls, rb);

-  GridBase                       *g = gPt.get();
-  Field                          vec(g);
+  GridBase *g = gPt.get();
+  Field    vec(g);

  read(vec, filename);
 }
@@ -1,14 +1,9 @@
 #include "Benchmark_IO.hpp"

-#define MSG std::cout << GridLogMessage
-#define SEP \
-"============================================================================="
-
 using namespace Grid;

 int main (int argc, char ** argv)
 {
-#ifdef HAVE_LIME
  std::vector<std::string> dir;
  unsigned int             Ls;
  bool                     rb;
@@ -34,46 +29,71 @@ int main (int argc, char ** argv)
  }
  Grid_init(&argc,&argv);

-
  int64_t threads = GridThread::GetThreads();
+  auto    mpi     = GridDefaultMpi();
+
  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
-  MSG << SEP << std::endl;
-  MSG << "Benchmark double precision Lime write" << std::endl;
-  MSG << SEP << std::endl;
-  for (auto &d: dir)
-  {
-    MSG << "-- Directory " << d << std::endl;
-    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
-  }
+  MSG << "MPI partition " << mpi << std::endl;

  MSG << SEP << std::endl;
-  MSG << "Benchmark double precision Lime read" << std::endl;
+  MSG << "Benchmark Grid std write" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
-    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
+    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
+                                   stdWrite<LatticeFermion>, Ls, rb);
+  }
+  MSG << SEP << std::endl;
+  MSG << "Benchmark Grid std read" << std::endl;
+  MSG << SEP << std::endl;
+  for (auto &d: dir)
+  {
+    MSG << "-- Directory " << d << std::endl;
+    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
+                                  stdRead<LatticeFermion>, Ls, rb);
  }

+#ifdef HAVE_LIME
  MSG << SEP << std::endl;
-  MSG << "Benchmark single precision Lime write" << std::endl;
+  MSG << "Benchmark Grid C-Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
-    writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
+    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
+                                   limeWrite<LatticeFermion>, Ls, rb);
  }
+  MSG << SEP << std::endl;
+  MSG << "Benchmark Grid C-Lime read" << std::endl;
+  MSG << SEP << std::endl;
+  for (auto &d: dir)
+  {
+    MSG << "-- Directory " << d << std::endl;
+    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
+                                  limeRead<LatticeFermion>, Ls, rb);
+  }
+#endif

-  MSG << SEP << std::endl;
-  MSG << "Benchmark single precision Lime read" << std::endl;
-  MSG << SEP << std::endl;
-  for (auto &d: dir)
-  {
-    MSG << "-- Directory " << d << std::endl;
-    readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
-  }
+  // MSG << SEP << std::endl;
+  // MSG << "Benchmark single precision Lime write" << std::endl;
+  // MSG << SEP << std::endl;
+  // for (auto &d: dir)
+  // {
+  //   MSG << "-- Directory " << d << std::endl;
+  //   writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
+  // }
+
+  // MSG << SEP << std::endl;
+  // MSG << "Benchmark single precision Lime read" << std::endl;
+  // MSG << SEP << std::endl;
+  // for (auto &d: dir)
+  // {
+  //   MSG << "-- Directory " << d << std::endl;
+  //   readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
+  // }

  Grid_finalize();
-#endif
+
  return EXIT_SUCCESS;
 }
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -125,7 +125,7 @@ public:
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
-	std::cout << GridLogMessage<< latt_size <<std::endl;
+
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -137,8 +137,8 @@ public:
 	for(int d=0;d<8;d++){
 	  xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	  rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}

 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -202,6 +202,8 @@ public:
    return;
  }

+
+  
  static void Memory(void)
  {
    const int Nvec=8;
@@ -222,7 +224,7 @@ public:


  uint64_t lmax=32;
-#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=8){
@@ -247,11 +249,6 @@ public:
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x-y;
-	autoView( x_v , x, CpuWrite);
-	autoView( y_v , y, CpuWrite);
-	autoView( z_v , z, CpuRead);
-        x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
-        y_v[4]=z_v[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
@@ -266,6 +263,61 @@ public:
  };


+  // Time the lattice product z = x*y of Nc4 x Nc4 single-precision complex
+  // matrix fields (Nc4 = 4) and print one line per lattice size.
+  //
+  // The local extent `lat` runs over 8,16,24,32 and the global lattice is
+  // lat*mpi_layout[mu] in each direction. For every size the product is
+  // repeated NLOOP times; the printed columns follow the header line
+  // (L, bytes, GB/s, Gflop/s, seconds, GB/s per node). `bytes` counts three
+  // fields (two operands and the result); `flops` uses 6 per complex multiply
+  // plus 8 per multiply-add for each of the Nc4*Nc4 output elements per site.
+  static void SU4(void)
+  {
+    const int Nc4=4;
+    typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+    Coordinate mpi_layout  = GridDefaultMpi();
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking z = x*y SU(4) bandwidth"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
+    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+    uint64_t NN;
+
+    uint64_t lmax=32;
+    // Iteration count scales inversely with the local volume, so each lattice
+    // size does a comparable amount of work.
+#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+
+    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    for(int lat=8;lat<=lmax;lat+=8){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      NN =Grid.NodeCount();
+
+      LatticeSU4 z(&Grid); z=Zero();
+      LatticeSU4 x(&Grid); x=Zero();
+      LatticeSU4 y(&Grid); y=Zero();
+
+      uint64_t Nloop=NLOOP;
+
+      double start=usecond();
+      // counter has the same type as Nloop to avoid a signed/unsigned comparison
+      for(uint64_t i=0;i<Nloop;i++){
+        z=x*y;
+      }
+      double stop=usecond();
+      // elapsed time per iteration, scaled by 1000 from the units of usecond()
+      double time = (stop-start)/Nloop*1000;
+
+      double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
+      double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
+      std::cout<<GridLogMessage<<std::setprecision(3) 
+               << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
+               << "\t\t"<< bytes/time/NN <<std::endl;
+
+    }
+  };
+
+
  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
@@ -296,6 +348,7 @@ public:
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
@@ -324,7 +377,7 @@ public:
    typedef LatticeGaugeFieldF Gauge;
    
    ///////// Source preparation ////////////
-    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
    Fermion src   (FGrid); random(RNG5,src);
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
@@ -369,7 +422,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 50;
+	uint64_t ncall = 500;

 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

@@ -387,7 +440,13 @@ public:
 	FGrid->Barrier();
 	
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344.0*volume)/2;
+
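+	// Nc=3 gives
+	// 1344 = 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
+	// 1344 = Nc*(6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2   (with Nc=3, Ns=4, Nd=4)
+	// NOTE(review): the fps expression below evaluates to 1200 for Nc=3, Ns=4, Nd=4,
+	// not 1344; its last two terms each lack a factor of 2 relative to the line above.
+	//	double flops=(1344.0*volume)/2;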
+	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns  + Nd*Nc*Ns*2;
+	double flops=(fps*volume)/2;
 	double mf_hi, mf_lo, mf_err;

 	timestat.statistics(t_time);
@@ -402,6 +461,7 @@ public:
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;

+	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
@@ -478,7 +538,7 @@ public:
    typedef typename Action::FermionField Fermion; 
    typedef LatticeGaugeFieldF Gauge;
    
-    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 

    typename Action::ImplParams params;
    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
@@ -596,11 +656,12 @@ int main (int argc, char ** argv)
 #endif
  Benchmark::Decomposition();

+  int do_su4=1;
  int do_memory=1;
  int do_comms =1;

-  int sel=2;
-  std::vector<int> L_list({16,24,32});
+  int sel=4;
+  std::vector<int> L_list({8,12,16,24,32});
  int selm1=sel-1;

  std::vector<double> wilson;
@@ -624,7 +685,6 @@ int main (int argc, char ** argv)
    dwf4.push_back(result);
  }

-  /*
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -632,14 +692,13 @@ int main (int argc, char ** argv)
    double result = Benchmark::Staggered(L_list[l]) ;
    staggered.push_back(result);
  }
-  */

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

@@ -651,6 +710,13 @@ int main (int argc, char ** argv)
    Benchmark::Memory();
  }

+  if ( do_su4 ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    Benchmark::SU4();
+  }
+  
  if ( do_comms && (NN>1) ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
@@ -108,7 +108,7 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
 #if 0
  Umu=1.0;
@@ -0,0 +1,364 @@
+ /*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf.cc
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+// Trivial wrapper holding a single value of type `d`.
+// It is not referenced anywhere else in this file.
+template<class d>
+struct scal {
+  d internal;
+};
+
+  // Gamma-matrix selectors in the order X, Y, Z, T. main() indexes this
+  // table with the direction mu (0..Nd-1) when building the reference
+  // Wilson hopping term.
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+
+// Single-precision domain-wall Dhop benchmark and self-check.
+//
+// Steps, in order:
+//   1. build 4d/5d (and red-black) grids from the default lattice, Ls from "-Ls <n>" (default 8);
+//   2. draw a random normalised source and a random gauge field;
+//   3. build a Cshift-based reference for the hopping term and compare it
+//      with DomainWallFermionF::Dhop (exits with -1 on mismatch);
+//   4. time ncall applications of Dhop and of DhopEO and print flop rates;
+//   5. repeat the comparison for the daggered operator (printed only);
+//   6. check that DhopEO + DhopOE reproduce the unpreconditioned Dhop.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  int threads = GridThread::GetThreads();
+
+  Coordinate latt4 = GridDefaultLatt();
+  int Ls=8;
+  // "-Ls" needs a following value; without the bound check argv[i+1] would be
+  // the terminating null pointer when "-Ls" is the last argument.
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls" && i+1<argc){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+  GridLogLayout();
+
+  // flop count per 5d site used for all rates printed below
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionF src   (FGrid); random(RNG5,src);
+#if 0
+  src = Zero();
+  {
+    Coordinate origin({0,0,0,latt4[2]-1,0});
+    SpinColourVectorF tmp;
+    tmp=Zero();
+    tmp()(0)(0)=Complex(-2.0,0.0);
+    std::cout << " source site 0 " << tmp<<std::endl;
+    pokeSite(tmp,src,origin);
+  }
+#else
+  // normalise the random source to unit norm
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+#endif
+
+
+  LatticeFermionF result(FGrid); result=Zero();
+  LatticeFermionF    ref(FGrid);    ref=Zero();
+  LatticeFermionF    tmp(FGrid);
+  LatticeFermionF    err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+#if 0
+  Umu=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    LatticeColourMatrixF ttmp(UGrid);
+    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
+    //    if (mu !=2 ) ttmp = 0;
+    //    ttmp = ttmp* pow(10.0,mu);
+    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
+  }
+  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
+#endif
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  // replicate across fifth dimension
+  LatticeGaugeFieldF Umu5d(FGrid);
+  std::vector<LatticeColourMatrixF> U(4,FGrid);
+  {
+    autoView( Umu5d_v, Umu5d, CpuWrite);
+    autoView( Umu_v  , Umu  , CpuRead);
+    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
+      for(int s=0;s<Ls;s++){
+	Umu5d_v[Ls*ss+s] = Umu_v[ss];
+      }
+    }
+  }
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+  }
+  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
+
+  if (1)
+  {
+    // reference hopping term; Cshift direction is mu+1 because the fifth
+    // dimension is index 0 on the 5d grid
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
+  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  int ncall =1000;
+
+  if (1) {
+    FGrid->Barrier();
+    Dw.ZeroCounters();
+    // one untimed call before the timed loop
+    Dw.Dhop(src,result,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      Dw.Dhop(src,result,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=single_site_flops*volume*ncall;
+
+    // NOTE(review): the fields in this benchmark are built on vComplexF; if
+    // vComplex is a different (double-precision) type, the bytes-per-complex
+    // factor simdwidth/nsimd below overstates the traffic -- confirm.
+    auto nsimd = vComplex::Nsimd();
+    auto simdwidth = sizeof(vComplex);
+
+    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
+    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
+    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl;
+    std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl;
+    err = ref-result;
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    //exit(0);
+
+    if(( norm2(err)>1.0e-4) ) {
+      /*
+      std::cout << "RESULT\n " << result<<std::endl;
+      std::cout << "REF   \n " << ref   <<std::endl;
+      std::cout << "ERR   \n " << err   <<std::endl;
+      */
+      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert (norm2(err)< 1.0e-4 );
+    Dw.Report();
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    // same construction as the reference above with the sign of the Gamma
+    // terms flipped
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+    }
+    ref = -0.5*ref;
+  }
+  //  dump=1;
+  Dw.Dhop(src,result,1);
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
+  err = ref-result;
+  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
+  // the dagger comparison is only printed; a mismatch does not abort the run
+  if((norm2(err)>1.0e-4)){
+/*
+	std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
+	std::cout<< "DAG sRESULT\n " <<result  << std::endl;
+	std::cout<< "DAG ERR   \n "  << err    <<std::endl;
+*/
+  }
+  LatticeFermionF src_e (FrbGrid);
+  LatticeFermionF src_o (FrbGrid);
+  LatticeFermionF r_e   (FrbGrid);
+  LatticeFermionF r_o   (FrbGrid);
+  LatticeFermionF r_eo  (FGrid);
+
+  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+
+  // S-direction is INNERMOST and takes no part in the parity.
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    Dw.ZeroCounters();
+    FGrid->Barrier();
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+#ifdef CUDA_PROFILE
+      if(i==10) cudaProfilerStart();
+#endif
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+#ifdef CUDA_PROFILE
+      if(i==20) cudaProfilerStop();
+#endif
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    // half the sites are touched by a single-parity application
+    double flops=(single_site_flops*volume*ncall)/2.0;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
+    Dw.Report();
+  }
+  Dw.DhopEO(src_o,r_e,DaggerNo);
+  Dw.DhopOE(src_e,r_o,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
+
+  // reassemble the two parities into a full field for comparison with Dhop
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  err = r_eo-result;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  if((norm2(err)>1.0e-4)){
+    /*
+	std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
+	std::cout<< "Deo REF\n " <<result  << std::endl;
+	std::cout<< "Deo ERR   \n " << err <<std::endl;
+    */
+  }
+
+  // src_e / src_o are reused here to hold the two parities of the error
+  pickCheckerboard(Even,src_e,err);
+  pickCheckerboard(Odd,src_o,err);
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
+  Grid_finalize();
+  exit(0);
+}
@@ -63,7 +63,7 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  LatticeGaugeFieldF Umu(UGrid); 
-  SU3::HotConfiguration(RNG4,Umu); 
+  SU<Nc>::HotConfiguration(RNG4,Umu); 
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

  RealD mass=0.1;
@@ -30,7 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
- ;
+


 int main (int argc, char ** argv)
@@ -53,7 +53,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Seeded"<<std::endl;

-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);

  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;

@@ -36,12 +36,12 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-#define LMAX (48)
+#define LMAX (40)
 #define LMIN (8)
 #define LADD (8)

-  int64_t Nwarm=50;
-  int64_t Nloop=500;
+  int64_t Nwarm=10;
+  int64_t Nloop=100;

  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
@@ -118,6 +118,41 @@ int main (int argc, char ** argv)

    }

+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z=z+ x*y"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
+
+      for(int64_t i=0;i<Nwarm;i++){
+	z=z+x*y;
+      }
+      double start=usecond();
+      for(int64_t i=0;i<Nloop;i++){
+	z=z+x*y;
+      }
+      double stop=usecond();
+      double time = (stop-start)/Nloop*1000.0;
+      
+      double bytes=4*vol*Nc*Nc*sizeof(Complex);
+      double flops=Nc*Nc*(6+8+8)*vol;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+
+    }
+
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -143,7 +178,6 @@ int main (int argc, char ** argv)
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
 	mult(z,x,y);
-	//	mac(z,x,y);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000.0;
@@ -187,7 +187,8 @@ int main (int argc, char ** argv)
 	  auto xx = coalescedRead(x_v[ss]);
 	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = coalescedRead(z_v[ss]);
-	  zz = zz+xx*yy;
+	  //zz = zz+xx*yy;
+	  mac(&zz,&xx,&yy);
 	  coalescedWrite(z_v[ss],zz);
        });
      }
@@ -123,6 +123,24 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### Nc: number of colours of the gauge group (default 3)
+AC_ARG_ENABLE([Nc],
+    [AC_HELP_STRING([--enable-Nc=2|3|4|5], [select number of colours Nc (default: 3)])],
+    [ac_Nc=${enable_Nc}], [ac_Nc=3])
+
+case ${ac_Nc} in
+    2)
+        AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
+    3)
+        AC_DEFINE([Config_Nc],[3],[Gauge group Nc]);;
+    4)
+        AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);;
+    5)
+        AC_DEFINE([Config_Nc],[5],[Gauge group Nc]);;
+    *)
+      AC_MSG_ERROR([Unsupported gauge group choice Nc = ${ac_Nc}]);;
+esac
+
 ############### FP16 conversions
 AC_ARG_ENABLE([sfw-fp16],
    [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
@@ -330,12 +348,18 @@ case ${CXXTEST} in
    fi
    ;;
  hipcc)
-    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
+#    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
+    CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
    CXXLD=${CXX}
    if test $ac_openmp = yes; then
       CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
    fi
    ;;
+  dpcpp)
+    LDFLAGS="$LDFLAGS"
+    CXXFLAGS="$CXXFLAGS"
+    CXXLD=${CXX}
+    ;;
  *)
    CXXLD=${CXX}
    CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
@@ -453,23 +477,24 @@ esac
 AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
 AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"

-############### Precision selection
-AC_ARG_ENABLE([precision],
-              [AC_HELP_STRING([--enable-precision=single|double],
-                              [Select default word size of Real])],
-              [ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
+############### Precision selection - deprecate
+#AC_ARG_ENABLE([precision],
+#              [AC_HELP_STRING([--enable-precision=single|double],
+#                              [Select default word size of Real])],
+#              [ac_PRECISION=${enable_precision}],[ac_PRECISION=double])

-case ${ac_PRECISION} in
-     single)
-       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
-     ;;
-     double)
-       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
-     ;;
-     *)
-     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
-     ;;
-esac
+AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
+
+#case ${ac_PRECISION} in
+#     single)
+#       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
+#     ;;
+#     double)
+#     ;;
+#     *)
+#     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
+#     ;;
+#esac

 ######################  Shared memory allocation technique under MPI3
 AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone],
@@ -650,6 +675,7 @@ os (target)                 : $target_os
 compiler vendor             : ${ax_cv_cxx_compiler_vendor}
 compiler version            : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
+Nc                          : ${ac_Nc}
 SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                   : ${ac_openmp}
 Acceleration                : ${ac_ACCELERATOR}
@@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu

 This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:

-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Debug

 #### 2. `Release`

-Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
+Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`:

-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Release

 #### 3. `MPIDebug`

 Debug configuration with MPI:

-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug

 ### 5.3 Build Grid

@@ -178,15 +178,10 @@ Then enter the cloned directory and set up the build system::
 Now you can execute the `configure` script to generate makefiles (here from a build directory)::

  mkdir build; cd build
-  ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \
+  ../configure --enable-simd=AVX --enable-comms=mpi-auto \
      --prefix=<path>

-where::
-
-  --enable-precision=single|double
-
-sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations,
-and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.::
+::

   --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX

@@ -236,7 +231,7 @@ Detailed build configuration options
  --enable-mkl[=path]                     use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
  --enable-simd=code                      setup Grid for the SIMD target `<code>`(default: `GEN`). A list of possible SIMD targets is detailed in a section below.
  --enable-gen-simd-width=size            select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes.
-  --enable-precision=single|double        set the default precision (default: `double`).
+  --enable-precision=single|double        set the default precision (default: `double`). **Deprecated option**
  --enable-comms=mpi|none                 use `<comm>` for message passing (default: `none`).
  --enable-rng=sitmo|ranlux48|mt19937     choose the RNG (default: `sitmo`).
  --disable-timers                        disable system dependent high-resolution timers.
@@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform

 The following configuration is recommended for the Intel Knights Landing platform::

-  ../configure --enable-precision=double\
-             --enable-simd=KNL        \
+  ../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto  \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.

 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::

-  ../configure --enable-precision=double\
-             --enable-simd=KNL        \
+  ../configure --enable-simd=KNL        \
             --enable-comms=mpi       \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared
 for interior communication.
 We recommend four ranks per node for best performance, but optimum is local volume dependent. ::

-   ../configure --enable-precision=double\
-             --enable-simd=KNL        \
+   ../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
@@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform

 The following configuration is recommended for the Intel Haswell platform::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+  ../configure --enable-simd=AVX2       \
             --enable-comms=mpi-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -360,8 +351,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.

 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+  ../configure --enable-simd=AVX2       \
             --enable-comms=mpi      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform

 The following configuration is recommended for the Intel Skylake platform::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+  ../configure --enable-simd=AVX512     \
             --enable-comms=mpi      \
             --enable-mkl             \
             CXX=mpiicpc
@@ -396,8 +385,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.

 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+  ../configure --enable-simd=AVX512     \
             --enable-comms=mpi      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -422,8 +410,7 @@ and 8 threads per rank.
 The following configuration is recommended for the AMD EPYC platform::


-  ../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+  ../configure --enable-simd=AVX2       \
             --enable-comms=mpi \
             CXX=mpicxx 

@@ -69,7 +69,7 @@ int main (int argc, char ** argv)

  std::vector<LatticeColourMatrix> U(4,&Fine);
  
-  SU3::HotConfiguration(pRNGa,Umu);
+  SU<Nc>::HotConfiguration(pRNGa,Umu);


  FieldMetaData header;
@@ -84,7 +84,7 @@ int main (int argc, char ** argv)

  std::vector<LatticeColourMatrix> U(4,&Fine);
  
-  SU3::HotConfiguration(pRNGa,Umu);
+  SU<Nc>::HotConfiguration(pRNGa,Umu);

  FieldMetaData header;
  std::string file("./ckpoint_lat.4000");
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          sRNG5(sFGrid);  sRNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);

  RealD mass=0.1;
  RealD M5  =1.8;
@@ -202,7 +202,7 @@ int main (int argc, char ** argv) {
  std::vector<int> seeds4({1,2,3,4});
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
  //  FieldMetaData header;
  //  NerscIO::readConfiguration(Umu,header,Params.config);

@@ -71,7 +71,7 @@ int main (int argc, char ** argv)
  LatticeGaugeFieldD Umu(UGrid);
  LatticeGaugeFieldF Umu_f(UGrid_f); 
  
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);

  precisionChange(Umu_f,Umu);
  
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
  LatticeGaugeFieldD Umu(UGrid);
  LatticeGaugeFieldF Umu_f(UGrid_f); 
  
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);

  precisionChange(Umu_f,Umu);
  
@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);

  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
@@ -131,7 +131,7 @@ int main (int argc, char ** argv)
  // LatticeFermion result(FGrid); result=Zero();
  // LatticeGaugeField Umu(UGrid); 

-  // SU3::HotConfiguration(RNG4,Umu);
+  // SU<Nc>::HotConfiguration(RNG4,Umu);

  // std::vector<LatticeColourMatrix> U(4,UGrid);
  // for(int mu=0;mu<Nd;mu++){
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  RealD mass=0.1;
@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
    LatticeFermion    ref   (FGrid); ref = Zero();
    LatticeFermion    tmp   (FGrid); tmp = Zero();
    LatticeFermion    err   (FGrid); err = Zero();
-    LatticeGaugeField Umu   (UGrid); SU3::HotConfiguration(RNG4, Umu);
+    LatticeGaugeField Umu   (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
    std::vector<LatticeColourMatrix> U(4,UGrid);

    // Only one non-zero (y)
@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
  LatticeFermion    ref(FGrid);    ref=Zero();
  LatticeFermion    tmp(FGrid);    tmp=Zero();
  LatticeFermion    err(FGrid);    tmp=Zero();
-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  // Only one non-zero (y)
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)

  LatticeGaugeFieldD Umu(&GRID);

-  SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
  //  Umu=Zero();
  ////////////////////////////////////////////////////
  // Wilson test
@@ -73,11 +73,11 @@ int main (int argc, char ** argv)
  LatticeColourMatrix   xform2(&GRID); // Gauge xform
  LatticeColourMatrix   xform3(&GRID); // Gauge xform
  
-  SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
  Uorg=Umu;
  Urnd=Umu;

-  SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge
+  SU<Nc>::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge

  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -121,7 +121,7 @@ int main (int argc, char ** argv)
  std::cout<< "* Testing non-unit configuration                                *" <<std::endl;
  std::cout<< "*****************************************************************" <<std::endl;

-  SU3::HotConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::HotConfiguration(pRNG,Umu); // Hot start: non-unit gauge configuration

  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -136,7 +136,7 @@ int main (int argc, char ** argv)
  std::cout<< "*****************************************************************" <<std::endl;

  Umu=Urnd;
-  SU3::HotConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::HotConfiguration(pRNG,Umu); // Hot start (overwrites Umu; presumably random, not unit gauge)

  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -114,7 +114,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG4_2f(UGrid_2f);  RNG4_2f.SeedFixedIntegers(seeds4);

  GparityGaugeField Umu_2f(UGrid_2f);
-  SU3::HotConfiguration(RNG4_2f,Umu_2f);
+  SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f);

  StandardFermionField    src   (FGrid_2f); 
  StandardFermionField    tmpsrc(FGrid_2f); 
@@ -61,7 +61,7 @@ int main (int argc, char ** argv)
  FermionField    ref(&Grid);    ref=Zero();
  FermionField    tmp(&Grid);    tmp=Zero();
  FermionField    err(&Grid);    tmp=Zero();
-  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

  double volume=1;
@@ -66,14 +66,14 @@ int main(int argc, char** argv) {

  std::cout << GridLogMessage << "*********************************************"
            << std::endl;
-  std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl;
+  std::cout << GridLogMessage << "* Generators for SU(Nc)" << std::endl;
  std::cout << GridLogMessage << "*********************************************"
            << std::endl;
-  SU3::printGenerators();
-  std::cout << "Dimension of adjoint representation: "<< SU3Adjoint::Dimension << std::endl;
-  SU3Adjoint::printGenerators();
-  SU3::testGenerators();
-  SU3Adjoint::testGenerators();
+  SU<Nc>::printGenerators();
+  std::cout << "Dimension of adjoint representation: "<< SU_Adjoint<Nc>::Dimension << std::endl;
+  SU_Adjoint<Nc>::printGenerators();
+  SU<Nc>::testGenerators();
+  SU_Adjoint<Nc>::testGenerators();

  std::cout<<GridLogMessage<<"*********************************************"<<std::endl;
  std::cout<<GridLogMessage<<"* Generators for SU(4)"<<std::endl;
@@ -87,22 +87,22 @@ int main(int argc, char** argv) {
  // Projectors 
  GridParallelRNG gridRNG(grid);
  gridRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-  SU3Adjoint::LatticeAdjMatrix Gauss(grid);
-  SU3::LatticeAlgebraVector ha(grid);
-  SU3::LatticeAlgebraVector hb(grid);
+  SU_Adjoint<Nc>::LatticeAdjMatrix Gauss(grid);
+  SU<Nc>::LatticeAlgebraVector ha(grid);
+  SU<Nc>::LatticeAlgebraVector hb(grid);
  random(gridRNG,Gauss);

  std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl;
-  SU3Adjoint::projectOnAlgebra(ha, Gauss);
+  SU_Adjoint<Nc>::projectOnAlgebra(ha, Gauss);
  std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl;
  std::cout << GridLogMessage << "Start projector" << std::endl;
-  SU3Adjoint::projector(hb, Gauss);
+  SU_Adjoint<Nc>::projector(hb, Gauss);
  std::cout << GridLogMessage << "end projector" << std::endl;

  std::cout << GridLogMessage << "ReStart projector" << std::endl;
-  SU3Adjoint::projector(hb, Gauss);
+  SU_Adjoint<Nc>::projector(hb, Gauss);
  std::cout << GridLogMessage << "end projector" << std::endl;
-  SU3::LatticeAlgebraVector diff = ha -hb;
+  SU<Nc>::LatticeAlgebraVector diff = ha -hb;
  std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl;


@@ -260,20 +260,20 @@ int main(int argc, char** argv) {
  std::cout << GridLogMessage << "Test for the Two Index Symmetric projectors"
      << std::endl;
  // Projectors 
-  SU3TwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid);
+  SU_TwoIndex<Nc, Symmetric>::LatticeTwoIndexMatrix Gauss2(grid);
  random(gridRNG,Gauss2);
  
  std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl;
-  SU3TwoIndexSymm::projectOnAlgebra(ha, Gauss2);
+  SU_TwoIndex<Nc, Symmetric>::projectOnAlgebra(ha, Gauss2);
  std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl;
  std::cout << GridLogMessage << "Start projector" << std::endl;
-  SU3TwoIndexSymm::projector(hb, Gauss2);
+  SU_TwoIndex<Nc, Symmetric>::projector(hb, Gauss2);
  std::cout << GridLogMessage << "end projector" << std::endl;
  
  std::cout << GridLogMessage << "ReStart projector" << std::endl;
-  SU3TwoIndexSymm::projector(hb, Gauss2);
+  SU_TwoIndex<Nc, Symmetric>::projector(hb, Gauss2);
  std::cout << GridLogMessage << "end projector" << std::endl;
-  SU3::LatticeAlgebraVector diff2 = ha - hb;
-  std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl;
+  SU<Nc>::LatticeAlgebraVector diff2 = ha - hb;
+  std::cout << GridLogMessage << "Difference: " << norm2(diff2) << std::endl;
  std::cout << GridLogMessage << "*********************************************"
      << std::endl;
@@ -284,20 +284,20 @@ int main(int argc, char** argv) {
  std::cout << GridLogMessage << "Test for the Two index anti-Symmetric projectors"
      << std::endl;
  // Projectors
-  SU3TwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid);
+  SU_TwoIndex<Nc, AntiSymmetric>::LatticeTwoIndexMatrix Gauss2a(grid);
  random(gridRNG,Gauss2a);
  
  std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl;
-  SU3TwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a);
+  SU_TwoIndex<Nc, AntiSymmetric>::projectOnAlgebra(ha, Gauss2a);
  std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl;
  std::cout << GridLogMessage << "Start projector" << std::endl;
-  SU3TwoIndexAntiSymm::projector(hb, Gauss2a);
+  SU_TwoIndex<Nc, AntiSymmetric>::projector(hb, Gauss2a);
  std::cout << GridLogMessage << "end projector" << std::endl;
  
  std::cout << GridLogMessage << "ReStart projector" << std::endl;
-  SU3TwoIndexAntiSymm::projector(hb, Gauss2a);
+  SU_TwoIndex<Nc, AntiSymmetric>::projector(hb, Gauss2a);
  std::cout << GridLogMessage << "end projector" << std::endl;
-  SU3::LatticeAlgebraVector diff2a = ha - hb;
+  SU<Nc>::LatticeAlgebraVector diff2a = ha - hb;
  std::cout << GridLogMessage << "Difference: " << norm2(diff2a) << std::endl;
  std::cout << GridLogMessage << "*********************************************"
      << std::endl;
@@ -444,7 +444,7 @@ int main(int argc, char **argv) {
      // Lattice 12x12 GEMM
      scFooBar = scFoo * scBar;

-      // Benchmark some simple operations LatticeSU3 * Lattice SU3.
+      // Benchmark some simple operations LatticeSU<Nc> * Lattice SU<Nc>.
      double t0, t1, flops;
      double bytes;
      int ncall = 5000;
@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
    LatticeFermion    ref   (FGrid); ref = Zero();
    LatticeFermion    tmp   (FGrid); tmp = Zero();
    LatticeFermion    err   (FGrid); err = Zero();
-    LatticeGaugeField Umu   (UGrid); SU3::HotConfiguration(RNG4, Umu);
+    LatticeGaugeField Umu   (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
    std::vector<LatticeColourMatrix> U(4,UGrid);

    // Only one non-zero (y)
@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
  GridParallelRNG  pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
  GridSerialRNG    sRNG;       sRNG.SeedFixedIntegers(sseeds);

-  // SU3 colour operatoions
+  // SU<Nc> colour operations
  LatticeColourMatrix link(grid);
  LatticeColourMatrix staple(grid);

@@ -87,10 +87,10 @@ int main (int argc, char ** argv)

 	link = PeekIndex<LorentzIndex>(Umu,mu);

-	for( int subgroup=0;subgroup<SU3::su2subgroups();subgroup++ ) {
+	for( int subgroup=0;subgroup<SU<Nc>::su2subgroups();subgroup++ ) {

 	  // update Even checkerboard
-	  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask);
+	  SU<Nc>::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask);

 	}

@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
  FermionField    err(&Grid);    tmp=Zero();
  FermionField phi   (&Grid); random(pRNG,phi);
  FermionField chi   (&Grid); random(pRNG,chi);
-  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);


@@ -75,7 +75,7 @@ int main (int argc, char ** argv)
  FermionField phi   (FGrid); random(pRNG5,phi);
  FermionField chi   (FGrid); random(pRNG5,chi);

-  LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::ColdConfiguration(pRNG4,Umu);
  LatticeGaugeField Umua(UGrid); Umua=Umu;

  double volume=Ls;
@@ -84,7 +84,7 @@ int main (int argc, char ** argv)
  FermionField chi   (FGrid); random(pRNG5,chi);

  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(pRNG4,Umu);
+  SU<Nc>::HotConfiguration(pRNG4,Umu);

  /*
  for(int mu=1;mu<4;mu++){
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
  FermionField chi   (FGrid); random(pRNG5,chi);

  LatticeGaugeFieldF Umu(UGrid);
-  SU3::HotConfiguration(pRNG4,Umu);
+  SU<Nc>::HotConfiguration(pRNG4,Umu);

  /*
  for(int mu=1;mu<4;mu++){
@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
  FermionField    err(&Grid);    tmp=Zero();
  FermionField phi   (&Grid); random(pRNG,phi);
  FermionField chi   (&Grid); random(pRNG,chi);
-  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);


@@ -74,7 +74,7 @@ int main(int argc, char **argv)
  FermionField chi(&Grid);
  random(pRNG, chi);
  LatticeGaugeField Umu(&Grid);
-  SU3::HotConfiguration(pRNG, Umu);
+  SU<Nc>::HotConfiguration(pRNG, Umu);
  std::vector<LatticeColourMatrix> U(4, &Grid);

  double volume = 1;
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(&Grid);    tmp=Zero();
  LatticeFermion    err(&Grid);    tmp=Zero();
  LatticeGaugeField Umu(&Grid); 
-  SU3::HotConfiguration(pRNG,Umu);
+  SU<Nc>::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

  double volume=1;
@@ -71,7 +71,7 @@ int main (int argc, char ** argv)
  LatticeFermion    ref(&Grid);    ref=Zero();
  LatticeFermion    tmp(&Grid);    tmp=Zero();
  LatticeFermion    err(&Grid);    tmp=Zero();
-  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

  double volume=1;
@@ -116,7 +116,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField Umu(UGrid);
  LatticeGaugeFieldF UmuF(UGridF);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
  precisionChange(UmuF,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

@@ -77,7 +77,7 @@ int main (int argc, char ** argv)
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);

 #if 0
  std::vector<LatticeColourMatrix> U(4,UGrid);
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  RealD mass=0.1;
@@ -71,9 +71,9 @@ int main (int argc, char ** argv)
  std::string file("./ckpoint_lat.400");
  NerscIO::readConfiguration(Umu,header,file);

-  //  SU3::ColdConfiguration(RNG4,Umu);
-  //  SU3::TepidConfiguration(RNG4,Umu);
-  //  SU3::HotConfiguration(RNG4,Umu);
+  //  SU<Nc>::ColdConfiguration(RNG4,Umu);
+  //  SU<Nc>::TepidConfiguration(RNG4,Umu);
+  //  SU<Nc>::HotConfiguration(RNG4,Umu);
  //  Umu=Zero();

  RealD mass=0.1;
@@ -108,8 +108,8 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

  LatticeGaugeField Umu(UGrid);
-  SU3::ColdConfiguration(Umu);
-  //  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::ColdConfiguration(Umu);
+  //  SU<Nc>::HotConfiguration(RNG4,Umu);

  RealD mass=0.3;
  RealD M5  =1.0;
@@ -73,7 +73,7 @@ int main(int argc, char** argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf,  mf, mpv,  0.0, -1, M5);
  DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0,  1, M5);
@@ -77,7 +77,7 @@ int main(int argc, char** argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  // GparityDomainWallFermionR::ImplParams params;
  FermionAction::ImplParams params;
@@ -75,7 +75,7 @@ int main(int argc, char** argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf,  mf, mpv,  0.0, -1, M5, b, c);
  MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0,  1, M5, b, c);
@@ -79,7 +79,7 @@ int main(int argc, char** argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  FermionAction::ImplParams params;
  FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf,  mf, mpv,  0.0, -1, M5, b, c, params);
@@ -102,7 +102,7 @@ int main(int argc, char **argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  // Initialize RHMC fermion operators
  DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5);
@@ -104,7 +104,7 @@ int main(int argc, char **argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  // Initialize RHMC fermion operators
  GparityDomainWallFermionR::ImplParams params;
@@ -104,7 +104,7 @@ int main(int argc, char **argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  // Initialize RHMC fermion operators
  MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c);
@@ -106,7 +106,7 @@ int main(int argc, char **argv)

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  // Initialize RHMC fermion operators
  GparityDomainWallFermionR::ImplParams params;
@@ -59,7 +59,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -60,7 +60,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -94,7 +94,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -72,7 +72,7 @@ int main (int argc, char** argv)
  LatticeFermion MphiPrime  (FGrid);

  LatticeGaugeField U(UGrid);
-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);

  ////////////////////////////////////
  // Unmodified matrix element
@@ -105,7 +105,7 @@ int main (int argc, char** argv)

  for(int mu=0; mu<Nd; mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom, mommu, mu);

@@ -63,8 +63,8 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
-  //  SU3::ColdConfiguration(pRNG,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
+  //  SU<Nc>::ColdConfiguration(pRNG,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -112,7 +112,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    Hmom -= real(sum(trace(mommu*mommu)));

@@ -75,7 +75,7 @@ int main (int argc, char** argv)
  FermionField MphiPrime  (FGrid);

  LatticeGaugeField U(UGrid);
-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);

  ////////////////////////////////////
  // Unmodified matrix element
@@ -109,7 +109,7 @@ int main (int argc, char** argv)

  for(int mu=0; mu<Nd; mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom, mommu, mu);

@@ -51,7 +51,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(&Grid);

-  SU3::HotConfiguration(pRNG,U);
+  SU<Nc>::HotConfiguration(pRNG,U);
  
  double beta = 1.0;
  ConjugateWilsonGaugeActionR Action(beta);
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -54,7 +54,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(&Grid);

-  SU3::HotConfiguration(pRNG,U);
+  SU<Nc>::HotConfiguration(pRNG,U);
  
  double beta = 1.0;
  double c1   = 0.331;
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -63,7 +63,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -100,7 +100,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -57,7 +57,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -94,7 +94,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<Nd;mu++){

    // Traceless antihermitian momentum; gaussian in lie alg
-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); 
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); 

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -58,7 +58,7 @@ int main (int argc, char ** argv)
    PokeIndex<LorentzIndex>(P, P_mu, mu);
  }

-  SU3::HotConfiguration(pRNG,U);
+  SU<Nc>::HotConfiguration(pRNG,U);
  

  ConjugateGradient<LatticeGaugeField> CG(1.0e-8, 10000);
@@ -95,7 +95,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Update the U " << std::endl;
  for(int mu=0;mu<Nd;mu++){
  // Traceless antihermitian momentum; gaussian in lie algebra
-    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); 
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); 
    auto Umu = PeekIndex<LorentzIndex>(U, mu);
    PokeIndex<LorentzIndex>(mom,mommu,mu);
    Umu = expMat(mommu, dt, 12) * Umu;
@@ -60,7 +60,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -96,7 +96,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -72,7 +72,7 @@ int main (int argc, char** argv)
  LatticeFermion MphiPrime  (FGrid);

  LatticeGaugeField U(UGrid);
-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);

  ////////////////////////////////////
  // Unmodified matrix element
@@ -107,7 +107,7 @@ int main (int argc, char** argv)

  for(int mu=0; mu<Nd; mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom, mommu, mu);

@@ -76,7 +76,7 @@ int main (int argc, char** argv)
  FermionField MphiPrime  (FGrid);

  LatticeGaugeField U(UGrid);
-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);

  ////////////////////////////////////
  // Unmodified matrix element
@@ -112,7 +112,7 @@ int main (int argc, char** argv)

  for(int mu=0; mu<Nd; mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom, mommu, mu);
    autoView( U_v , U, CpuRead);
@@ -62,7 +62,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -96,7 +96,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -54,7 +54,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(&Grid);

-  SU3::HotConfiguration(pRNG,U);
+  SU<Nc>::HotConfiguration(pRNG,U);
  
  double beta = 1.0;
  double c1   = -0.331;
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -61,7 +61,7 @@ int main (int argc, char ** argv)
  LatticeGaugeField U(&Grid);

  //SU2::HotConfiguration(pRNG,U);
-  SU3::ColdConfiguration(pRNG,U);
+  SU<Nc>::ColdConfiguration(pRNG,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -98,7 +98,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<Nd;mu++){

    // Traceless antihermitian momentum; gaussian in lie alg
-    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); 
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); 

    Hmom -= real(sum(trace(mommu*mommu)));

@@ -62,8 +62,8 @@ int main(int argc, char **argv)

  LatticeGaugeField U(&Grid);

-  SU3::HotConfiguration(pRNG, U);
-  //SU3::ColdConfiguration(pRNG, U);// Clover term Zero()
+  SU<Nc>::HotConfiguration(pRNG, U);
+  //SU<Nc>::ColdConfiguration(pRNG, U);// Clover term Zero()

  ////////////////////////////////////
  // Unmodified matrix element
@@ -101,7 +101,7 @@ int main(int argc, char **argv)
  for (int mu = 0; mu < Nd; mu++)
  {
    // Traceless antihermitian momentum; gaussian in lie alg
-    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
    Hmom -= real(sum(trace(mommu * mommu)));
    PokeIndex<LorentzIndex>(mom, mommu, mu);

@@ -59,7 +59,7 @@ int main (int argc, char ** argv)

  LatticeGaugeField U(UGrid);

-  SU3::HotConfiguration(RNG4,U);
+  SU<Nc>::HotConfiguration(RNG4,U);
  
  ////////////////////////////////////
  // Unmodified matrix element
@@ -109,7 +109,7 @@ int main (int argc, char ** argv)

  for(int mu=0;mu<Nd;mu++){

-    SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

    PokeIndex<LorentzIndex>(mom,mommu,mu);

@@ -293,7 +293,7 @@ int main (int argc, char ** argv) {
  {
    std::vector<int> seeds4({1,2,3,4});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-    SU3::HotConfiguration(RNG4, Umu);
+    SU<Nc>::HotConfiguration(RNG4, Umu);
  }
  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << "   Ls: " << Ls << std::endl;

@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid); 
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
@@ -61,7 +61,7 @@ int main(int argc, char** argv) {
  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4, Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

 /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -280,7 +280,7 @@ void make_gauge(GaugeField &Umu, Grid::LatticePropagator &q1,Grid::LatticePropag
  Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu.Grid();
  Grid::GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
-  Grid::SU3::HotConfiguration(RNG4, Umu);
+  Grid::SU<Nc>::HotConfiguration(RNG4, Umu);

  // Propagator
  Grid::gaussian(RNG4, q1);
@@ -277,7 +277,7 @@ double calc_grid_p(Grid::LatticeGaugeField & Umu)
  Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
  Grid::GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

-  Grid::SU3::HotConfiguration(RNG4,Umu);
+  Grid::SU<Nc>::HotConfiguration(RNG4,Umu);

  Grid::LatticeColourMatrix tmp(UGrid); 
  tmp = Grid::zero;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	12e239dd9f	Merge branch 'release/dirac-ITT-2020'	2020-10-13 13:38:29 -04:00
Peter Boyle and GitHub	af2301afbb	Merge pull request #312 from i-kanamori/debug_512 add reordring of random number generators in IO	2020-10-13 11:42:12 -04:00
Peter Boyle and GitHub	f98856a26f	Merge pull request #314 from smangham/issue_readme_precision Fix for deprecated configure options in documentation (issue #313)	2020-10-13 11:41:38 -04:00
Sam Mangham	d55cc5b380	Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision.	2020-10-12 12:33:13 +01:00
portelli	c2b688abc9	Benchmark_IO: reducing max local volume to 32^4	2020-10-10 16:52:56 +01:00
portelli	b0d61b9687	Benchmark_IO cleaner output	2020-10-09 21:46:45 +01:00
portelli	5f893bf9af	Benchmark_IO procurement sizes	2020-10-09 21:31:59 +01:00
portelli	0e17bd6597	I/O benchmark cleanup	2020-10-09 20:29:57 +01:00
portelli	22caa158cc	multi-pass I/O benchmark, with statistic and robustness summary	2020-10-09 20:29:40 +01:00
portelli	b24a504d7c	hook to access last parallel I/O performance measurement	2020-10-09 20:28:54 +01:00
Peter Boyle	992ef6e9fc	more runtime	2020-10-08 22:19:20 -04:00
Peter Boyle	f32a320bc3	Single prec benchmark in double prec compile	2020-10-08 19:52:08 -04:00
Peter Boyle	5f0fe029d2	Improve meemory benchmarks for GPU (avoid host mem ping pong)	2020-10-08 19:51:28 -04:00
portelli	6b1486e89b	fixing number of colours defaulting to 4 in most cases	2020-10-08 16:31:24 +01:00
Peter Boyle	3f9c427a3a	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-10-07 13:12:57 -04:00
Peter Boyle	d201277652	Expose Nc as a compile time configure option. Remove precision option	2020-10-07 13:07:00 -04:00
portelli	fdda7cf9cf	Merge branch 'feature/benchmark-io-update' into develop	2020-10-07 15:57:53 +01:00
portelli	e22d30f715	Merge branch 'develop' into feature/benchmark-io-update	2020-10-07 15:56:39 +01:00
portelli	1ba25a0d8c	more I/O benchmark code cleaning	2020-10-07 15:38:41 +01:00
portelli	9ba3647bdf	script to convert I/O benchmark logs to CSV	2020-10-07 15:35:03 +01:00
portelli	5ee832f738	I/O benchmark code cleaning	2020-10-07 15:31:51 +01:00
Peter Boyle	35a69a5133	SU4 x SU4	2020-10-06 21:48:35 -04:00
portelli	e9c5a271a8	fixing potential issues with log alignment and timer I/O	2020-10-06 17:58:16 +01:00
portelli	acac2d6938	standard C/C++ I/O in benchmark	2020-10-06 17:57:00 +01:00
kanamori	97db2b8d20	add reordring of random number generator in IO	2020-10-06 17:25:59 +09:00
Peter Boyle	ace9cd64bb	dpcpp happy	2020-09-29 08:03:46 -07:00
Peter Boyle	a3e2aeb603	dpcpp options happiness	2020-09-29 06:50:10 -07:00
Peter Boyle	049dd25785	Revert accidental commit thanks michael	2020-09-23 04:13:50 -04:00
Peter Boyle and GitHub	d43d372294	Merge pull request #311 from mmphys/bugfix/MPIasynch Asynchronous calls removed - reflect this in Communicator_none.cc	2020-09-22 10:41:48 -04:00
Michael Marshall	b71a081cba	Asynchronous calls removed - reflect this in Communicator_none.cc (Opportunistic doc update - OpenMP support on Mac OS)	2020-09-21 09:33:23 +01:00
Peter Boyle	c48909590b	MPI asynch call removal	2020-09-17 20:47:32 +01:00
Peter Boyle	446ef40570	HIP IPC	2020-09-17 20:31:46 +01:00
Peter Boyle	81441e98f4	HIP runs sensible	2020-09-16 03:35:03 +01:00
Peter Boyle	ecd3f890f5	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-09-16 02:30:14 +01:00
Peter Boyle	1c881ce23c	HIP does not like half2 visible members x and y so must define own Half2	2020-09-16 02:28:33 +01:00
Peter Boyle	dacbbdd051	Hip Happy Birthday	2020-09-16 00:37:02 +01:00
Peter Boyle	2859955a03	HIP requires "inline"	2020-09-16 00:36:13 +01:00
Peter Boyle	cc220abd1d	inline for HIP	2020-09-16 00:35:38 +01:00
Peter Boyle	d1c0c0197e	HipCC requires inline on definition	2020-09-16 00:35:06 +01:00
Peter Boyle	fd9424ef27	innlines required to make HIP happy	2020-09-16 00:34:32 +01:00
Peter Boyle	a5c35c4024	Make HIP / Vega happy	2020-09-16 00:33:53 +01:00
Peter Boyle	e03b64dc06	HIP default flaags to work on ROCM	2020-09-16 00:33:09 +01:00
Peter Boyle	4677c40195	HIP improvements	2020-09-16 00:32:27 +01:00
Peter Boyle	288c615782	Hip improvements	2020-09-16 00:31:50 +01:00
Peter Boyle	48e81cf6f8	Hip Pragmas	2020-09-16 00:31:03 +01:00