diff --git a/.travis.yml b/.travis.yml index 129fd582..3a0e1e35 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,11 +9,6 @@ matrix: - os: osx osx_image: xcode8.3 compiler: clang - env: PREC=single - - os: osx - osx_image: xcode8.3 - compiler: clang - env: PREC=double before_install: - export GRIDDIR=`pwd` @@ -55,7 +50,7 @@ script: - make -j4 - make install - cd $CWD/build - - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} + - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - make check diff --git a/Grid/GridStd.h b/Grid/GridStd.h index ecb561ea..28f6bc46 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -28,4 +28,7 @@ /////////////////// #include "Config.h" +#ifdef TOFU +#undef GRID_COMMS_THREADS +#endif #endif /* GRID_STD_H */ diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 249732fb..4b357523 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -165,9 +165,17 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -//template using commAllocator = devAllocator; +#ifdef ACCELERATOR_CSHIFT +// Cshift on device +template using cshiftAllocator = devAllocator; +#else +// Cshift on host +template using cshiftAllocator = std::allocator; +#endif + template using Vector = std::vector >; template using commVector = std::vector >; +template using cshiftVector = std::vector >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 537f7c32..3f165007 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -1,7 +1,6 @@ #include #ifdef GRID_UVM -#warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // View management is 1:1 address space mapping diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 83f71233..c6543851 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { -#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori +#ifndef GRID_COMMS_THREADS nCommThreads=1; // wrong results here too // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs @@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector ranks(size); for(int r=0;r > Cshift_table; // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } { - autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & 
buffer[0]; auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT + autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for(i,ent,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for2d(n,e1,b,e2,1,{ int o = n*n1; @@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + int o = n*n1; + int offset = b+n*e2; + + vobj temp =rhs_v[so+o+b]; + extract(temp,pointers,offset); + }); +#endif } else { - autoView(rhs_v , rhs, AcceleratorRead); - Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + + Coordinate coor; + + int o=n*n1; + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); + } + }); +#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -182,12 +220,19 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - autoView( rhs_v , rhs, AcceleratorWrite); int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; +#ifdef ACCELERATOR_CSHIFT + autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for2d(n,e1,b,e2,1,{ int o = n*_slice_stride; int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); +#else + autoView( rhs_v , rhs, CpuWrite); + thread_for2d(n,e1,b,e2,{ + int o = n*_slice_stride; + int offset = b+n*_slice_block; + merge(rhs_v[so+o+b],pointers,offset); + }); +#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -280,12 +334,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs } { + auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); - auto table = &Cshift_table[0]; accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + autoView(lhs_v , lhs, CpuWrite); + thread_for(i,ent,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -324,12 +386,20 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice void Cshift_comms_simd(Lattice& ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -121,9 +122,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r 
assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - commVector send_buf(buffer_size); - commVector recv_buf(buffer_size); - + cshiftVector send_buf(buffer_size); + cshiftVector recv_buf(buffer_size); + int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -138,7 +139,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r } else { - int words = send_buf.size(); + int words = buffer_size; if (cbmask != 0x3) words=words>>1; int bytes = words * sizeof(vobj); @@ -150,12 +151,14 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + grid->Barrier(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); @@ -195,8 +198,15 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd,commVector(buffer_size) ); - std::vector > recv_buf_extract(Nsimd,commVector(buffer_size) ); + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + + for(int s=0;s void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + grid->Barrier(); + + send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; + recv_buf_extract_mpi = &recv_buf_extract[i][0]; + grid->SendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, - (void *)&recv_buf_extract[i][0], + (void *)recv_buf_extract_mpi, recv_from_rank, bytes); + + grid->Barrier(); + + rpointers[i] = &recv_buf_extract[i][0]; + } else { + rpointers[i] = &send_buf_extract[nbr_lane][0]; + } + + } + Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); + } + +} +#else +template void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=rhs.Grid(); + Lattice temp(rhs.Grid()); + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; + cshiftVector send_buf_v(buffer_size); + cshiftVector recv_buf_v(buffer_size); + vobj *send_buf; + vobj *recv_buf; + { + grid->ShmBufferFreeAll(); + size_t bytes = buffer_size*sizeof(vobj); + send_buf=(vobj *)grid->ShmBufferMalloc(bytes); + recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); + } + + int cb= (cbmask==0x2)? 
Odd : Even; + int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + + // int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); + + grid->Barrier(); + + Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + } + } +} + +template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + GridBase *grid=rhs.Grid(); + const int Nsimd = grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; + typedef typename vobj::scalar_type scalar_type; + + int fd = grid->_fdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; + int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; + int simd_layout = grid->_simd_layout[dimension]; + int comm_dim = grid->_processors[dimension] >1 ; + + //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // int words = sizeof(vobj)/sizeof(vector_type); + + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + { + size_t bytes = sizeof(scalar_object)*buffer_size; + grid->ShmBufferFreeAll(); + send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + } + for(int s=0;s pointers(Nsimd); // + ExtractPointerArray rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + int cb = (cbmask==0x2)? Odd : Even; + int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + + assert (sx == nbr_ox); + + if(nbr_proc){ + grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); + grid->SendToRecvFrom((void *)send_buf_extract_mpi, + xmit_to_rank, + (void *)recv_buf_extract_mpi, + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); + grid->Barrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -258,7 +461,7 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice #include #include #include -//#include +#include #include #include #include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index c43844f8..4a8a7423 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression &expr) GridUnopClass(UnarySub, -a); GridUnopClass(UnaryNot, Not(a)); -GridUnopClass(UnaryAdj, adj(a)); -GridUnopClass(UnaryConj, conjugate(a)); GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); -GridUnopClass(UnaryToReal, toReal(a)); -GridUnopClass(UnaryToComplex, toComplex(a)); GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnarySqrt, sqrt(a)); -GridUnopClass(UnaryRsqrt, rsqrt(a)); GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryAsin, asin(a)); @@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere, GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot); -GRID_DEF_UNOP(adj, UnaryAdj); -GRID_DEF_UNOP(conjugate, UnaryConj); +//GRID_DEF_UNOP(adj, UnaryAdj); +//GRID_DEF_UNOP(conjugate, UnaryConj); GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); -GRID_DEF_UNOP(toReal, UnaryToReal); -GRID_DEF_UNOP(toComplex, UnaryToComplex); GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the // abs-fabs-dabs-labs thing GRID_DEF_UNOP(sqrt, UnarySqrt); -GRID_DEF_UNOP(rsqrt, UnaryRsqrt); GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(asin, UnaryAsin); @@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere); ///////////////////////////////////////////////////////////// template auto closure(const LatticeUnaryExpression &expr) - -> Lattice + -> Lattice::type > { - Lattice ret(expr); + Lattice::type > ret(expr); return ret; } template auto closure(const LatticeBinaryExpression &expr) - -> Lattice + -> Lattice::type > { - Lattice ret(expr); + Lattice::type > ret(expr); return ret; } template auto closure(const LatticeTrinaryExpression &expr) - -> Lattice Lattice + vecEval(0, expr.arg3)))>::type > { - Lattice ret(expr); + vecEval(0, expr.arg3)))>::type > ret(expr); return ret; } #define EXPRESSION_CLOSURE(function) \ 
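
The `closure()` changes in Lattice_ET.h above exist because `vecEval()` can return a reference into a lattice view, while `Lattice<T>` must be instantiated with a value type; wrapping the deduced type in `std::remove_reference` strips the reference before it reaches the template argument. A minimal standalone sketch of this pattern follows — `Expr`, `Field`, and `closure` here are toy stand-ins for illustration, not Grid's actual expression-template classes:

``` cpp
#include <cstddef>
#include <type_traits>
#include <vector>

// Toy expression whose evaluator returns a reference,
// as a view-based evaluator can.
template <class T>
struct Expr {
  std::vector<T> data;
  T &eval(std::size_t i) { return data[i]; }
};

// Toy field that must be instantiated with a value type.
template <class T>
struct Field {
  std::vector<T> data;
};

// decltype(expr.eval(0)) is T&; remove_reference yields T.
// Without it the return type would be Field<T&>, i.e. a
// std::vector of references, which is ill-formed.
template <class E>
auto closure(E expr)
    -> Field<typename std::remove_reference<decltype(expr.eval(0))>::type> {
  using value_t =
      typename std::remove_reference<decltype(expr.eval(0))>::type;
  Field<value_t> ret;
  for (std::size_t i = 0; i < expr.data.size(); ++i)
    ret.data.push_back(expr.eval(i));
  return ret;
}

int main() {
  Expr<double> e{{1.0, 2.0, 3.0}};
  auto f = closure(e);  // Field<double>, not Field<double&>
  return f.data.size() == 3 ? 0 : 1;
}
```

The same reasoning applies to all three `closure` overloads in the hunk above (unary, binary, trinary): each wraps the `decltype(vecEval(...))` deduction in `std::remove_reference<...>::type` so the resulting `Lattice` stores values.
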
diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index af9d7280..95f55d10 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) +#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region @@ -161,11 +161,12 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ auto B=coalescedRead(zz); for(int k=k0; k inline Lattice adj(const Lattice &lhs){ autoView( ret_v, ret, AcceleratorWrite); ret.Checkerboard()=lhs.Checkerboard(); - accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { - coalescedWrite(ret_v[ss], adj(lhs_v(ss))); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = adj(lhs_v[ss]); }); return ret; }; @@ -64,6 +64,53 @@ template inline Lattice conjugate(const Lattice &lhs){ return ret; }; +template inline Lattice toComplex(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard() = lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = toComplex(lhs_v[ss]); + }); + return ret; +}; +template inline Lattice toReal(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard() = lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = toReal(lhs_v[ss]); + }); + return ret; +}; + + +template::value,void>::type * = nullptr> +auto toComplex(const Expression &expr) -> decltype(closure(expr)) +{ + return toComplex(closure(expr)); +} +template::value,void>::type * = nullptr> +auto toReal(const Expression &expr) -> decltype(closure(expr)) +{ + return toReal(closure(expr)); +} +template::value,void>::type * = nullptr> +auto adj(const Expression &expr) -> decltype(closure(expr)) +{ + return adj(closure(expr)); +} +template::value,void>::type * = nullptr> +auto conjugate(const Expression &expr) -> decltype(closure(expr)) +{ + return conjugate(closure(expr)); +} + NAMESPACE_END(Grid); #endif diff --git a/Grid/log/Log.h b/Grid/log/Log.h index d459a4a9..68693647 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -130,6 +130,8 @@ public: friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { + std::ios_base::fmtflags f(stream.flags()); + stream << log.background()<< std::left; if (log.topWidth > 0) { @@ -152,6 +154,8 @@ public: << now << log.background() << " : " ; } stream << log.colour(); + stream.flags(f); + return stream; } else { return devnull; diff --git a/Grid/parallelIO/BinaryIO.cc b/Grid/parallelIO/BinaryIO.cc index 221a7fe8..ef1b6683 100644 --- a/Grid/parallelIO/BinaryIO.cc +++ b/Grid/parallelIO/BinaryIO.cc @@ -1,3 +1,4 @@ #include -int Grid::BinaryIO::latticeWriteMaxRetry = -1; +int Grid::BinaryIO::latticeWriteMaxRetry = -1; +Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf; diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index 1f11add9..e9893aa1 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key) 
/////////////////////////////////////////////////////////////////////////////////////////////////// class BinaryIO { public: + struct IoPerf + { + uint64_t size{0},time{0}; + double mbytesPerSecond{0.}; + }; + + static IoPerf lastPerf; static int latticeWriteMaxRetry; ///////////////////////////////////////////////////////////////////////////// @@ -502,12 +509,15 @@ class BinaryIO { timer.Stop(); } + lastPerf.size = sizeof(fobj)*iodata.size()*nrank; + lastPerf.time = timer.useconds(); + lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6); std::cout< tmp(RngStateCount); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); - parallel_rng.SetState(tmp,lidx); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(lidx, lcoor); + int o_idx=grid->oIndex(lcoor); + int i_idx=grid->iIndex(lcoor); + int gidx=parallel_rng.generator_idx(o_idx,i_idx); + parallel_rng.SetState(tmp,gidx); }); timer.Stop(); @@ -723,7 +738,12 @@ class BinaryIO { std::vector iodata(lsites); thread_for(lidx,lsites,{ std::vector tmp(RngStateCount); - parallel_rng.GetState(tmp,lidx); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(lidx, lcoor); + int o_idx=grid->oIndex(lcoor); + int i_idx=grid->iIndex(lcoor); + int gidx=parallel_rng.generator_idx(o_idx,i_idx); + parallel_rng.GetState(tmp,gidx); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); }); timer.Stop(); diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index faacac63..76d7def4 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -47,7 +47,7 @@ static constexpr int Ym = 5; static constexpr int Zm = 6; static constexpr int Tm = 7; -static constexpr int Nc=3; +static constexpr int Nc=Config_Nc; static constexpr int Ns=4; static constexpr int Nd=4; static constexpr int Nhs=2; // half spinor diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index df1bce7c..3032a80c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -92,20 +92,16 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; - Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - - Coordinate lcoor; - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); - for (int site = 0; site < lvol; site++) { + thread_for(site, lvol, { + Coordinate lcoor; grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); - Qxinv = Zero(); //if (csw!=0){ for (int j = 0; j < Ns; j++) for (int k = 0; k < Ns; k++) @@ -126,21 +122,21 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // } pokeLocalSite(Qxinv, CTIv, lcoor); - } + }); } // Separate the even and odd parts pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); - 
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); - pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); + pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); + pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); - pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); - pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); + pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); + pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); } template diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index c2443dd0..98e8175a 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -159,6 +159,13 @@ private: Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, Resources.GetSerialRNG(), Resources.GetParallelRNG()); + } else { + // others + std::cout << GridLogError << "Unrecognized StartingType\n"; + std::cout + << GridLogError + << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + exit(1); } Smearing.set_Field(U); diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index b268b684..15516b56 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -51,7 +51,7 @@ public: private: template - static void baryon_site(const mobj &D1, + static void BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -61,8 +61,18 @@ public: const int parity, const bool * wick_contractions, robj &result); + template + static void BaryonSiteMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool * wick_contractions, + robj &result); public: - static void Wick_Contractions(std::string qi, + static void WickContractions(std::string qi, std::string qf, bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, @@ -75,8 +85,17 @@ public: const bool* wick_contractions, const int parity, ComplexField &baryon_corr); + static void ContractBaryonsMatrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr); template - static void ContractBaryons_Sliced(const mobj &D1, + static void ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -87,9 +106,20 @@ public: const int parity, const int nt, robj &result); + template + static void ContractBaryonsSlicedMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result); private: template - static void Baryon_Gamma_3pt_Group1_Site( + static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -101,7 +131,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group2_Site( + static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -113,7 +143,7 @@ public: robj &result); template - static void 
Baryon_Gamma_3pt_Group3_Site( + static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -125,7 +155,7 @@ public: robj &result); public: template - static void Baryon_Gamma_3pt( + static void BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -138,7 +168,7 @@ public: SpinMatrixField &stn_corr); private: template - static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -147,7 +177,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -159,7 +189,7 @@ public: template - static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -168,7 +198,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -179,7 +209,7 @@ public: robj &result); public: template - static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, + static void SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -189,7 +219,7 @@ public: const std::string op, SpinMatrixField &stn_corr); template - static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, + static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -217,7 +247,7 @@ const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; //This is the old version template template -void BaryonUtils::baryon_site(const mobj &D1, +void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_i, @@ -329,12 +359,132 @@ void BaryonUtils::baryon_site(const mobj &D1, }} } +//New version without parity projection or trace +template +template +void BaryonUtils::BaryonSiteMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const bool * wick_contraction, + robj &result) +{ + + auto D1_GAi = D1 * GammaA_i; + auto GAf_D1_GAi = GammaA_f * D1_GAi; + auto GBf_D1_GAi = GammaB_f * D1_GAi; + + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + //This is the \delta_{456}^{123} part + if (wick_contraction[0]){ + for (int rho_i=0; rho_i -void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { +void BaryonUtils::WickContractions(std::string qi, std::string qf, bool* wick_contractions) { const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; for 
(int ie=0; ie < 6 ; ie++) { wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 @@ -364,11 +514,6 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -397,13 +542,62 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); + BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites t += usecond(); - std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; +} + +template +void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + GridBase *grid = q1_left.Grid(); + + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); + + // Real bytes =0.; + // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); + // for (int ie=0; ie < 6 ; ie++){ + // if(ie==0 or ie==3){ + // bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // else{ + // bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // } + // Real t=0.; + // t =-usecond(); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + sobj result=Zero(); + BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + + // t += usecond(); + + // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } @@ -414,7 +608,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, * Wick_Contractions function above */ template template -void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, +void BaryonUtils::ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -429,16 +623,33 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB 
(right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); for (int t=0; t +template +void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + for (int t=0; t::ContractBaryons_Sliced(const mobj &D1, * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( +void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -546,7 +757,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( +void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -636,7 +847,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( +void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -728,7 +939,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( * https://aportelli.github.io/Hadrons-doc/#/mcontraction */ template template -void BaryonUtils::Baryon_Gamma_3pt( +void BaryonUtils::BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -751,7 +962,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 2) { @@ -759,7 +970,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 3) { @@ -767,7 +978,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites @@ -787,7 +998,7 @@ void BaryonUtils::Baryon_Gamma_3pt( * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -838,7 +1049,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const 
mobj2 &Du_spec, const mobj &Dd_tf, @@ -897,7 +1108,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -948,7 +1159,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -1002,7 +1213,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, template template -void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, +void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -1029,9 +1240,9 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } @@ -1041,7 +1252,7 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, template template -void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, +void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -1071,9 +1282,9 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 0cc0cc1a..69ab4ebb 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -449,7 +449,8 @@ public: LatticeReal alpha(grid); // std::cout< static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) { typedef typename GaugeField::vector_type vector_type; @@ -799,6 +799,89 @@ public: } }; +template +LatticeComplexD Determinant(const Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + auto lvol = grid->lSites(); + LatticeComplexD ret(grid); + + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,lvol,{ + Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + iScalar > > Us; + peekLocalSite(Us, Umu_v, lcoor); + for(int i=0;i +static void ProjectSUn(Lattice > > > &Umu) +{ + Umu = 
ProjectOnGroup(Umu); + auto det = Determinant(Umu); + + det = pow(det,-1); + + for(int i=0;i(Umu,N-1,i); + element = element * det; + PokeIndex(Umu,element,Nc-1,i); + } +} +template +static void ProjectSUn(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSUn(Umu); + PokeIndex(U,Umu,mu); + } +} +// Explicit specialisation for SU(3). +// Explicit specialisation for SU(3). +static void +ProjectSU3 (Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + const int x=0; + const int y=1; + const int z=2; + // Reunitarise + Umu = ProjectOnGroup(Umu); + autoView(Umu_v,Umu,CpuWrite); + thread_for(ss,grid->oSites(),{ + auto cm = Umu_v[ss]; + cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy + cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz + cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx + Umu_v[ss]=cm; + }); +} +static void ProjectSU3(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSU3(Umu); + PokeIndex(U,Umu,mu); + } +} + typedef SU<2> SU2; typedef SU<3> SU3; typedef SU<4> SU4; diff --git a/Grid/serialisation/JSON_IO.cc b/Grid/serialisation/JSON_IO.cc index aca8bab3..f2282099 100644 --- a/Grid/serialisation/JSON_IO.cc +++ b/Grid/serialisation/JSON_IO.cc @@ -26,7 +26,7 @@ *************************************************************************************/ /* END LEGAL */ #include -#ifndef __NVCC__ +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) NAMESPACE_BEGIN(Grid); diff --git a/Grid/simd/Grid_vector_unops.h b/Grid/simd/Grid_vector_unops.h index d225699b..b89bb785 100644 --- a/Grid/simd/Grid_vector_unops.h +++ b/Grid/simd/Grid_vector_unops.h @@ -125,14 +125,6 @@ accelerator_inline Grid_simd sqrt(const Grid_simd &r) { return SimdApply(SqrtRealFunctor(), r); } template -accelerator_inline Grid_simd rsqrt(const Grid_simd &r) { - return SimdApply(RSqrtRealFunctor(), r); -} -template -accelerator_inline Scalar rsqrt(const Scalar &r) { - return (RSqrtRealFunctor(), r); -} -template accelerator_inline Grid_simd cos(const Grid_simd &r) { return SimdApply(CosRealFunctor(), r); } diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index 1ef9fc23..bbaa4a00 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -95,14 +95,18 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) vtype nrm; vtype inner; for(int c1=0;c1memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { #if 0 @@ -328,10 +333,12 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ free(ptr);}; +inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ 
hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -369,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline int acceleratorIsCommunicable(void *ptr){ return 1; } +inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; @@ -393,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN, inline void acceleratorFreeCpu (void *ptr){free(ptr);}; #endif + + /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 37d16176..9be39e94 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv) LebesgueOrder::UseLebesgueOrder=1; } CartesianCommunicator::nCommThreads = 1; +#ifdef GRID_COMMS_THREADS if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); assert(CartesianCommunicator::nCommThreads > 0); } +#endif if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); diff --git a/README b/README index 86506f52..0beabff3 100644 --- a/README +++ b/README @@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= +../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision, -`--enable-simd=` set the SIMD type, `--enable- +where `--enable-simd=` set the SIMD type, `--enable- comms=`, and `` should be replaced by the prefix path where you want to install Grid. Other options are detailed in the next section, you can also use `configure --help` to display them. Like with any other program using GNU autotool, the @@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`. - `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). -- `--enable-precision={single|double}`: set the default precision (default: `double`). -- `--enable-precision=`: Use `` for message passing (default: `none`). 
A list of possible SIMD targets is detailed in a section below. +- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** +- `--enable-comms=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. - `--enable-chroma`: enable Chroma regression tests. @@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used: The following configuration is recommended for the Intel Knights Landing platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation. We recommend four ranks per node for best performance, but optimum is local volume dependent. ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi3-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu The following configuration is recommended for the Intel Haswell platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -262,8 +257,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -280,8 +274,7 @@ This is the default. The following configuration is recommended for the Intel Skylake platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=mpiicpc @@ -298,8 +291,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -330,8 +322,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform. 
``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ CXX=mpicxx ``` diff --git a/README.md b/README.md index 9f690ce0..4cbae720 100644 --- a/README.md +++ b/README.md @@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= +../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision, -`--enable-simd=` set the SIMD type, `--enable- +where `--enable-simd=` set the SIMD type, `--enable- comms=`, and `` should be replaced by the prefix path where you want to install Grid. Other options are detailed in the next section, you can also use `configure --help` to display them. Like with any other program using GNU autotool, the @@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`. - `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). -- `--enable-precision={single|double}`: set the default precision (default: `double`). -- `--enable-precision=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. +- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** +- `--enable-comms=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. - `--enable-chroma`: enable Chroma regression tests. @@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used: The following configuration is recommended for the Intel Knights Landing platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation. We recommend four ranks per node for best performance, but optimum is local volume dependent. ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi3-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu The following configuration is recommended for the Intel Haswell platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -266,8 +261,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. 
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -284,8 +278,7 @@ This is the default. The following configuration is recommended for the Intel Skylake platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=mpiicpc @@ -302,8 +295,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -334,8 +326,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform. ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ CXX=mpicxx ``` diff --git a/SVE_README.txt b/SVE_README.txt index 0c167c4a..cefec4be 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64 scl enable gcc-toolset-10 bash -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login scl enable gcc-toolset-10 bash module load mpi/openmpi-aarch64 -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" ------------------------------------------------------------------------------ * armclang 20.2 (qp4) -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" ------------------------------------------------------------------------------ * gcc 10.0.1 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static 
MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static * gcc 10.0.1 fixed-size ACLE (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI @@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1 export MPICH_CC=gcc-10.0.1 export MPICH_CXX=g++-10.0.1 -$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" +$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" -------------------------------------------------------- * armclang 20.0 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static TODO check ARMCLANGCOMPAT * armclang 20.1 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static TODO check ARMCLANGCOMPAT * 
armclang 20.1 VLA (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" TODO check ARMCLANGCOMPAT * armclang 20.1 VLA w/MPI (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" No ARMCLANGCOMPAT -> still correct ? @@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ? * Fujitsu fcc -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" * Fujitsu fcc w/ MPI -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index c8c0937f..87e7224d 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -1,10 +1,18 @@ - #include "Benchmark_IO.hpp" -#ifndef BENCH_IO_LMAX -#define BENCH_IO_LMAX 40 +#ifndef BENCH_IO_LMIN +#define BENCH_IO_LMIN 8 #endif +#ifndef BENCH_IO_LMAX +#define BENCH_IO_LMAX 32 +#endif + +#ifndef BENCH_IO_NPASS +#define BENCH_IO_NPASS 10 +#endif + +#ifdef HAVE_LIME using namespace Grid; std::string filestem(const int l) @@ -12,37 +20,182 @@ std::string filestem(const int l) return "iobench_l" + std::to_string(l); } +int vol(const int i) +{ + return BENCH_IO_LMIN + 2*i; +} + +int volInd(const int l) +{ + return (l - BENCH_IO_LMIN)/2; +} + +template +void stats(Mat &mean, Mat &stdDev, const std::vector &data) +{ + auto nr = data[0].rows(), nc = data[0].cols(); + 
Eigen::MatrixXd sqSum(nr, nc); + double n = static_cast(data.size()); + + assert(n > 1.); + mean = Mat::Zero(nr, nc); + sqSum = Mat::Zero(nr, nc); + for (auto &d: data) + { + mean += d; + sqSum += d.cwiseProduct(d); + } + stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt(); + mean /= n; +} + +#define grid_printf(...) \ +{\ + char _buf[1024];\ + sprintf(_buf, __VA_ARGS__);\ + MSG << _buf;\ +} + +enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3}; + int main (int argc, char ** argv) { -#ifdef HAVE_LIME Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); + int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1; + unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1; + std::vector perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4)); + std::vector avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4)); + std::vector latt; + MSG << "Grid is setup to use " << threads << " threads" << std::endl; - MSG << SEP << std::endl; - MSG << "Benchmark Lime write" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) + MSG << "MPI partition " << mpi << std::endl; + for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + MSG << BIGSEP << std::endl; + MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl; + MSG << BIGSEP << std::endl; + MSG << SEP << std::endl; + MSG << "Benchmark std write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - std::cout << "-- Local volume " << l << "^4" << std::endl; - writeBenchmark(latt, filestem(l), limeWrite); + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), stdWrite); + perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } + + MSG << SEP << std::endl; + MSG << "Benchmark std read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), stdRead); + perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond; + } + + #ifdef HAVE_LIME + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), limeWrite); + perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } + + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), limeRead); + perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond; + } +#endif + avPerf[i].fill(0.); + for (int f = 0; f < 4; ++f) + for (int l = 24; l <= BENCH_IO_LMAX; l += 2) + { + avPerf[i](f) += perf[i](volInd(l), f); + } + avPerf[i] /= nRelVol; } - MSG << "Benchmark Lime read" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) - { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], 
l*mpi[3]}; + Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4); + Eigen::VectorXd avMean(4), avStdDev(4), avRob(4); + double n = BENCH_IO_NPASS; - std::cout << "-- Local volume " << l << "^4" << std::endl; - readBenchmark(latt, filestem(l), limeRead); + stats(mean, stdDev, perf); + stats(avMean, avStdDev, avPerf); + rob.fill(100.); + rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs()); + avRob.fill(100.); + avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs()); + + MSG << BIGSEP << std::endl; + MSG << "SUMMARY" << std::endl; + MSG << BIGSEP << std::endl; + MSG << "Summary of individual results (all results in MB/s)." << std::endl; + MSG << "Every second column gives the standard deviation of the previous column." << std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", + "L", "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), + mean(volInd(l), sWrite), stdDev(volInd(l), sWrite), + mean(volInd(l), gRead), stdDev(volInd(l), gRead), + mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); } + MSG << std::endl; + MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s\n", + "L", "std read", "std write", "Grid read", "Grid write"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", + l, rob(volInd(l), sRead), rob(volInd(l), sWrite), + rob(volInd(l), gRead), rob(volInd(l), gWrite)); + } + MSG << std::endl; + MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl; + MSG << "Every second column gives the standard deviation of the previous column." << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", + "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), + avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); + MSG << std::endl; + MSG << "Robustness of volume-averaged results, in %. 
(rob = 100% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s\n", + "std read", "std write", "Grid read", "Grid write"); + grid_printf("%12.1f %12.1f %12.1f %12.1f\n", + avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite)); Grid_finalize(); -#endif + return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index d3416353..2ff42d52 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -2,10 +2,12 @@ #define Benchmark_IO_hpp_ #include -#ifdef HAVE_LIME #define MSG std::cout << GridLogMessage #define SEP \ +"-----------------------------------------------------------------------------" +#define BIGSEP \ "=============================================================================" +#ifdef HAVE_LIME namespace Grid { @@ -14,13 +16,152 @@ using WriterFn = std::function ; template using ReaderFn = std::function; +// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API +// +// template +// void stdWrite(const std::string filestem, Field &vec) +// { +// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); +// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); +// size_t size; +// uint32_t crc; +// GridStopWatch ioWatch, crcWatch; + +// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); +// autoView(vec_v, vec, CpuRead); +// crcWatch.Start(); +// crc = GridChecksum::crc32(vec_v.cpu_ptr, size); +// std::fwrite(&crc, sizeof(uint32_t), 1, file); +// crcWatch.Stop(); +// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; +// ioWatch.Start(); +// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); +// ioWatch.Stop(); +// std::fclose(file); +// size *= vec.Grid()->ProcessorCount(); +// auto &p = BinaryIO::lastPerf; +// p.size = size; +// p.time = ioWatch.useconds(); +// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); +// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() +// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; +// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; +// } +// +// template +// void stdRead(Field &vec, const std::string filestem) +// { +// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); +// std::FILE *file = std::fopen((filestem + "." 
+ rankStr + ".bin").c_str(), "rb"); +// size_t size; +// uint32_t crcRead, crcData; +// GridStopWatch ioWatch, crcWatch; + +// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); +// crcWatch.Start(); +// std::fread(&crcRead, sizeof(uint32_t), 1, file); +// crcWatch.Stop(); +// { +// autoView(vec_v, vec, CpuWrite); +// ioWatch.Start(); +// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); +// ioWatch.Stop(); +// std::fclose(file); +// } +// { +// autoView(vec_v, vec, CpuRead); +// crcWatch.Start(); +// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); +// crcWatch.Stop(); +// } +// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; +// assert(crcData == crcRead); +// size *= vec.Grid()->ProcessorCount(); +// auto &p = BinaryIO::lastPerf; +// p.size = size; +// p.time = ioWatch.useconds(); +// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); +// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() +// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; +// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; +// } + +template +void stdWrite(const std::string filestem, Field &vec) +{ + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary); + size_t size, sizec; + uint32_t crc; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + sizec = size/sizeof(char); // just in case of... + autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crc = GridChecksum::crc32(vec_v.cpu_ptr, size); + file.write(reinterpret_cast(&crc), sizeof(uint32_t)/sizeof(char)); + crcWatch.Stop(); + MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; + ioWatch.Start(); + file.write(reinterpret_cast(vec_v.cpu_ptr), sizec); + file.flush(); + ioWatch.Stop(); + size *= vec.Grid()->ProcessorCount(); + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; +} + +template +void stdRead(Field &vec, const std::string filestem) +{ + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); + size_t size, sizec; + uint32_t crcRead, crcData; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + sizec = size/sizeof(char); // just in case of... 
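+ // read back the CRC32 recorded by stdWrite, then the raw field bytes; the data is re-checksummed after reading to verify integrity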
+ crcWatch.Start(); + file.read(reinterpret_cast(&crcRead), sizeof(uint32_t)/sizeof(char)); + crcWatch.Stop(); + { + autoView(vec_v, vec, CpuWrite); + ioWatch.Start(); + file.read(reinterpret_cast(vec_v.cpu_ptr), sizec); + ioWatch.Stop(); + } + { + autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); + crcWatch.Stop(); + } + MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; + assert(crcData == crcRead); + size *= vec.Grid()->ProcessorCount(); + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; +} + template void limeWrite(const std::string filestem, Field &vec) { emptyUserRecord record; ScidacWriter binWriter(vec.Grid()->IsBoss()); - binWriter.open(filestem + ".bin"); + binWriter.open(filestem + ".lime.bin"); binWriter.writeScidacFieldRecord(vec, record); binWriter.close(); } @@ -31,7 +172,7 @@ void limeRead(Field &vec, const std::string filestem) emptyUserRecord record; ScidacReader binReader; - binReader.open(filestem + ".bin"); + binReader.open(filestem + ".lime.bin"); binReader.readScidacFieldRecord(vec, record); binReader.close(); } @@ -73,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename, auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; + std::random_device rd; makeGrid(gPt, gBasePt, Ls, rb); - GridBase *g = gPt.get(); - GridParallelRNG rng(g); - Field vec(g); + GridBase *g = gPt.get(); + GridParallelRNG rng(g); + Field vec(g); + + rng.SeedFixedIntegers({static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd())}); random(rng, vec); write(filename, vec); @@ -96,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename, makeGrid(gPt, gBasePt, Ls, rb); - GridBase *g = gPt.get(); - Field vec(g); + GridBase *g = gPt.get(); + Field vec(g); read(vec, filename); } diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index 6e6c9ae0..8252547b 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -1,14 +1,9 @@ #include "Benchmark_IO.hpp" - -#define MSG std::cout << GridLogMessage -#define SEP \ -"=============================================================================" - +#ifdef HAVE_LIME using namespace Grid; int main (int argc, char ** argv) { -#ifdef HAVE_LIME std::vector dir; unsigned int Ls; bool rb; @@ -34,46 +29,74 @@ int main (int argc, char ** argv) } Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + MSG << "Grid is setup to use " << threads << " threads" << std::endl; - MSG << SEP << std::endl; - MSG << "Benchmark double precision Lime write" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << std::endl; - writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); - } + MSG << "MPI partition " << mpi << std::endl; MSG << SEP << std::endl; - MSG << "Benchmark double precision Lime read" << std::endl; + MSG << "Benchmark Grid std write" << std::endl; 
MSG << SEP << std::endl; for (auto &d: dir) { MSG << "-- Directory " << d << std::endl; - readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); + writeBenchmark(GridDefaultLatt(), d + "/ioBench", + stdWrite, Ls, rb); + } + MSG << SEP << std::endl; + MSG << "Benchmark Grid std read" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + readBenchmark(GridDefaultLatt(), d + "/ioBench", + stdRead, Ls, rb); } +#ifdef HAVE_LIME MSG << SEP << std::endl; - MSG << "Benchmark single precision Lime write" << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; MSG << SEP << std::endl; for (auto &d: dir) { MSG << "-- Directory " << d << std::endl; - writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); + writeBenchmark(GridDefaultLatt(), d + "/ioBench", + limeWrite, Ls, rb); } + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + readBenchmark(GridDefaultLatt(), d + "/ioBench", + limeRead, Ls, rb); + } +#endif - MSG << SEP << std::endl; - MSG << "Benchmark single precision Lime read" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << std::endl; - readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); - } + // MSG << SEP << std::endl; + // MSG << "Benchmark single precision Lime write" << std::endl; + // MSG << SEP << std::endl; + // for (auto &d: dir) + // { + // MSG << "-- Directory " << d << std::endl; + // writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); + // } + + // MSG << SEP << std::endl; + // MSG << "Benchmark single precision Lime read" << std::endl; + // MSG << SEP << std::endl; + // for (auto &d: dir) + // { + // MSG << "-- Directory " << d << std::endl; + // readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); + // } Grid_finalize(); -#endif + return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index dc09549c..5d602ce9 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -62,7 +62,7 @@ struct time_statistics{ void comms_header(){ std::cout <({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ @@ -247,11 +249,6 @@ public: double start=usecond(); for(int i=0;i > LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + NN =Grid.NodeCount(); + + + LatticeSU4 z(&Grid); z=Zero(); + LatticeSU4 x(&Grid); x=Zero(); + LatticeSU4 y(&Grid); y=Zero(); + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;iRankCount(); @@ -291,11 +344,11 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout<::HotConfiguration(RNG4,Umu); Fermion 
src (FGrid); random(RNG5,src); Fermion src_e (FrbGrid); Fermion src_o (FrbGrid); @@ -369,7 +422,7 @@ public: } FGrid->Barrier(); double t1=usecond(); - uint64_t ncall = 50; + uint64_t ncall = 500; FGrid->Broadcast(0,&ncall,sizeof(ncall)); @@ -387,7 +440,17 @@ public: FGrid->Barrier(); double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflopsRankCount(); @@ -447,7 +512,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout<::HotConfiguration(RNG4,Umu); typename Action::ImplParams params; Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); @@ -596,11 +660,12 @@ int main (int argc, char ** argv) #endif Benchmark::Decomposition(); + int do_su4=1; int do_memory=1; int do_comms =1; - int sel=2; - std::vector L_list({16,24,32}); + int sel=4; + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector wilson; @@ -624,7 +689,6 @@ int main (int argc, char ** argv) dwf4.push_back(result); } - /* std::cout<1) ) { std::cout< > xbuf(8); - std::vector > rbuf(8); + std::vector > xbuf(8); + std::vector > rbuf(8); for(int mu=0;mu<8;mu++){ xbuf[mu].resize(lat*lat*lat*Ls); diff --git a/benchmarks/Benchmark_comms_host_device.cc b/benchmarks/Benchmark_comms_host_device.cc new file mode 100644 index 00000000..591b5597 --- /dev/null +++ b/benchmarks/Benchmark_comms_host_device.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_comms.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< > xbuf(8); + std::vector > rbuf(8); + + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + } + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< xbuf(8); + std::vector rbuf(8); + + uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; #if 0 Umu=1.0; diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc new file mode 100644 index 00000000..cb86177e --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -0,0 +1,364 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 
2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=8; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,FGrid); + { + autoView( Umu5d_v, Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); + for(int ss=0;ssoSites();ss++){ + for(int s=0;s(Umu5d,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;mu_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< 
"*****************************************************************" <Barrier(); + Dw.ZeroCounters(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + Dw.Report(); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;mu1.0e-4)){ +/* + std::cout<< "DAG RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " <::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; RealD mass=0.1; diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index ef16c908..0aaccecc 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -30,7 +30,7 @@ Author: paboyle using namespace std; using namespace Grid; - ; + int main (int argc, char ** argv) @@ -53,7 +53,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); std::cout << GridLogMessage << "Seeded"<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "made random gauge fields"< -where:: - - --enable-precision=single|double - -sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations, -and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.:: +:: --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX @@ -236,7 +231,7 @@ Detailed build configuration options --enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). --enable-simd=code setup Grid for the SIMD target ``(default: `GEN`). A list of possible SIMD targets is detailed in a section below. --enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes. - --enable-precision=single|double set the default precision (default: `double`). + --enable-precision=single|double set the default precision (default: `double`). **Deprecated option** --enable-comms=mpi|none use `` for message passing (default: `none`). --enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`). --disable-timers disable system dependent high-resolution timers. @@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform The following configuration is recommended for the Intel Knights Landing platform:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared for interior communication. We recommend four ranks per node for best performance, but optimum is local volume dependent. 
:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform The following configuration is recommended for the Intel Haswell platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -360,8 +351,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform The following configuration is recommended for the Intel Skylake platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX512 \ + ../configure --enable-simd=AVX512 \ --enable-comms=mpi \ --enable-mkl \ CXX=mpiicpc @@ -396,8 +385,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=AVX512 \ + ../configure --enable-simd=AVX512 \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -422,8 +410,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi \ CXX=mpicxx diff --git a/scripts/filelist b/scripts/filelist index 78747315..27425a3e 100755 --- a/scripts/filelist +++ b/scripts/filelist @@ -26,11 +26,10 @@ for subdir in $dirs; do echo "tests-local: ${TESTLIST} " > Make.inc echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc echo >> Make.inc - HADLINK=`[ $subdir = './hadrons' ] && echo '-lHadrons '` for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=${HADLINK}-lGrid >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done if [ $subdir != '.' ]; then @@ -49,7 +48,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done cd .. @@ -65,7 +64,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc echo >> Make.inc done cd .. 
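The test changes that follow all apply one mechanical substitution: the fixed `SU3` helper is replaced by the templated `SU<Nc>` interface, so the tests track Grid's compile-time number of colours. A minimal self-contained sketch of the pattern, assuming Grid's templated `SU<N>` group class and the grid/RNG helpers already used elsewhere in this patch:

``` cpp
// Sketch only: hot (random) gauge configuration through the templated
// SU<Nc> interface; with Nc == 3 this matches the old SU3:: calls.
#include <Grid/Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian *UGrid =
    SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                   GridDefaultSimd(Nd, vComplexD::Nsimd()),
                                   GridDefaultMpi());

  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG pRNG(UGrid);
  pRNG.SeedFixedIntegers(seeds);

  LatticeGaugeField Umu(UGrid);
  SU<Nc>::HotConfiguration(pRNG, Umu); // was: SU3::HotConfiguration(pRNG, Umu)

  Grid_finalize();
  return 0;
}
```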
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc index 369acfd1..d6e21101 100644 --- a/tests/IO/Test_ildg_io.cc +++ b/tests/IO/Test_ildg_io.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - SU3::HotConfiguration(pRNGa,Umu); + SU::HotConfiguration(pRNGa,Umu); FieldMetaData header; diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index f5413e3b..c15c320e 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -84,7 +84,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - SU3::HotConfiguration(pRNGa,Umu); + SU::HotConfiguration(pRNGa,Umu); FieldMetaData header; std::string file("./ckpoint_lat.4000"); diff --git a/tests/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc index 0e71d910..c345efd9 100644 --- a/tests/Test_cayley_even_odd_vec.cc +++ b/tests/Test_cayley_even_odd_vec.cc @@ -80,7 +80,7 @@ int main (int argc, char ** argv) GridParallelRNG sRNG5(sFGrid); sRNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); RealD mass=0.1; RealD M5 =1.8; diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc index 8eb7a921..dc22cfca 100644 --- a/tests/Test_compressed_lanczos_hot_start.cc +++ b/tests/Test_compressed_lanczos_hot_start.cc @@ -202,7 +202,7 @@ int main (int argc, char ** argv) { std::vector seeds4({1,2,3,4}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); // FieldMetaData header; // NerscIO::readConfiguration(Umu,header,Params.config); diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index be881db9..da0b54cd 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -71,7 +71,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldF Umu_f(UGrid_f); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(Umu_f,Umu); diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc index 4d94632c..8b0126dc 100644 --- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc +++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldF Umu_f(UGrid_f); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(Umu_f,Umu); diff --git a/tests/core/Test_cf_coarsen_support.cc b/tests/core/Test_cf_coarsen_support.cc index e787905e..ad0309b9 100644 --- a/tests/core/Test_cf_coarsen_support.cc +++ b/tests/core/Test_cf_coarsen_support.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); LatticeFermion err(FGrid); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); // std::vector U(4,UGrid); // for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/core/Test_dwf_eofa_even_odd.cc b/tests/core/Test_dwf_eofa_even_odd.cc index 01fff9ea..64701069 100644 --- a/tests/core/Test_dwf_eofa_even_odd.cc +++ b/tests/core/Test_dwf_eofa_even_odd.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion tmp (FGrid); tmp = Zero(); 
LatticeFermion err (FGrid); err = Zero(); - LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); + LatticeGaugeField Umu (UGrid); SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_dwf_even_odd.cc b/tests/core/Test_dwf_even_odd.cc index 6093ee8f..4918f02a 100644 --- a/tests/core/Test_dwf_even_odd.cc +++ b/tests/core/Test_dwf_even_odd.cc @@ -72,7 +72,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); tmp=Zero(); LatticeFermion err(FGrid); tmp=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 2ba3752b..212b1a35 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -138,7 +138,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(&GRID); - SU3::ColdConfiguration(pRNG,Umu); // Unit gauge + SU::ColdConfiguration(pRNG,Umu); // Unit gauge // Umu=Zero(); //////////////////////////////////////////////////// // Wilson test diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 228770a8..87dbc242 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -73,11 +73,11 @@ int main (int argc, char ** argv) LatticeColourMatrix xform2(&GRID); // Gauge xform LatticeColourMatrix xform3(&GRID); // Gauge xform - SU3::ColdConfiguration(pRNG,Umu); // Unit gauge + SU::ColdConfiguration(pRNG,Umu); // Unit gauge Uorg=Umu; Urnd=Umu; - SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge + SU::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge Real plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(RNG4_2f,Umu_2f); StandardFermionField src (FGrid_2f); StandardFermionField tmpsrc(FGrid_2f); diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index bf37f4d5..69ace859 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) FermionField ref(&Grid); ref=Zero(); FermionField tmp(&Grid); tmp=Zero(); FermionField err(&Grid); tmp=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/core/Test_lie_generators.cc b/tests/core/Test_lie_generators.cc index 471cea25..e044378c 100644 --- a/tests/core/Test_lie_generators.cc +++ b/tests/core/Test_lie_generators.cc @@ -66,7 +66,7 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "*********************************************" << std::endl; - std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl; + std::cout << GridLogMessage << "* Generators for SU(Nc)" << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; SU3::printGenerators(); @@ -114,8 +114,8 @@ int main(int argc, char** argv) { LatticeGaugeField U(grid), V(grid); - SU::HotConfiguration(gridRNG, U); - SU::HotConfiguration(gridRNG, V); + SU3::HotConfiguration(gridRNG, U); + SU3::HotConfiguration(gridRNG, V); // Adjoint representation 
// Test group structure @@ -123,8 +123,8 @@ int main(int argc, char** argv) { LatticeGaugeField UV(grid); UV = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu = peekLorentz(U,mu); - SU::LatticeMatrix Vmu = peekLorentz(V,mu); + SU3::LatticeMatrix Umu = peekLorentz(U,mu); + SU3::LatticeMatrix Vmu = peekLorentz(V,mu); pokeLorentz(UV,Umu*Vmu, mu); } @@ -151,16 +151,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_adj(grid); + SU3::LatticeAlgebraVector h_adj(grid); typename AdjointRep::LatticeMatrix Ar(grid); random(gridRNG,h_adj); h_adj = real(h_adj); SU_Adjoint::AdjointLieAlgebraMatrix(h_adj,Ar); // Re-extract h_adj - SU::LatticeAlgebraVector h_adj2(grid); + SU3::LatticeAlgebraVector h_adj2(grid); SU_Adjoint::projectOnAlgebra(h_adj2, Ar); - SU::LatticeAlgebraVector h_diff = h_adj - h_adj2; + SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2; std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl; // Exponentiate @@ -183,14 +183,14 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af(grid); - SU::FundamentalLieAlgebraMatrix(h_adj,Af); - SU::LatticeMatrix Ufund(grid); + SU3::LatticeMatrix Af(grid); + SU3::FundamentalLieAlgebraMatrix(h_adj,Af); + SU3::LatticeMatrix Ufund(grid); Ufund = expMat(Af, 1.0, 16); // Check unitarity - SU::LatticeMatrix uno_f(grid); + SU3::LatticeMatrix uno_f(grid); uno_f = 1.0; - SU::LatticeMatrix UnitCheck(grid); + SU3::LatticeMatrix UnitCheck(grid); UnitCheck = Ufund * adj(Ufund) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck) << std::endl; @@ -311,14 +311,14 @@ int main(int argc, char** argv) { // Test group structure // (U_f * V_f)_r = U_r * V_r LatticeGaugeField U2(grid), V2(grid); - SU::HotConfiguration(gridRNG, U2); - SU::HotConfiguration(gridRNG, V2); + SU3::HotConfiguration(gridRNG, U2); + SU3::HotConfiguration(gridRNG, V2); LatticeGaugeField UV2(grid); UV2 = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu2 = peekLorentz(U2,mu); - SU::LatticeMatrix Vmu2 = peekLorentz(V2,mu); + SU3::LatticeMatrix Umu2 = peekLorentz(U2,mu); + SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu); pokeLorentz(UV2,Umu2*Vmu2, mu); } @@ -345,16 +345,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_sym(grid); + SU3::LatticeAlgebraVector h_sym(grid); typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid); random(gridRNG,h_sym); h_sym = real(h_sym); SU_TwoIndex::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym); // Re-extract h_sym - SU::LatticeAlgebraVector h_sym2(grid); + SU3::LatticeAlgebraVector h_sym2(grid); SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym); - SU::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2; + SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl; @@ -379,11 +379,11 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af_sym(grid); - SU::FundamentalLieAlgebraMatrix(h_sym,Af_sym); - SU::LatticeMatrix Ufund2(grid); + SU3::LatticeMatrix Af_sym(grid); + SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym); + SU3::LatticeMatrix Ufund2(grid); Ufund2 = expMat(Af_sym, 
1.0, 16); - SU::LatticeMatrix UnitCheck2(grid); + SU3::LatticeMatrix UnitCheck2(grid); UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2) << std::endl; @@ -421,14 +421,14 @@ int main(int argc, char** argv) { // Test group structure // (U_f * V_f)_r = U_r * V_r LatticeGaugeField U2A(grid), V2A(grid); - SU::HotConfiguration(gridRNG, U2A); - SU::HotConfiguration(gridRNG, V2A); + SU3::HotConfiguration(gridRNG, U2A); + SU3::HotConfiguration(gridRNG, V2A); LatticeGaugeField UV2A(grid); UV2A = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu2A = peekLorentz(U2,mu); - SU::LatticeMatrix Vmu2A = peekLorentz(V2,mu); + SU3::LatticeMatrix Umu2A = peekLorentz(U2,mu); + SU3::LatticeMatrix Vmu2A = peekLorentz(V2,mu); pokeLorentz(UV2A,Umu2A*Vmu2A, mu); } @@ -455,16 +455,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_Asym(grid); + SU3::LatticeAlgebraVector h_Asym(grid); typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid); random(gridRNG,h_Asym); h_Asym = real(h_Asym); SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym); // Re-extract h_sym - SU::LatticeAlgebraVector h_Asym2(grid); + SU3::LatticeAlgebraVector h_Asym2(grid); SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym); - SU::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2; + SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl; @@ -489,11 +489,11 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af_Asym(grid); - SU::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); - SU::LatticeMatrix Ufund2A(grid); + SU3::LatticeMatrix Af_Asym(grid); + SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); + SU3::LatticeMatrix Ufund2A(grid); Ufund2A = expMat(Af_Asym, 1.0, 16); - SU::LatticeMatrix UnitCheck2A(grid); + SU3::LatticeMatrix UnitCheck2A(grid); UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A) << std::endl; diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index af8b747b..d7ed04ba 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -444,7 +444,7 @@ int main(int argc, char **argv) { // Lattice 12x12 GEMM scFooBar = scFoo * scBar; - // Benchmark some simple operations LatticeSU3 * Lattice SU3. + // Benchmark some simple operations LatticeSU * Lattice SU. 
double t0, t1, flops; double bytes; int ncall = 5000; diff --git a/tests/core/Test_mobius_eofa_even_odd.cc b/tests/core/Test_mobius_eofa_even_odd.cc index 68091229..7339f156 100644 --- a/tests/core/Test_mobius_eofa_even_odd.cc +++ b/tests/core/Test_mobius_eofa_even_odd.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion err (FGrid); err = Zero(); - LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); + LatticeGaugeField Umu (UGrid); SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_quenched_update.cc b/tests/core/Test_quenched_update.cc index ef428d1b..22675913 100644 --- a/tests/core/Test_quenched_update.cc +++ b/tests/core/Test_quenched_update.cc @@ -55,7 +55,7 @@ int main (int argc, char ** argv) GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); - // SU3 colour operatoions + // SU colour operations LatticeColourMatrix link(grid); LatticeColourMatrix staple(grid); @@ -87,10 +87,10 @@ int main (int argc, char ** argv) link = PeekIndex(Umu,mu); - for( int subgroup=0;subgroup::su2subgroups();subgroup++ ) { // update Even checkerboard - SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); + SU::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); } diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc new file mode 100644 index 00000000..3e78b961 --- /dev/null +++ b/tests/core/Test_reunitarise.cc @@ -0,0 +1,137 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc new file mode 100644 index 00000000..3e78b961 --- /dev/null +++ b/tests/core/Test_reunitarise.cc @@ -0,0 +1,137 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> +Author: Peter Boyle <paboyle@ph.ed.ac.uk> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include <Grid/Grid.h> + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector<int> latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD UU(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector<int> sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU<Nc>::HotConfiguration(pRNG,Umu); + + U = PeekIndex<LorentzIndex>(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<<Def1<<std::endl; + for(int i=0;i<Nc;i++){ + auto element = PeekIndex<ColourIndex>(U,Nc-1,i); + element = element * phase; + PokeIndex<ColourIndex>(U,element,Nc-1,i); + } + UU=U; + + detU= Determinant(U) ; + std::cout << "Determinant after screw up " <<norm2(detU)<<std::endl; + ProjectSUn(UU); + detUU= Determinant(UU); + std::cout << "Determinant ProjectSUn " <<norm2(detUU)<<std::endl;
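Test_reunitarise.cc checks that projecting back onto the group repairs both unitarity and the determinant. A self-contained sketch of the same check, assuming the Grid calls the test itself uses (ProjectSUn, Determinant, Peek/PokeIndex<ColourIndex>); the phase angle and seeds are arbitrary:

// Sketch: break det(U)=1 by phasing one row, then project back onto SU(Nc).
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  std::vector<int> latt({8, 8, 8, 8});
  GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(
      latt, GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridParallelRNG pRNG(grid);
  pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4, 5}));

  LatticeGaugeFieldD Umu(grid);
  SU<Nc>::HotConfiguration(pRNG, Umu);

  LatticeColourMatrixD ident(grid); ident = 1.0;
  LatticeColourMatrixD U(grid);
  U = PeekIndex<LorentzIndex>(Umu, 0);

  // Phase the last row: U stays unitary but det(U) = exp(i*theta) != 1.
  RealD theta = 0.3; // arbitrary
  ComplexD phase(cos(theta), sin(theta));
  for (int i = 0; i < Nc; i++) {
    auto element = PeekIndex<ColourIndex>(U, Nc - 1, i);
    element = element * phase;
    PokeIndex<ColourIndex>(U, element, Nc - 1, i);
  }

  ProjectSUn(U); // back onto the group

  LatticeColourMatrixD tmp(grid);
  tmp = U * adj(U) - ident;
  LatticeComplexD detU(grid), one(grid);
  one  = ComplexD(1.0, 0.0);
  detU = Determinant(U);
  detU = detU - one;
  std::cout << "unitarity defect   " << norm2(tmp)  << std::endl; // ~0
  std::cout << "determinant defect " << norm2(detU) << std::endl; // ~0

  Grid_finalize();
}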
SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index 3d175890..6ab15873 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -75,7 +75,7 @@ int main (int argc, char ** argv) FermionField phi (FGrid); random(pRNG5,phi); FermionField chi (FGrid); random(pRNG5,chi); - LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu); + LatticeGaugeField Umu(UGrid); SU<Nc>::ColdConfiguration(pRNG4,Umu); LatticeGaugeField Umua(UGrid); Umua=Umu; double volume=Ls; diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index 73241276..ef8da662 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -84,7 +84,7 @@ int main (int argc, char ** argv) FermionField chi (FGrid); random(pRNG5,chi); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(pRNG4,Umu); + SU<Nc>::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ diff --git a/tests/core/Test_staggered5DvecF.cc b/tests/core/Test_staggered5DvecF.cc index 2386d054..6893551c 100644 --- a/tests/core/Test_staggered5DvecF.cc +++ b/tests/core/Test_staggered5DvecF.cc @@ -83,7 +83,7 @@ int main (int argc, char ** argv) FermionField chi (FGrid); random(pRNG5,chi); LatticeGaugeFieldF Umu(UGrid); - SU3::HotConfiguration(pRNG4,Umu); + SU<Nc>::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ diff --git a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc index 9fe35a54..f41d723d 100644 --- a/tests/core/Test_staggered_naive.cc +++ b/tests/core/Test_staggered_naive.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) FermionField err(&Grid); tmp=Zero(); FermionField phi (&Grid); random(pRNG,phi); FermionField chi (&Grid); random(pRNG,chi); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); diff --git a/tests/core/Test_unary.cc b/tests/core/Test_unary.cc new file mode 100644 index 00000000..2ad6ba7b --- /dev/null +++ b/tests/core/Test_unary.cc @@ -0,0 +1,106 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> +Author: Peter Boyle <paboyle@ph.ed.ac.uk> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include <Grid/Grid.h> + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector<int> latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector<int> sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU<Nc>::HotConfiguration(pRNG,Umu); + + U = PeekIndex<LorentzIndex>(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<<Def1<<std::endl;
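Both new tests also build a single-precision grid (gridF) and matrix (UF) alongside the double-precision objects, which is the pairing precisionChange needs. A sketch of the round trip this enables, as a fragment under the same setup (grid, gridF and U as declared in the tests; the residual is the single-precision truncation error):

// Fragment: double -> single -> double round trip through precisionChange.
LatticeColourMatrixF UF(gridF);
precisionChange(UF, U);   // truncate to single
LatticeColourMatrixD U2(grid);
precisionChange(U2, UF);  // promote back to double
U2 = U2 - U;
std::cout << " precision round-trip defect " << norm2(U2) << std::endl;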
SU<Nc>::HotConfiguration(pRNG, Umu); std::vector<LatticeColourMatrix> U(4, &Grid); double volume = 1; diff --git a/tests/core/Test_wilson_even_odd.cc b/tests/core/Test_wilson_even_odd.cc index dc49cf81..e7733a79 100644 --- a/tests/core/Test_wilson_even_odd.cc +++ b/tests/core/Test_wilson_even_odd.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero(); LatticeGaugeField Umu(&Grid); - SU3::HotConfiguration(pRNG,Umu); + SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); double volume=1; diff --git a/tests/core/Test_wilson_twisted_mass_even_odd.cc b/tests/core/Test_wilson_twisted_mass_even_odd.cc index ba80fd0e..e0f73456 100644 --- a/tests/core/Test_wilson_twisted_mass_even_odd.cc +++ b/tests/core/Test_wilson_twisted_mass_even_odd.cc @@ -71,7 +71,7 @@ int main (int argc, char ** argv) LatticeFermion ref(&Grid); ref=Zero(); LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); double volume=1; diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 5a9c696f..5418a8af 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); LatticeGaugeFieldF UmuF(UGridF); - SU3::HotConfiguration(RNG4,Umu); + SU<Nc>::HotConfiguration(RNG4,Umu); precisionChange(UmuF,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc index e91b3070..b2f691d7 100644 --- a/tests/debug/Test_cayley_coarsen_support.cc +++ b/tests/debug/Test_cayley_coarsen_support.cc @@ -77,7 +77,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); LatticeFermion err(FGrid); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); #if 0 std::vector<LatticeColourMatrix> U(4,UGrid); diff --git a/tests/debug/Test_cayley_even_odd.cc b/tests/debug/Test_cayley_even_odd.cc index 433f0722..5e800b26 100644 --- a/tests/debug/Test_cayley_even_odd.cc +++ b/tests/debug/Test_cayley_even_odd.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); RealD mass=0.1; diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc index 82f388ab..416017e5 100644 --- a/tests/debug/Test_cayley_ldop_cr.cc +++ b/tests/debug/Test_cayley_ldop_cr.cc @@ -71,9 +71,9 @@ int main (int argc, char ** argv) std::string file("./ckpoint_lat.400"); NerscIO::readConfiguration(Umu,header,file); - // SU3::ColdConfiguration(RNG4,Umu); - // SU3::TepidConfiguration(RNG4,Umu); - // SU3::HotConfiguration(RNG4,Umu); + // SU<Nc>::ColdConfiguration(RNG4,Umu); + // SU<Nc>::TepidConfiguration(RNG4,Umu); + // SU<Nc>::HotConfiguration(RNG4,Umu); // Umu=Zero(); RealD mass=0.1; diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2ad605b8..2e56fa81 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -108,8 +108,8 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU3::ColdConfiguration(Umu); - // SU3::HotConfiguration(RNG4,Umu); + SU<Nc>::ColdConfiguration(Umu); + // SU<Nc>::HotConfiguration(RNG4,Umu); RealD mass=0.3; RealD M5 =1.0; diff --git a/tests/debug/Test_heatbath_dwf_eofa.cc b/tests/debug/Test_heatbath_dwf_eofa.cc index 1e64a568..9d453a96 100644 --- a/tests/debug/Test_heatbath_dwf_eofa.cc +++ b/tests/debug/Test_heatbath_dwf_eofa.cc @@ -73,7 +73,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5); DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5);
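Several hunks here toggle between hot and cold starts: HotConfiguration draws a random SU(Nc) element per link, while ColdConfiguration sets every link to the identity (and so needs no RNG, which is why some calls above drop the RNG argument). A sketch telling the two apart by the average plaquette, which is exactly 1 on a cold configuration (fragment under the test setup above; WilsonLoops and PeriodicGimplR are Grid helpers):

// Fragment: hot vs cold starts, distinguished by the average plaquette.
LatticeGaugeField Uhot(UGrid), Ucold(UGrid);
SU<Nc>::HotConfiguration(RNG4, Uhot);  // random group elements
SU<Nc>::ColdConfiguration(Ucold);      // unit links
std::cout << "hot plaquette  " << WilsonLoops<PeriodicGimplR>::avgPlaquette(Uhot)  << std::endl;
std::cout << "cold plaquette " << WilsonLoops<PeriodicGimplR>::avgPlaquette(Ucold) << std::endl; // 1.0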
diff --git a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc index cc118d1d..22cc1e90 100644 --- a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc @@ -77,7 +77,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); // GparityDomainWallFermionR::ImplParams params; FermionAction::ImplParams params; diff --git a/tests/debug/Test_heatbath_mobius_eofa.cc b/tests/debug/Test_heatbath_mobius_eofa.cc index 95ab935e..4cf4bf53 100644 --- a/tests/debug/Test_heatbath_mobius_eofa.cc +++ b/tests/debug/Test_heatbath_mobius_eofa.cc @@ -75,7 +75,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c); MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c); diff --git a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc index 7ed3a308..2fcb4b9f 100644 --- a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc @@ -79,7 +79,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); FermionAction::ImplParams params; FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c, params); diff --git a/tests/debug/Test_reweight_dwf_eofa.cc b/tests/debug/Test_reweight_dwf_eofa.cc index 728fbf78..a150b18f 100644 --- a/tests/debug/Test_reweight_dwf_eofa.cc +++ b/tests/debug/Test_reweight_dwf_eofa.cc @@ -102,7 +102,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5); diff --git a/tests/debug/Test_reweight_dwf_eofa_gparity.cc b/tests/debug/Test_reweight_dwf_eofa_gparity.cc index fcc01b8d..df2d95a0 100644 --- a/tests/debug/Test_reweight_dwf_eofa_gparity.cc +++ b/tests/debug/Test_reweight_dwf_eofa_gparity.cc @@ -104,7 +104,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators GparityDomainWallFermionR::ImplParams params; diff --git a/tests/debug/Test_reweight_mobius_eofa.cc b/tests/debug/Test_reweight_mobius_eofa.cc index c5e46bcf..88ecab7d 100644 --- a/tests/debug/Test_reweight_mobius_eofa.cc +++ b/tests/debug/Test_reweight_mobius_eofa.cc @@ -104,7 +104,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c); diff --git a/tests/debug/Test_reweight_mobius_eofa_gparity.cc b/tests/debug/Test_reweight_mobius_eofa_gparity.cc index bfc7543a..31708265 100644 --- a/tests/debug/Test_reweight_mobius_eofa_gparity.cc +++ b/tests/debug/Test_reweight_mobius_eofa_gparity.cc @@ -106,7 +106,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators GparityDomainWallFermionR::ImplParams params; diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index cb30faad..dc9eedce 100644 ---
a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -59,7 +59,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -93,7 +93,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc index 81a1b8c4..e7d17347 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -94,7 +94,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 0b0ba346..80d36934 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -72,7 +72,7 @@ int main (int argc, char** argv) LatticeFermion MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -105,7 +105,7 @@ int main (int argc, char** argv) for(int mu=0; mu<Nd; mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom, mommu, mu); diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index b39fdd14..28133cc6 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -63,8 +63,8 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); - // SU3::ColdConfiguration(pRNG,U); + SU<Nc>::HotConfiguration(RNG4,U); + // SU<Nc>::ColdConfiguration(pRNG,U); //////////////////////////////////// // Unmodified matrix element @@ -112,7 +112,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg Hmom -= real(sum(trace(mommu*mommu))); diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc index 58258a5e..7e480e7a 100644 --- a/tests/forces/Test_dwf_gpforce_eofa.cc +++ b/tests/forces/Test_dwf_gpforce_eofa.cc @@ -75,7 +75,7 @@ int main (int argc, char** argv) FermionField MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -109,7 +109,7 @@ int main (int argc, char** argv) for(int mu=0; mu<Nd; mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom, mommu, mu); diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc index 21f0b9d0..bc2b5b26 100644 --- a/tests/forces/Test_gp_plaq_force.cc +++ b/tests/forces/Test_gp_plaq_force.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU<Nc>::HotConfiguration(pRNG,U); double beta = 1.0; ConjugateWilsonGaugeActionR Action(beta); @@ -80,7 +80,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index bb4ea6de..98ebb2fa 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU<Nc>::HotConfiguration(pRNG,U); double beta = 1.0; double c1 = 0.331; @@ -82,7 +82,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index bdc332d9..d6744080 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -63,7 +63,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -100,7 +100,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index 1c85a5d9..d731f27a 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -94,7 +94,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_laplacian_force.cc b/tests/forces/Test_laplacian_force.cc index 639378dc..18508860 100644 --- a/tests/forces/Test_laplacian_force.cc +++ b/tests/forces/Test_laplacian_force.cc @@ -58,7 +58,7 @@ int main (int argc, char ** argv) PokeIndex<LorentzIndex>(P, P_mu, mu); } - SU3::HotConfiguration(pRNG,U); + SU<Nc>::HotConfiguration(pRNG,U); ConjugateGradient<LatticeGaugeField> CG(1.0e-8, 10000); @@ -95,7 +95,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Update the U " << std::endl; for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); auto Umu = PeekIndex<LorentzIndex>(U, mu); PokeIndex<LorentzIndex>(mom,mommu,mu); Umu = expMat(mommu, dt, 12) * Umu; diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index 11e69652..ba7bc363 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -96,7 +96,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index f85501fa..28523e9c 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -72,7 +72,7 @@ int main (int argc, char** argv) LatticeFermion MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U);
//////////////////////////////////// // Unmodified matrix element @@ -107,7 +107,7 @@ int main (int argc, char** argv) for(int mu=0; mu<Nd; mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom, mommu, mu); diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 68163e63..9c80b2aa 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -76,7 +76,7 @@ int main (int argc, char** argv) FermionField MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -112,7 +112,7 @@ int main (int argc, char** argv) for(int mu=0; mu<Nd; mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom, mommu, mu); autoView( U_v , U, CpuRead); diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc index 17dce530..33f7b5fd 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -62,7 +62,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -96,7 +96,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc index ed72f2c0..c9326f8d 100644 --- a/tests/forces/Test_rect_force.cc +++ b/tests/forces/Test_rect_force.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU<Nc>::HotConfiguration(pRNG,U); double beta = 1.0; double c1 = -0.331; @@ -82,7 +82,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index c8b3a7f4..b7bf1268 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); //SU2::HotConfiguration(pRNG,U); - SU3::ColdConfiguration(pRNG,U); + SU<Nc>::ColdConfiguration(pRNG,U); //////////////////////////////////// // Unmodified matrix element @@ -98,7 +98,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); Hmom -= real(sum(trace(mommu*mommu)));
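Every force test generates its trial momentum the same way: for each direction, a Gaussian, traceless, anti-hermitian matrix in the fundamental Lie algebra is drawn, poked into the momentum field, and the links are stepped through the exponential map. A sketch of one such step, as a fragment under the force tests' setup (U, pRNG, UGrid as above; dt is an arbitrary step and 12 is the Taylor order that expMat is given in Test_laplacian_force.cc):

// Fragment: Gaussian su(Nc) momentum and an expMat link update.
LatticeGaugeField mom(UGrid);
LatticeColourMatrix mommu(UGrid);
RealD dt = 0.01;
for (int mu = 0; mu < Nd; mu++) {
  SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // traceless antihermitian
  PokeIndex<LorentzIndex>(mom, mommu, mu);
  auto Umu = PeekIndex<LorentzIndex>(U, mu);
  Umu = expMat(mommu, dt, 12) * Umu; // 12-term Taylor exponential
  PokeIndex<LorentzIndex>(U, Umu, mu);
}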
diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index f26f0ac9..6a28e4e2 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -62,8 +62,8 @@ int main(int argc, char **argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG, U); - //SU3::ColdConfiguration(pRNG, U);// Clover term Zero() + SU<Nc>::HotConfiguration(pRNG, U); + //SU<Nc>::ColdConfiguration(pRNG, U);// Clover term Zero() //////////////////////////////////// // Unmodified matrix element @@ -101,7 +101,7 @@ int main(int argc, char **argv) for (int mu = 0; mu < Nd; mu++) { // Traceless antihermitian momentum; gaussian in lie alg - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); Hmom -= real(sum(trace(mommu * mommu))); PokeIndex<LorentzIndex>(mom, mommu, mu); diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index e24ae601..89673bc7 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -59,7 +59,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU<Nc>::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -109,7 +109,7 @@ int main (int argc, char ** argv) for(int mu=0;mu<Nd;mu++){ - SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); + SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex<LorentzIndex>(mom,mommu,mu); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc index d9249e0d..3766e069 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc @@ -293,7 +293,7 @@ int main (int argc, char ** argv) { { std::vector<int> seeds4({1,2,3,4}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); } std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 12283921..00d29ec0 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc +++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); std::vector<LatticeColourMatrix> U(4,UGrid); for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4, Umu); /* std::vector<LatticeColourMatrix> U(4, UGrid); diff --git a/tests/qdpxx/Test_qdpxx_baryon.cc b/tests/qdpxx/Test_qdpxx_baryon.cc index a1d8f738..d8225f82 100644 --- a/tests/qdpxx/Test_qdpxx_baryon.cc +++ b/tests/qdpxx/Test_qdpxx_baryon.cc @@ -280,7 +280,7 @@ void make_gauge(GaugeField &Umu, Grid::LatticePropagator &q1,Grid::LatticePropag Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4, Umu); + Grid::SU<Grid::Nc>::HotConfiguration(RNG4, Umu); // Propagator Grid::gaussian(RNG4, q1); diff --git a/tests/qdpxx/Test_qdpxx_loops_staples.cc b/tests/qdpxx/Test_qdpxx_loops_staples.cc index bbb41f4e..33057eeb 100644 --- a/tests/qdpxx/Test_qdpxx_loops_staples.cc +++ b/tests/qdpxx/Test_qdpxx_loops_staples.cc @@ -277,7 +277,7 @@ double calc_grid_p(Grid::LatticeGaugeField & Umu) Grid::GridCartesian * UGrid = (Grid::GridCartesian *) Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU<Grid::Nc>::HotConfiguration(RNG4,Umu); Grid::LatticeColourMatrix tmp(UGrid); tmp = Grid::zero; diff --git a/tests/qdpxx/Test_qdpxx_munprec.cc b/tests/qdpxx/Test_qdpxx_munprec.cc index fbc1ec82..82874546 100644 --- a/tests/qdpxx/Test_qdpxx_munprec.cc +++ b/tests/qdpxx/Test_qdpxx_munprec.cc @@ -502,7 +502,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF Grid::gaussian(RNG5,src); Grid::gaussian(RNG5,res); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU<Grid::Nc>::HotConfiguration(RNG4,Umu); /* Grid::LatticeColourMatrix U(UGrid); diff --git a/tests/qdpxx/Test_qdpxx_stag.cc b/tests/qdpxx/Test_qdpxx_stag.cc index f283d5a9..8f81fa99 100644 --- a/tests/qdpxx/Test_qdpxx_stag.cc +++
b/tests/qdpxx/Test_qdpxx_stag.cc @@ -333,7 +333,7 @@ void make_gauge(GaugeField & Umu,FermionField &src) Grid::GridCartesian * UGrid = (Grid::GridCartesian *) Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU<Grid::Nc>::HotConfiguration(RNG4,Umu); Grid::gaussian(RNG4,src); } diff --git a/tests/qdpxx/Test_qdpxx_wilson.cc b/tests/qdpxx/Test_qdpxx_wilson.cc index fdf59982..8ce28dca 100644 --- a/tests/qdpxx/Test_qdpxx_wilson.cc +++ b/tests/qdpxx/Test_qdpxx_wilson.cc @@ -348,7 +348,7 @@ void make_gauge(GaugeField &Umu, FermionField &src) Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu._grid; Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4, Umu); + Grid::SU<Grid::Nc>::HotConfiguration(RNG4, Umu); // Fermion field Grid::gaussian(RNG4, src); diff --git a/tests/smearing/Test_smearing.cc b/tests/smearing/Test_smearing.cc index c1c7c457..adab1c6e 100644 --- a/tests/smearing/Test_smearing.cc +++ b/tests/smearing/Test_smearing.cc @@ -47,8 +47,8 @@ int main (int argc, char ** argv) RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); LatticeGaugeField Umu(&Grid); - // SU3::HotConfiguration(pRNG,Umu); - SU3::ColdConfiguration(Umu); + // SU<Nc>::HotConfiguration(pRNG,Umu); + SU<Nc>::ColdConfiguration(Umu); std::vector<LatticeColourMatrix> U(4,&Grid); for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); RealD mass=0.1; diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index cb53894f..debb736a 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/solver/Test_dwf_cg_schur.cc b/tests/solver/Test_dwf_cg_schur.cc index 6216c366..6541e73d 100644 --- a/tests/solver/Test_dwf_cg_schur.cc +++ b/tests/solver/Test_dwf_cg_schur.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); diff --git a/tests/solver/Test_dwf_fpgcr.cc b/tests/solver/Test_dwf_fpgcr.cc index 156f678a..42cc8de1 100644 --- a/tests/solver/Test_dwf_fpgcr.cc +++ b/tests/solver/Test_dwf_fpgcr.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) LatticeFermion result(FGrid); result=Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU<Nc>::HotConfiguration(RNG4,Umu); ConjugateResidual<LatticeFermion> CR(1.0e-6,10000); diff --git a/tests/solver/Test_dwf_mrhs_cg.cc b/tests/solver/Test_dwf_mrhs_cg.cc index 982a8247..b912ba4f 100644 --- a/tests/solver/Test_dwf_mrhs_cg.cc +++ b/tests/solver/Test_dwf_mrhs_cg.cc @@ -93,7 +93,7 @@ int main (int argc, char ** argv) for(int s=0;s<nrhs;s++){ SU<Nc>::HotConfiguration(pRNG,Umu); /////////////////////////////////////////////////////////////// // Bounce these fields to disk diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index 8ace9b43..d0a32460 100644 ---
a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -136,11 +136,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Intialising 4D RNG "<<std::endl; - SU3::HotConfiguration(pRNG,Umu); + SU<Nc>::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl; - SU3::ColdConfiguration(Umu); + SU<Nc>::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl; SU<Nc>::HotConfiguration(pRNG,Umu); ///////////////// // MPI only sends diff --git a/tests/solver/Test_dwf_qmr_unprec.cc b/tests/solver/Test_dwf_qmr_unprec.cc index ba44ee93..370e7409 100644 --- a/tests/solver/Test_dwf_qmr_unprec.cc +++ b/tests/solver/Test_dwf_qmr_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,UGrid); diff --git a/tests/solver/Test_mobius_bcg.cc b/tests/solver/Test_mobius_bcg.cc index 8b34a6a5..8092d61c 100644 --- a/tests/solver/Test_mobius_bcg.cc +++ b/tests/solver/Test_mobius_bcg.cc @@ -128,7 +128,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Intialising 4D RNG "<<std::endl; - SU3::HotConfiguration(pRNG,Umu); + SU<Nc>::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl; SU<Nc>::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl; SU<Nc>::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl; SU<Nc>::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl; SU<Nc>::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl; SU<Nc>::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl; SU<Nc>::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl; SU<Nc>::HotConfiguration(pRNG,Umu); ///////////////// // MPI only sends diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 2499fc8a..c5306e85 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -87,7 +87,7 @@ int main (int argc, char ** argv) FermionField result_o(FrbGrid); result_o=Zero(); RealD nrm = norm2(src); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); RealD mass=0.003; RealD c1=9.0/8.0; diff --git a/tests/solver/Test_staggered_cagmres_unprec.cc b/tests/solver/Test_staggered_cagmres_unprec.cc index 8121c90d..1b7a2f56 100644 --- a/tests/solver/Test_staggered_cagmres_unprec.cc +++ b/tests/solver/Test_staggered_cagmres_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ std::vector<int> seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); FermionField src(&Grid); random(pRNG,src); FermionField result(&Grid); result=Zero(); diff --git a/tests/solver/Test_staggered_cg_unprec.cc
b/tests/solver/Test_staggered_cg_unprec.cc index 9625a9c8..e023b910 100644 --- a/tests/solver/Test_staggered_cg_unprec.cc +++ b/tests/solver/Test_staggered_cg_unprec.cc @@ -65,7 +65,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); diff --git a/tests/solver/Test_wilson_cg_schur.cc b/tests/solver/Test_wilson_cg_schur.cc index 23383032..97482131 100644 --- a/tests/solver/Test_wilson_cg_schur.cc +++ b/tests/solver/Test_wilson_cg_schur.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) std::vector<int> seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=Zero(); diff --git a/tests/solver/Test_wilson_cg_unprec.cc b/tests/solver/Test_wilson_cg_unprec.cc index f3335d45..07f6ba7b 100644 --- a/tests/solver/Test_wilson_cg_unprec.cc +++ b/tests/solver/Test_wilson_cg_unprec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); diff --git a/tests/solver/Test_wilson_fcagmres_prec.cc b/tests/solver/Test_wilson_fcagmres_prec.cc index b821a25f..d2a1acf4 100644 --- a/tests/solver/Test_wilson_fcagmres_prec.cc +++ b/tests/solver/Test_wilson_fcagmres_prec.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(fPRNG, Umu); // clang-format on RealD mass = -0.25; diff --git a/tests/solver/Test_wilson_mg_mp.cc b/tests/solver/Test_wilson_mg_mp.cc index e631cd15..89bbbf74 100644 --- a/tests/solver/Test_wilson_mg_mp.cc +++ b/tests/solver/Test_wilson_mg_mp.cc @@ -52,7 +52,7 @@ int main(int argc, char **argv) { LatticeFermionD src_d(FGrid_d); gaussian(fPRNG, src_d); LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = Zero(); LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU<Nc>::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on
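All of these solver tests share one harness: seed a parallel RNG, draw a random source, build a hot gauge configuration, wrap the Dirac operator, and hand it to a Krylov solver. A sketch of the unpreconditioned Wilson CG variant, following Test_wilson_cg_unprec.cc above (fragment under that test's setup; the mass value is arbitrary, Grid and RBGrid are its full and red-black grids):

// Fragment: the common solver-test harness, CG on MdagM.
LatticeFermion src(&Grid); random(pRNG, src);
LatticeFermion result(&Grid); result = Zero();
LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG, Umu);

RealD mass = 0.5;
WilsonFermionR Dw(Umu, Grid, RBGrid, mass);
MdagMLinearOperator<WilsonFermionR, LatticeFermion> HermOp(Dw);
ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
CG(HermOp, src, result); // solves (Mdag M) x = src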
diff --git a/tests/solver/Test_wilson_mr_unprec.cc b/tests/solver/Test_wilson_mr_unprec.cc index 1cc1f418..fef83794 100644 --- a/tests/solver/Test_wilson_mr_unprec.cc +++ b/tests/solver/Test_wilson_mr_unprec.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4,Umu); std::vector<LatticeColourMatrix> U(4,Grid); diff --git a/tests/solver/Test_wilsonclover_bicgstab_prec.cc b/tests/solver/Test_wilsonclover_bicgstab_prec.cc index c1905400..b382b1bb 100644 --- a/tests/solver/Test_wilsonclover_bicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_bicgstab_prec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); std::vector<LatticeColourMatrix> U(4,&Grid); diff --git a/tests/solver/Test_wilsonclover_cg_schur.cc b/tests/solver/Test_wilsonclover_cg_schur.cc index eaae24b3..567a8283 100644 --- a/tests/solver/Test_wilsonclover_cg_schur.cc +++ b/tests/solver/Test_wilsonclover_cg_schur.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) std::vector<int> seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=Zero(); diff --git a/tests/solver/Test_wilsonclover_cg_unprec.cc b/tests/solver/Test_wilsonclover_cg_unprec.cc index 49c52cdf..755d80e1 100644 --- a/tests/solver/Test_wilsonclover_cg_unprec.cc +++ b/tests/solver/Test_wilsonclover_cg_unprec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(fPRNG, Umu); // clang-format on RealD mass = -0.25; diff --git a/tests/solver/Test_wilsonclover_mg_lime.cc b/tests/solver/Test_wilsonclover_mg_lime.cc index bd2990d4..0a29c034 100644 --- a/tests/solver/Test_wilsonclover_mg_lime.cc +++ b/tests/solver/Test_wilsonclover_mg_lime.cc @@ -75,7 +75,7 @@ int main(int argc, char **argv) { NerscIO::readConfiguration(Umu_d,header,file); } #endif - // SU3::HotConfiguration(fPRNG, Umu_d); + // SU<Nc>::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mg_mp.cc b/tests/solver/Test_wilsonclover_mg_mp.cc index b5178d2e..2efe5f08 100644 --- a/tests/solver/Test_wilsonclover_mg_mp.cc +++ b/tests/solver/Test_wilsonclover_mg_mp.cc @@ -52,7 +52,7 @@ int main(int argc, char **argv) {
LatticeFermionD src_d(FGrid_d); gaussian(fPRNG, src_d); LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = zero; LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = zero; - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU<Nc>::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc b/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc index 0af83f8b..d47dac2a 100644 --- a/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) // clang-format off LatticeFermionD src(FGrid_d); gaussian(fPRNG, src); LatticeFermionD result(FGrid_d); result = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU<Nc>::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mixedcg_prec.cc b/tests/solver/Test_wilsonclover_mixedcg_prec.cc index 8af9036f..95590004 100644 --- a/tests/solver/Test_wilsonclover_mixedcg_prec.cc +++ b/tests/solver/Test_wilsonclover_mixedcg_prec.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) // clang-format off LatticeFermionD src(FGrid_d); gaussian(fPRNG, src); LatticeFermionD result(FGrid_d); result = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU<Nc>::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mr_unprec.cc b/tests/solver/Test_wilsonclover_mr_unprec.cc index c7b5ecfe..ab49ec1f 100644 --- a/tests/solver/Test_wilsonclover_mr_unprec.cc +++ b/tests/solver/Test_wilsonclover_mr_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu<Nd;mu++){ SU<Nc>::HotConfiguration(RNG4, Umu); } std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() diff --git a/tests/solver/Test_zmobius_cg_prec.cc b/tests/solver/Test_zmobius_cg_prec.cc index fb57cff1..6b007afc 100644 --- a/tests/solver/Test_zmobius_cg_prec.cc +++ b/tests/solver/Test_zmobius_cg_prec.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU<Nc>::HotConfiguration(RNG4, Umu); std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl;
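The mixed-precision tests that close this section follow the pattern visible in their hunks: the gauge field lives in double precision, precisionChange clones it to single, and a defect-correction solver iterates in single while accumulating the true residual in double. A sketch with Grid's MixedPrecisionConjugateGradient, assuming operators and grids named in the style of those tests (Dw_d/Dw_f, FGrid_d/FGrid_f, FrbGrid_d/FrbGrid_f are conventions here, not exact lines from them; 50 caps the outer restarts):

// Fragment: double/single defect-correction CG, as in the *_mixedcg tests.
LatticeGaugeFieldF Umu_f(FGrid_f);
precisionChange(Umu_f, Umu_d); // single-precision copy of the gauge field

WilsonFermionD Dw_d(Umu_d, *FGrid_d, *FrbGrid_d, mass);
WilsonFermionF Dw_f(Umu_f, *FGrid_f, *FrbGrid_f, mass);
MdagMLinearOperator<WilsonFermionD, LatticeFermionD> HermOp_d(Dw_d);
MdagMLinearOperator<WilsonFermionF, LatticeFermionF> HermOp_f(Dw_f);

MixedPrecisionConjugateGradient<LatticeFermionD, LatticeFermionF>
    mCG(1.0e-8, 10000, 50, FGrid_f, HermOp_f, HermOp_d);
mCG(src_d, result_d); // inner iterations in single, convergence test in double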