Aurora MPI standalone benchmark and options that work well

Aurora files
Bug fix
2025-11-02 21:14:32 +00:00 · 2024-02-06 16:28:40 +00:00 · 2023-12-21 23:20:17 +00:00 · 2023-12-11 20:57:02 -05:00 · 2023-12-08 16:17:56 -05:00 · 2023-12-08 12:14:00 -05:00
142 changed files with 8577 additions and 1887 deletions
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -66,6 +66,10 @@ if BUILD_FERMION_REPS
  extra_sources+=$(ADJ_FERMION_FILES)
  extra_sources+=$(TWOIND_FERMION_FILES)
 endif
+if BUILD_SP
+    extra_sources+=$(SP_FERMION_FILES)
+    extra_sources+=$(SP_TWOIND_FERMION_FILES)
+endif

 lib_LIBRARIES = libGrid.a

--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -604,8 +604,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
+    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-extern Vector<std::pair<int,int> > Cshift_table; 
+extern std::vector<std::pair<int,int> > Cshift_table; 
+extern commVector<std::pair<int,int> > Cshift_table_device; 

+// Returns the table pointer to hand to the copy loops: with ACCELERATOR_CSHIFT the host
+// Cshift_table is copied into Cshift_table_device on every call; otherwise the host table itself.
+inline std::pair<int,int> *MapCshiftTable(void)
+{
+#ifdef ACCELERATOR_CSHIFT
+  // GPU version: resize the device table if needed, then refresh it from the host table
+  uint64_t sz=Cshift_table.size();
+  if (Cshift_table_device.size()!=sz ) Cshift_table_device.resize(sz);
+  // data() instead of &v[0]: operator[] on an empty vector is undefined behaviour
+  acceleratorCopyToDevice((void *)Cshift_table.data(),
+                          (void *)Cshift_table_device.data(),
+                          sizeof(std::pair<int,int>)*sz);
+  return Cshift_table_device.data();
+#else
+  // CPU version: identity map, the host table is used directly
+  return Cshift_table.data();
+#endif
+}
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
@@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  }
  {
    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT    
+    auto table = MapCshiftTable();
+#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
@@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  
  {
    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
@@ -297,30 +316,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }

-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-
-template <typename T>
-T iDivUp(T a, T b) // Round a / b to nearest higher integer value
-{ return (a % b != 0) ? (a / b + 1) : (a / b); }
-
-template <typename T>
-__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
-{
-    int idx = blockIdx.x*blockDim.x + threadIdx.x;
-    if (idx >= e1*e2) return;
-
-    int n, b, o;
-
-    n = idx / e2;
-    b = idx % e2;
-    o = n*stride + b;
-
-    vector[2*idx + 0] = lo + o;
-    vector[2*idx + 1] = ro + o;
-}
-
-#endif
-
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -345,20 +340,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;

  if(cbmask == 0x3 ){
-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-    ent = e1*e2;
-    dim3 blockSize(acceleratorThreads());
-    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
-    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
-    accelerator_barrier();
-#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
-#endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -372,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }

  {
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
@@ -409,19 +396,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;

  if ( cbmask == 0x3 ) {
-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-    ent = e1*e2;
-    dim3 blockSize(acceleratorThreads());
-    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
-    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
-    accelerator_barrier();
-#else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
-#endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
@@ -432,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }

  {
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);

-
+  RealD t1,t0;
+  t0=usecond();
  if ( !comm_dim ) {
    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
+  t1=usecond();
+  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }

@@ -127,16 +130,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-
+  RealD tcopy=0.0;
+  RealD tgather=0.0;
+  RealD tscatter=0.0;
+  RealD tcomms=0.0;
+  uint64_t xbytes=0;
  for(int x=0;x<rd;x++){       

    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    
    if (comm_proc==0) {
-
+      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
+      tcopy+=usecond();
    } else {

      int words = buffer_size;
@@ -144,26 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

      int bytes = words * sizeof(vobj);

+      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
+      tgather+=usecond();

      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-
-      grid->Barrier();
+      
+      tcomms-=usecond();
+      //      grid->Barrier();

      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+      xbytes+=bytes;
+      //      grid->Barrier();
+      tcomms+=usecond();

-      grid->Barrier();
-
+      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
+      tscatter+=usecond();
    }
  }
+  /*
+  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  */
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -190,6 +210,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(shift>=0);
  assert(shift<fd);

+  RealD tcopy=0.0;
+  RealD tgather=0.0;
+  RealD tscatter=0.0;
+  RealD tcomms=0.0;
+  uint64_t xbytes=0;
+  
  int permute_type=grid->PermuteType(dimension);

  ///////////////////////////////////////////////
@@ -227,7 +253,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      pointers[i] = &send_buf_extract[i][0];
    }
    int sx   = (x+sshift)%rd;
+    tgather-=usecond();
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
+    tgather+=usecond();

    for(int i=0;i<Nsimd;i++){
      
@@ -252,7 +280,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

-	grid->Barrier();
+	tcomms-=usecond();
+	//	grid->Barrier();

 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
@@ -262,7 +291,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);

-	grid->Barrier();
+	xbytes+=bytes;
+	//	grid->Barrier();
+	tcomms+=usecond();

 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -270,9 +301,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      }

    }
+    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
+    tscatter+=usecond();
  }
-
+  /*
+  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  */
 }
 #else 
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -292,6 +331,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
+  RealD tcopy=0.0;
+  RealD tgather=0.0;
+  RealD tscatter=0.0;
+  RealD tcomms=0.0;
+  uint64_t xbytes=0;
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
@@ -315,7 +359,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    
    if (comm_proc==0) {

+      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
+      tcopy+=usecond();

    } else {

@@ -324,7 +370,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

      int bytes = words * sizeof(vobj);

+      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
+      tgather+=usecond();

      //      int rank           = grid->_processor;
      int recv_from_rank;
@@ -332,7 +380,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);


-      grid->Barrier();
+      tcomms-=usecond();
+      //      grid->Barrier();

      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
@@ -340,13 +389,24 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+      xbytes+=bytes;
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);

-      grid->Barrier();
+      //      grid->Barrier();
+      tcomms+=usecond();

+      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
+      tscatter+=usecond();
    }
  }
+  /*
+  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  */
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -372,6 +432,11 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
+  RealD tcopy=0.0;
+  RealD tgather=0.0;
+  RealD tscatter=0.0;
+  RealD tcomms=0.0;
+  uint64_t xbytes=0;

  int permute_type=grid->PermuteType(dimension);

@@ -414,8 +479,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
+    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
+    tgather+=usecond();

    for(int i=0;i<Nsimd;i++){
      
@@ -440,7 +507,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

-	grid->Barrier();
+	tcomms-=usecond();
+	//	grid->Barrier();

 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
@@ -449,17 +517,28 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
+	xbytes+=bytes;

-	grid->Barrier();
+	//	grid->Barrier();
+	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }

    }
+    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-  }
+    tscatter+=usecond();

+  }
+  /*
+  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
+  */
 }
 #endif
 NAMESPACE_END(Grid); 
--- a/Grid/cshift/Cshift_table.cc
+++ b/Grid/cshift/Cshift_table.cc
@@ -1,4 +1,5 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
-Vector<std::pair<int,int> > Cshift_table; 
+std::vector<std::pair<int,int> > Cshift_table; 
+commVector<std::pair<int,int> > Cshift_table_device; 
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
+#include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -345,7 +345,9 @@ GridUnopClass(UnaryNot, Not(a));
 GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnarySpTa, SpTa(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
 GridUnopClass(UnaryTimesI, timesI(a));
 GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
 GridUnopClass(UnaryAbs, abs(a));
@@ -456,7 +458,9 @@ GRID_DEF_UNOP(operator!, UnaryNot);
 GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(SpTa, UnarySpTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
 GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
 GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -270,5 +270,42 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L
    return axpby_norm_fast(ret,a,b,x,y);
 }

+/// Trace product of two lattices: for every ss, ret[ss] = traceProduct(rhs_1[ss],rhs_2[ss]); result is built on rhs_1.Grid()
+template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
+  -> Lattice<decltype(trace(obj()))>
+{
+  typedef decltype(trace(obj())) robj; // element type of the result: whatever trace() yields for obj
+  Lattice<robj> ret_i(rhs_1.Grid());
+  autoView( rhs1 , rhs_1, AcceleratorRead);
+  autoView( rhs2 , rhs_2, AcceleratorRead);
+  autoView( ret , ret_i, AcceleratorWrite);
+  ret.Checkerboard() = rhs_1.Checkerboard(); // NOTE(review): assigned on the view 'ret', not on ret_i - confirm it reaches the returned lattice
+  accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
+      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
+  });
+  return ret_i;
+}
+/// Trace product of a lattice with a single object: for every ss, ret[ss] = traceProduct(rhs_1[ss],rhs2)
+template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
+  -> Lattice<decltype(trace(obj1()))>
+{
+  typedef decltype(trace(obj1())) robj; // element type of the result: whatever trace() yields for obj1
+  Lattice<robj> ret_i(rhs_1.Grid());
+  autoView( rhs1 , rhs_1, AcceleratorRead);
+  autoView( ret , ret_i, AcceleratorWrite);
+  ret.Checkerboard() = rhs_1.Checkerboard(); // NOTE(review): assigned on the view 'ret', not on ret_i - confirm it reaches the returned lattice
+  accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
+      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2)); // the same rhs2 is used at every ss
+  });
+  return ret_i;
+}
+template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
+  -> Lattice<decltype(trace(obj1()))>
+{
+  return traceProduct(rhs_1,rhs_2); // object-first form: swaps the arguments and forwards, so the lattice comes first
+}
+
+
+
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  hipGetDevice(&device);
+  auto r=hipGetDevice(&device);
 #endif
  
  Iterator warpSize            = gpu_props[device].warpSize;
--- a/Grid/lattice/Lattice_trace.h
+++ b/Grid/lattice/Lattice_trace.h
@@ -66,6 +66,65 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
  return ret;
 };

+template<int N, class Vec> // Determinant of the NxN matrix at every local site of Umu, computed through Eigen on CPU views
+Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
+{
+  GridBase *grid=Umu.Grid();
+  auto lvol = grid->lSites();
+  Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid); // result lives on the same grid as Umu
+  typedef typename Vec::scalar_type scalar;
+  autoView(Umu_v,Umu,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  thread_for(site,lvol,{
+    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); // per-site scratch matrix
+    Coordinate lcoor;
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    iScalar<iScalar<iMatrix<scalar, N> > > Us;
+    peekLocalSite(Us, Umu_v, lcoor);
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	scalar tmp= Us()()(i,j);
+	ComplexD ztmp(real(tmp),imag(tmp)); // go through ComplexD whatever the precision of 'scalar'
+	EigenU(i,j)=ztmp;
+      }}
+    ComplexD detD  = EigenU.determinant();
+    typename Vec::scalar_type det(detD.real(),detD.imag()); // convert back to the lattice scalar type
+    pokeLocalSite(det,ret_v,lcoor);
+  });
+  return ret;
+}
+// Inverse of the NxN matrix at every local site of Umu, computed through Eigen on CPU views; no invertibility check is made
+template<int N>
+Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
+{
+  GridBase *grid=Umu.Grid();
+  auto lvol = grid->lSites();
+  Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid); // result lives on the same grid as Umu
+  
+  autoView(Umu_v,Umu,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  thread_for(site,lvol,{
+    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); // per-site scratch matrix
+    Coordinate lcoor;
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    iScalar<iScalar<iMatrix<ComplexD, N> > > Us; // site matrix read from Umu
+    iScalar<iScalar<iMatrix<ComplexD, N> > > Ui; // site matrix written to ret
+    peekLocalSite(Us, Umu_v, lcoor);
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	EigenU(i,j) = Us()()(i,j);
+      }}
+    Eigen::MatrixXcd EigenUinv = EigenU.inverse();
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	Ui()()(i,j) = EigenUinv(i,j);
+      }}
+    pokeLocalSite(Ui,ret_v,lcoor);
+  });
+  return ret;
+}
+
+
 NAMESPACE_END(Grid);
 #endif

--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -697,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-
  // the above should guarantee that the operations are local
+  
+#if 1
+
+  size_t nsite = 1;
+  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
+  
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+ 
+  thread_for(idx, nsite, {
+      Coordinate from_coor, to_coor;
+      size_t rem = idx;
+      for(int i=0;i<nd;i++){
+	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
+	from_coor[i] = base_i + FromLowerLeft[i];
+	to_coor[i] = base_i + ToLowerLeft[i];
+      }
+      
+      int foidx = Fg->oIndex(from_coor);
+      int fiidx = Fg->iIndex(from_coor);
+      int toidx = Tg->oIndex(to_coor);
+      int tiidx = Tg->iIndex(to_coor);
+      int* tt = table + 4*idx;
+      tt[0] = foidx;
+      tt[1] = fiidx;
+      tt[2] = toidx;
+      tt[3] = tiidx;
+    });
+  
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(from_v,From,AcceleratorRead);
+  autoView(to_v,To,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+
+#else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@@ -738,6 +798,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
    }
  });
+
+#endif
 }


@@ -830,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }


+//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
+//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    }
  }

+#if 1
+  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+  
+  thread_for(idx,nsite,{
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lcoor[orthog] = slice_lo;
+    hcoor[orthog] = slice_hi;
+    size_t rem = idx;
+    for(int mu=0;mu<nl;mu++){
+      if(mu != orthog){
+	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
+	lcoor[mu] = hcoor[mu] = xmu;
+      }
+    }
+    int loidx = lg->oIndex(lcoor);
+    int liidx = lg->iIndex(lcoor);
+    int hoidx = hg->oIndex(hcoor);
+    int hiidx = hg->iIndex(hcoor);
+    int* tt = table + 4*idx;
+    tt[0] = loidx;
+    tt[1] = liidx;
+    tt[2] = hoidx;
+    tt[3] = hiidx;
+    });
+   
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(lowDim_v,lowDim,AcceleratorRead);
+  autoView(higherDim_v,higherDim,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
+      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+#else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@@ -866,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
+#endif
 }


--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -26,14 +26,32 @@ Author: Peter Boyle pboyle@bnl.gov
 /*  END LEGAL */
 #pragma once

+#include<Grid/cshift/Cshift.h>
+
 NAMESPACE_BEGIN(Grid);

+//Abstract interface that lets the user specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
+template<typename vobj>
+struct CshiftImplBase{
+  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0; // returns 'in' shifted by 'shift' along 'dir'
+  virtual ~CshiftImplBase(){} // virtual so implementations can be destroyed through a base reference/pointer
+};
+template<typename vobj>
+struct CshiftImplDefault: public CshiftImplBase<vobj>{ // default implementation: forwards to the free function Grid::Cshift
+  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
+};
+template<typename Gimpl>
+struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{ // gauge-link implementation: forwards to Gimpl::CshiftLink
+  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
+};  
+
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
  int dims;
  int depth;
  std::vector<GridCartesian *> grids;
+
  ~PaddedCell()
  {
    DeleteGrids();
@@ -77,7 +95,7 @@ public:
    }
  };
  template<class vobj>
-  inline Lattice<vobj> Extract(Lattice<vobj> &in)
+  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
    Lattice<vobj> out(unpadded_grid);

@@ -88,19 +106,19 @@ public:
    return out;
  }
  template<class vobj>
-  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
+  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
    Lattice<vobj> tmp = in;
    for(int d=0;d<dims;d++){
-      tmp = Expand(d,tmp); // rvalue && assignment
+      tmp = Expand(d,tmp,cshift); // rvalue && assignment
    }
    return tmp;
  }
  // expand up one dim at a time
  template<class vobj>
-  inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
+  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
@@ -112,20 +130,40 @@ public:
    else       conformable(old_grid,grids[dim-1]);

    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+
+    double tins=0, tshift=0;
+    
    // Middle bit
+    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
+    tins += usecond() - t;
+    
    // High bit
-    shifted = Cshift(in,dim,depth);
+    t = usecond();
+    shifted = cshift.Cshift(in,dim,depth);
+    tshift += usecond() - t;
+
+    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
+    tins += usecond() - t;
+    
    // Low bit
-    shifted = Cshift(in,dim,-depth);
+    t = usecond();
+    shifted = cshift.Cshift(in,dim,-depth);
+    tshift += usecond() - t;
+    
+    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
+    tins += usecond() - t;
+
+    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
+    
    return padded;
  }

--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -129,6 +129,22 @@ public:
  virtual ~Action(){}
 };

+template <class GaugeField >
+class EmptyAction : public Action <GaugeField> // action whose S() is always zero and whose refresh()/deriv() assert if called
+{ // NOTE(review): no access specifier, so these overrides are private in this class - confirm that is intended
+  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions: not supported, asserts
+  virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action: always 0.0
+  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative: not supported, asserts
+
+  ///////////////////////////////
+  // Logging
+  ///////////////////////////////
+  virtual std::string action_name()    { return std::string("Level Force Log"); };
+  virtual std::string LogParameters()  { return std::string("No parameters");};
+};
+
+
+
 NAMESPACE_END(Grid);

 #endif // ACTION_BASE_H
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -126,6 +126,16 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermi
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;

+// Sp(2n)
+typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF;
+typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD;
+
+typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF;
+typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD;
+
+typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF;
+typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD;
+
 // Twisted mass fermion
 typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -261,6 +261,22 @@ typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > W
 typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
 typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double

+// Sp(2n)
+
+typedef WilsonImpl<vComplex,  SpFundamentalRepresentation, CoeffReal > SpWilsonImplR;  // Real.. whichever prec
+typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF;  // Float
+typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD;  // Double
+
+typedef WilsonImpl<vComplex,  SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR;  // Real.. whichever prec
+typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD;  // Double
+
+typedef WilsonImpl<vComplex,  SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR;  // Real.. whichever prec
+typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD;  // Double
+
+typedef WilsonImpl<vComplex,  SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR;  // Real.. whichever prec    // adj = 2indx symmetric for Sp(2N)
+typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF;  // Float     // adj = 2indx symmetric for Sp(2N)
+typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD;  // Double    // adj = 2indx symmetric for Sp(2N)

 NAMESPACE_END(Grid);
-
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonCloverFermionInstantiationSpWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonCloverFermionInstantiationSpWilsonImplD.cc
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonFermionInstantiationSpWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonFermionInstantiationSpWilsonImplD.cc
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonKernelsInstantiationSpWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonKernelsInstantiationSpWilsonImplD.cc
@@ -0,0 +1 @@
+../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonTMFermionInstantiationSpWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/WilsonTMFermionInstantiationSpWilsonImplD.cc
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/impl.h
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplD/impl.h
@@ -0,0 +1 @@
+#define IMPLEMENTATION SpWilsonImplD
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonCloverFermionInstantiationSpWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonCloverFermionInstantiationSpWilsonImplF.cc
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonFermionInstantiationSpWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonFermionInstantiationSpWilsonImplF.cc
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonKernelsInstantiationSpWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonKernelsInstantiationSpWilsonImplF.cc
@@ -0,0 +1 @@
+../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonTMFermionInstantiationSpWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/WilsonTMFermionInstantiationSpWilsonImplF.cc
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/impl.h
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonImplF/impl.h
@@ -0,0 +1 @@
+#define IMPLEMENTATION SpWilsonImplF
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonCloverFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonCloverFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonTMFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/WilsonTMFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/impl.h
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplD/impl.h
@@ -0,0 +1 @@
+#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplD
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonCloverFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonCloverFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonTMFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/WilsonTMFermionInstantiationSpWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/impl.h
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexAntiSymmetricImplF/impl.h
@@ -0,0 +1 @@
+#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplF
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonCloverFermionInstantiationSpWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonCloverFermionInstantiationSpWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonFermionInstantiationSpWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonFermionInstantiationSpWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationSpWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationSpWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonTMFermionInstantiationSpWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/WilsonTMFermionInstantiationSpWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/impl.h
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplD/impl.h
@@ -0,0 +1 @@
+#define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplD
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonCloverFermionInstantiationSpWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonCloverFermionInstantiationSpWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonFermionInstantiationSpWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonFermionInstantiationSpWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationSpWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationSpWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonTMFermionInstantiationSpWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/WilsonTMFermionInstantiationSpWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/impl.h
+++ b/Grid/qcd/action/fermion/instantiation/SpWilsonTwoIndexSymmetricImplF/impl.h
@@ -0,0 +1 @@
+#define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplF
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -10,12 +10,18 @@ WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
 	   WilsonImplD2 \
+	   SpWilsonImplF \
+	   SpWilsonImplD \
 	   WilsonAdjImplF \
 	   WilsonAdjImplD \
 	   WilsonTwoIndexSymmetricImplF \
 	   WilsonTwoIndexSymmetricImplD \
 	   WilsonTwoIndexAntiSymmetricImplF \
 	   WilsonTwoIndexAntiSymmetricImplD \
+	   SpWilsonTwoIndexAntiSymmetricImplF \
+	   SpWilsonTwoIndexAntiSymmetricImplD \
+	   SpWilsonTwoIndexSymmetricImplF \
+	   SpWilsonTwoIndexSymmetricImplD \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "

--- a/Grid/qcd/action/gauge/Gauge.h
+++ b/Grid/qcd/action/gauge/Gauge.h
@@ -39,6 +39,9 @@ NAMESPACE_BEGIN(Grid);
 typedef WilsonGaugeAction<PeriodicGimplR>          WilsonGaugeActionR;
 typedef WilsonGaugeAction<PeriodicGimplF>          WilsonGaugeActionF;
 typedef WilsonGaugeAction<PeriodicGimplD>          WilsonGaugeActionD;
+typedef WilsonGaugeAction<SpPeriodicGimplR>        SpWilsonGaugeActionR;
+typedef WilsonGaugeAction<SpPeriodicGimplF>        SpWilsonGaugeActionF;
+typedef WilsonGaugeAction<SpPeriodicGimplD>        SpWilsonGaugeActionD;
 typedef PlaqPlusRectangleAction<PeriodicGimplR>    PlaqPlusRectangleActionR;
 typedef PlaqPlusRectangleAction<PeriodicGimplF>    PlaqPlusRectangleActionF;
 typedef PlaqPlusRectangleAction<PeriodicGimplD>    PlaqPlusRectangleActionD;
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -61,7 +61,7 @@ NAMESPACE_BEGIN(Grid);
  typedef typename Impl::Field Field;

 // hardcodes the exponential approximation in the template
-template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
+template <class S, int Nrepresentation = Nc, int Nexp = 12, class Group = SU<Nc> > class GaugeImplTypes {
 public:
  typedef S Simd;
  typedef typename Simd::scalar_type scalar_type;
@@ -78,8 +78,6 @@ public:
  typedef Lattice<SiteLink>    LinkField; 
  typedef Lattice<SiteField>   Field;

-  typedef SU<Nrepresentation> Group;
-
  // Guido: we can probably separate the types from the HMC functions
  // this will create 2 kind of implementations
  // probably confusing the users
@@ -119,6 +117,7 @@ public:
    //
    LinkField Pmu(P.Grid());
    Pmu = Zero();
+
    for (int mu = 0; mu < Nd; mu++) {
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
@@ -126,8 +125,12 @@ public:
      PokeIndex<LorentzIndex>(P, Pmu, mu);
    }
  }
-
-  static inline Field projectForce(Field &P) { return Ta(P); }
+    
+  static inline Field projectForce(Field &P) {
+      Field ret(P.Grid());
+      Group::taProj(P, ret);
+      return ret;
+    }

  static inline void update_field(Field& P, Field& U, double ep){
    //static std::chrono::duration<double> diff;
@@ -137,14 +140,15 @@ public:
    autoView(P_v,P,AcceleratorRead);
    accelerator_for(ss, P.Grid()->oSites(),1,{
      for (int mu = 0; mu < Nd; mu++) {
-        U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
+          U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu);
+          U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu));
      }
    });
   //auto end = std::chrono::high_resolution_clock::now();
   // diff += end - start;
   // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
  }
-
+    
  static inline RealD FieldSquareNorm(Field& U){
    LatticeComplex Hloc(U.Grid());
    Hloc = Zero();
@@ -157,7 +161,7 @@ public:
  }

  static inline void Project(Field &U) {
-    ProjectSUn(U);
+    Group::ProjectOnSpecialGroup(U);
  }

  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -171,6 +175,7 @@ public:
  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
    Group::ColdConfiguration(pRNG, U);
  }
+
 };


@@ -178,10 +183,17 @@ typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
 typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
 typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;

+typedef GaugeImplTypes<vComplex, Nc, 12, Sp<Nc> > SpGimplTypesR;
+typedef GaugeImplTypes<vComplexF, Nc, 12, Sp<Nc> > SpGimplTypesF;
+typedef GaugeImplTypes<vComplexD, Nc, 12, Sp<Nc> > SpGimplTypesD;
+
 typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR;
 typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF;
 typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD;

+
+
+
 NAMESPACE_END(Grid);

 #endif // GRID_GAUGE_IMPL_TYPES_H
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -176,7 +176,7 @@ public:
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }

-  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };
@@ -193,6 +193,11 @@ typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever pre
 typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
 typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double

+typedef PeriodicGaugeImpl<SpGimplTypesR> SpPeriodicGimplR; // Real.. whichever prec
+typedef PeriodicGaugeImpl<SpGimplTypesF> SpPeriodicGimplF; // Float
+typedef PeriodicGaugeImpl<SpGimplTypesD> SpPeriodicGimplD; // Double
+
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@@ -43,7 +43,7 @@ public:
 private:
  RealD c_plaq;
  RealD c_rect;
-
+  typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
 public:
  PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};

@@ -79,27 +79,18 @@ public:
    GridBase *grid = Umu.Grid();

    std::vector<GaugeLinkField> U (Nd,grid);
-    std::vector<GaugeLinkField> U2(Nd,grid);
-
    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-      WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
    }
+    std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
+    WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);

    GaugeLinkField dSdU_mu(grid);
    GaugeLinkField staple(grid);

    for (int mu=0; mu < Nd; mu++){
-
-      // Staple in direction mu
-
-      WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
-
-      dSdU_mu = Ta(U[mu]*staple)*factor_p;
-
-      WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
-
-      dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
+      dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
+      dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
 	  
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -225,6 +225,18 @@ template <class RepresentationsPolicy,
 using GenericHMCRunnerHirep =
 				     HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>;

+// Sp(2n)
+
+template <template <typename, typename, typename> class Integrator>
+using GenericSpHMCRunner = HMCWrapperTemplate<SpPeriodicGimplR, Integrator>;
+
+template <class RepresentationsPolicy,
+          template <typename, typename, typename> class Integrator>
+using GenericSpHMCRunnerHirep =
+                     HMCWrapperTemplate<SpPeriodicGimplR, Integrator, RepresentationsPolicy>;
+
+
+
 template <class Implementation, class RepresentationsPolicy, 
          template <typename, typename, typename> class Integrator>
 using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>;
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -87,6 +87,8 @@ public:

  const ActionSet<Field, RepresentationPolicy> as;

+  ActionSet<Field,RepresentationPolicy> LevelForces;
+  
  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
    static MomentumFilterNone<MomentaField> filter;
@@ -124,6 +126,9 @@ public:
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing

+    assert(as.size()==LevelForces.size());
+    
+    Field level_force(U.Grid()); level_force =Zero();
    for (int a = 0; a < as[level].actions.size(); ++a) {

      double start_full = usecond();
@@ -144,7 +149,10 @@ public:
      MomFilter->applyFilter(force);

      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
-      
+
+      // track the total
+      level_force = level_force+force;
+
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    

@@ -167,6 +175,16 @@ public:

    }

+    {
+      // total force
+      Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
+      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      Real force_max   = std::sqrt(maxLocalNorm2(level_force));
+      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
+      LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
+    }
+
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);

@@ -216,6 +234,16 @@ public:

    //Default the momentum filter to "do-nothing"
    MomFilter = getDefaultMomFilter();
+
+    for (int level = 0; level < as.size(); ++level) {
+      int multiplier = as.at(level).multiplier;
+      ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier);
+      Level->push_back(new EmptyAction<Field>); 
+      LevelForces.push_back(*Level);
+      // NOTE: push_back copies the ActionLevel by value, BUT the ActionLevel contains a reference
+      // that the copy does NOT update (presumably it still refers into *Level, which is never deleted here).
+      // Unsafe code in Guido's area
+    }
  };

  virtual ~Integrator() {}
@@ -233,10 +261,14 @@ public:

  void reset_timer(void)
  {
+    assert(as.size()==LevelForces.size());
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        as[level].actions.at(actionID)->reset_timer();
      }
+      int actionID=0;
+      assert(LevelForces.at(level).actions.size()==1);
+      LevelForces.at(level).actions.at(actionID)->reset_timer();
    }
  }
  void print_timer(void)
@@ -298,6 +330,16 @@ public:
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
+      int actionID=0;
+      std::cout << GridLogMessage 
+		  << LevelForces[level].actions.at(actionID)->action_name()
+		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t "
+		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
+		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average()
+		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average()
+		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
+		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num
+		  << std::endl;
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
@@ -319,6 +361,13 @@ public:
 	std::cout << as[level].actions.at(actionID)->LogParameters();
      }
    }
+    std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
+    for (int level = 0; level < LevelForces.size(); ++level) {
+      std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
+      for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
+	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
+      }
+    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }

@@ -400,6 +449,7 @@ public:
  RealD S(Field& U) 
  {  // here also U not used

+    assert(as.size()==LevelForces.size());
    std::cout << GridLogIntegrator << "Integrator action\n";

    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
--- a/Grid/qcd/representations/fundamental.h
+++ b/Grid/qcd/representations/fundamental.h
@@ -13,7 +13,7 @@ NAMESPACE_BEGIN(Grid);
 * Empty since HMC updates already the fundamental representation 
 */

-template <int ncolour>
+template <int ncolour, class group_name>
 class FundamentalRep {
 public:
  static const int Dimension = ncolour;
@@ -21,7 +21,7 @@ public:

  // typdef to be used by the Representations class in HMC to get the
  // types for the higher representation fields
-  typedef typename SU<ncolour>::LatticeMatrix LatticeMatrix;
+  typedef typename GaugeGroup<ncolour,group_name>::LatticeMatrix LatticeMatrix;
  typedef LatticeGaugeField LatticeField;
  
  explicit FundamentalRep(GridBase* grid) {} //do nothing
@@ -45,7 +45,8 @@ public:
    

  
-typedef	 FundamentalRep<Nc> FundamentalRepresentation;
+typedef	 FundamentalRep<Nc,GroupName::SU> FundamentalRepresentation;
+typedef	 FundamentalRep<Nc,GroupName::Sp> SpFundamentalRepresentation;

 NAMESPACE_END(Grid);  

--- a/Grid/qcd/representations/two_index.h
+++ b/Grid/qcd/representations/two_index.h
@@ -20,14 +20,14 @@ NAMESPACE_BEGIN(Grid);
 * in the SUnTwoIndex.h file
 */

-template <int ncolour, TwoIndexSymmetry S>
+template <int ncolour, TwoIndexSymmetry S, class group_name = GroupName::SU>
 class TwoIndexRep {
 public:
  // typdef to be used by the Representations class in HMC to get the
  // types for the higher representation fields
-  typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
-  typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
-  static const int Dimension = ncolour * (ncolour + S) / 2;
+  typedef typename GaugeGroupTwoIndex<ncolour, S, group_name>::LatticeTwoIndexMatrix LatticeMatrix;
+  typedef typename GaugeGroupTwoIndex<ncolour, S, group_name>::LatticeTwoIndexField LatticeField;
+  static const int Dimension = GaugeGroupTwoIndex<ncolour,S,group_name>::Dimension;
  static const bool isFundamental = false;

  LatticeField U;
@@ -43,10 +43,10 @@ public:
    U = Zero();
    LatticeColourMatrix tmp(Uin.Grid());

-    Vector<typename SU<ncolour>::Matrix> eij(Dimension);
+    Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);

    for (int a = 0; a < Dimension; a++)
-      SU_TwoIndex<ncolour, S>::base(a, eij[a]);
+      GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);

    for (int mu = 0; mu < Nd; mu++) {
      auto Uin_mu = peekLorentz(Uin, mu);
@@ -71,7 +71,7 @@ public:

      out_mu = Zero();

-      typename SU<ncolour>::LatticeAlgebraVector h(in.Grid());
+      typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector h(in.Grid());
      projectOnAlgebra(h, in_mu, double(Nc + 2 * S));  // factor T(r)/T(fund)
      FundamentalLieAlgebraMatrix(h, out_mu);          // apply scale only once
      pokeLorentz(out, out_mu, mu);
@@ -80,20 +80,23 @@ public:
  }

 private:
-  void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out,
+  void projectOnAlgebra(typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h_out,
                        const LatticeMatrix &in, Real scale = 1.0) const {
-    SU_TwoIndex<ncolour, S>::projectOnAlgebra(h_out, in, scale);
+    GaugeGroupTwoIndex<ncolour, S,group_name>::projectOnAlgebra(h_out, in, scale);
  }

  void FundamentalLieAlgebraMatrix(
-				   typename SU<ncolour>::LatticeAlgebraVector &h,
-				   typename SU<ncolour>::LatticeMatrix &out, Real scale = 1.0) const {
-    SU<ncolour>::FundamentalLieAlgebraMatrix(h, out, scale);
+				   typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h,
+				   typename GaugeGroup<ncolour, group_name>::LatticeMatrix &out, Real scale = 1.0) const {
+    GaugeGroup<ncolour,group_name>::FundamentalLieAlgebraMatrix(h, out, scale);
  }
 };

-typedef TwoIndexRep<Nc, Symmetric> TwoIndexSymmetricRepresentation;
-typedef TwoIndexRep<Nc, AntiSymmetric> TwoIndexAntiSymmetricRepresentation;
+typedef TwoIndexRep<Nc, Symmetric, GroupName::SU> TwoIndexSymmetricRepresentation;
+typedef TwoIndexRep<Nc, AntiSymmetric, GroupName::SU> TwoIndexAntiSymmetricRepresentation;
+
+typedef TwoIndexRep<Nc, Symmetric, GroupName::Sp> SpTwoIndexSymmetricRepresentation;
+typedef TwoIndexRep<Nc, AntiSymmetric, GroupName::Sp> SpTwoIndexAntiSymmetricRepresentation;

 NAMESPACE_END(Grid);

--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -1,3 +1,4 @@
+
 /*!
  @file GaugeConfiguration.h
  @brief Declares the GaugeConfiguration class
@@ -6,6 +7,15 @@

 NAMESPACE_BEGIN(Grid);

+
+template<class T> void Dump(const Lattice<T> & lat,
+			    std::string s,
+			    Coordinate site = Coordinate({0,0,0,0}))
+{
+  typename T::scalar_object tmp;
+  peekSite(tmp,lat,site);
+  std::cout << " Dump "<<s<<" "<<tmp<<std::endl;
+}
 /*!
  @brief Smeared configuration masked container
  Modified for a multi-subset smearing (aka Luscher Flowed HMC)
@@ -28,6 +38,101 @@ private:
  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
  typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;

+  void BaseSmearDerivative(GaugeField& SigmaTerm,
+			   const GaugeField& iLambda,
+			   const GaugeField& U,
+			   int mmu, RealD rho)
+  {
+    // Reference
+    // Morningstar, Peardon, Phys.Rev.D69,054501(2004)
+    // Equation 75
+    // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
+    // Output SigmaTerm
+
+    GridBase *grid = U.Grid();
+
+    WilsonLoops<Gimpl> WL;
+    GaugeLinkField staple(grid), u_tmp(grid);
+    GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
+    GaugeLinkField U_mu(grid), U_nu(grid);
+    GaugeLinkField sh_field(grid), temp_Sigma(grid);
+    Real rho_munu, rho_numu;
+
+    rho_munu = rho;
+    rho_numu = rho;
+    for(int mu = 0; mu < Nd; ++mu){
+      U_mu       = peekLorentz(      U, mu);
+      iLambda_mu = peekLorentz(iLambda, mu);
+
+      for(int nu = 0; nu < Nd; ++nu){
+	if(nu==mu) continue;
+
+	U_nu       = peekLorentz(      U, nu);
+
+	// Nd*(Nd-1) = 12 staples normally.
+	// In the FTHMC case we must compute only 6 of these:
+	// those with mu==mmu or nu==mmu.
+	if ( (mu==mmu)||(nu==mmu) )
+	  WL.StapleUpper(staple, U, mu, nu);
+	
+	if(nu==mmu) {
+	  iLambda_nu = peekLorentz(iLambda, nu);
+
+	  temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok
+	  //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
+	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
+
+	  sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity?
+
+	  temp_Sigma = rho_numu*sh_field*staple; //ok
+	  //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
+	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
+	}
+
+	if ( mu == mmu ) { 
+	  sh_field = Cshift(iLambda_mu, nu, 1);
+
+	  temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok
+	  //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
+	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
+	}
+
+	//	staple = Zero();
+	sh_field = Cshift(U_nu, mu, 1);
+
+	temp_Sigma = Zero();
+
+	if ( mu == mmu )
+	  temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
+
+	if ( nu == mmu ) {
+	  temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
+
+	  u_tmp = adj(U_nu)*iLambda_nu;
+	  sh_field = Cshift(u_tmp, mu, 1);
+	  temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
+	}
+	
+	sh_field = Cshift(temp_Sigma, nu, -1);
+	Gimpl::AddLink(SigmaTerm, sh_field, mu);
+
+      }
+    }
+  }
+  
+  void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) {
+    GridBase *grid = U.Grid();
+    GaugeLinkField tmp_stpl(grid);
+    WilsonLoops<Gimpl> WL;
+    Cup = Zero();
+    for(int nu=0; nu<Nd; ++nu){
+      if (nu != mu) {
+	// get the staple in direction mu, nu
+	WL.Staple(tmp_stpl, U, mu, nu);  //nb staple conventions of IroIro and Grid differ by a dagger
+	Cup += adj(tmp_stpl*rho);
+      }
+    }
+  }
  // Adjoint vector to GaugeField force
  void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
  {
@@ -47,27 +152,54 @@ private:
    GaugeLinkField UtaU(PlaqL.Grid());
    GaugeLinkField D(PlaqL.Grid());
    AdjMatrixField Dbc(PlaqL.Grid());
+    AdjMatrixField Dbc_opt(PlaqL.Grid());
    LatticeComplex tmp(PlaqL.Grid());
    const int Ngen = SU3Adjoint::Dimension;
    Complex ci(0,1);
    ColourMatrix   ta,tb,tc;
-    
+    RealD t=0;
+    RealD tp=0;
+    RealD tta=0;
+    RealD tpk=0;
+    t-=usecond();
    for(int a=0;a<Ngen;a++) {
+      tta-=usecond();
      SU3::generator(a, ta);
+      ta = 2.0 * ci * ta;
      // Qlat Tb = 2i Tb^Grid
-      UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
+      UtaU= adj(PlaqL)*ta*PlaqR; // 6ms
+      tta+=usecond();
+      ////////////////////////////////////////////
+      // Could add this entire C-loop to a projection routine
+      // for performance. Could also pick checkerboard on UtaU
+      // and set checkerboard on result for 2x perf
+      ////////////////////////////////////////////
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
-	D = Ta( (2.0)*ci*tc *UtaU);
+	tc = 2.0*ci*tc;
+	tp-=usecond(); 
+	D = Ta( tc *UtaU); // 2ms
+#if 1
+	SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms
+#else
 	for(int b=0;b<Ngen;b++){
 	  SU3::generator(b, tb);
 	  tmp =-trace(ci*tb*D); 
 	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep
 	}
+#endif
+	tp+=usecond();
      }
-      tmp = trace(MpInvJx * Dbc);
+      //      Dump(Dbc_opt,"Dbc_opt");
+      //      Dump(Dbc,"Dbc");
+      tpk-=usecond();
+      tmp = trace(MpInvJx * Dbc_opt);
      PokeIndex<ColourIndex>(Fdet2,tmp,a);
+      tpk+=usecond();
    }
+    t+=usecond();
+    std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms  proj "<<tp/1e3<< " ms"
+	      << " ta "<<tta/1e3<<" ms" << " poke "<<tpk/1e3<< " ms"<<std::endl;
  }
  
  void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
@@ -79,12 +211,17 @@ private:
    ColourMatrix   tc;
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, tb);
-      Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
+      tb = 2.0 * ci * tb;
+      Nx = Ta( adj(PlaqL)*tb * PlaqR );
+#if 1
+      SU3::LieAlgebraProject(NxAd,Nx,b);
+#else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
 	auto tmp =closure( -trace(ci*tc*Nx)); 
 	PokeIndex<ColourIndex>(NxAd,tmp,c,b); 
      }
+#endif
    }
  }
  void ApplyMask(GaugeField &U,int smr)
@@ -164,8 +301,7 @@ public:
    // Computes ALL the staples -- could compute one only and do it here
    RealD time;
    time=-usecond();
-    this->StoutSmearing->BaseSmear(C, U);
-    Cmu = peekLorentz(C, mu);
+    BaseSmear(Cmu, U,mu,rho);

    //////////////////////////////////////////////////////////////////
    // Assemble Luscher exp diff map J matrix 
@@ -209,6 +345,36 @@ public:
    // dJ(x)/dxe
    //////////////////////////////////////
    time=-usecond();
+#if 1
+    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
+    std::vector<AdjMatrix> TRb_s; TRb_s.resize(8);
+    AdjMatrixField tbXn(grid);
+    AdjMatrixField sumXtbX(grid);
+    AdjMatrixField t2(grid);
+    AdjMatrixField dt2(grid);
+    AdjMatrixField t3(grid);
+    AdjMatrixField dt3(grid);
+    AdjMatrixField aunit(grid);
+
+    for(int b=0;b<8;b++){
+      SU3Adjoint::generator(b, TRb_s[b]);
+      dJdX[b] = TRb_s[b];
+    }
+    aunit = ComplexD(1.0);
+    // Could put into an accelerator_for
+    X  = (-1.0)*ZxAd; 
+    t2 = X;
+    for (int j = 12; j > 1; --j) {
+      t3  = t2*(1.0 / (j + 1))  + aunit;
+      t2  = X * t3;
+      for(int b=0;b<8;b++){
+	dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1));
+      }
+    }
+    for(int b=0;b<8;b++){
+      dJdX[b] = -dJdX[b];
+    }
+#else
    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
    AdjMatrixField tbXn(grid);
    AdjMatrixField sumXtbX(grid);
@@ -224,14 +390,15 @@ public:
      X  = (-1.0)*ZxAd; 
      t2 = X;
      dt2 = TRb;
-      for (int j = 20; j > 1; --j) {
-	t3 = t2*(1.0 / (j + 1))  + aunit;
+      for (int j = 12; j > 1; --j) {
+	t3  = t2*(1.0 / (j + 1))  + aunit;
 	dt3 = dt2*(1.0 / (j + 1));
 	t2 = X * t3;
 	dt2 = TRb * t3 + X * dt3;
      }
      dJdX[b] = -dt2; 
    }
+#endif  
    time+=usecond();
    std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
    /////////////////////////////////////////////////////////////////
@@ -281,8 +448,8 @@ public:
    
    for(int e =0 ; e<8 ; e++){
      LatticeComplexD tr(grid);
-      ColourMatrix te;
-      SU3::generator(e, te);
+      //      ColourMatrix te;
+      //      SU3::generator(e, te);
      tr = trace(dJdX[e] * nMpInv);
      pokeColour(dJdXe_nMpInv,tr,e);
    }
@@ -493,20 +660,25 @@ public:
    //////////////////////////////////////////////////////////////////
    // Assemble the N matrix
    //////////////////////////////////////////////////////////////////
-    // Computes ALL the staples -- could compute one only here
-    this->StoutSmearing->BaseSmear(C, U);
-    Cmu = peekLorentz(C, mu);
+    double rho=this->StoutSmearing->SmearRho[1];
+    BaseSmear(Cmu, U,mu,rho);
+
    Umu = peekLorentz(U, mu);
    Complex ci(0,1);
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, Tb);
      // Qlat Tb = 2i Tb^Grid
      Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
+      // FIXME -- replace this with LieAlgebraProject
+#if 0
+      SU3::LieAlgebraProject(Ncb,tmp,b);
+#else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, Tc);
 	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
 	PokeIndex<ColourIndex>(Ncb,tmp,c,b); 
      }
+#endif
    }      

    //////////////////////////////////////////////////////////////////
@@ -693,15 +865,19 @@ private:
 					  const GaugeField& GaugeK,int level) 
  {
    GridBase* grid = GaugeK.Grid();
-    GaugeField C(grid), SigmaK(grid), iLambda(grid);
+    GaugeField SigmaK(grid), iLambda(grid);
    GaugeField SigmaKPrimeA(grid);
    GaugeField SigmaKPrimeB(grid);
    GaugeLinkField iLambda_mu(grid);
    GaugeLinkField iQ(grid), e_iQ(grid);
    GaugeLinkField SigmaKPrime_mu(grid);
    GaugeLinkField GaugeKmu(grid), Cmu(grid);
-    
-    this->StoutSmearing->BaseSmear(C, GaugeK);
+
+    int mmu= (level/2) %Nd;
+    int cb= (level%2);
+    double rho=this->StoutSmearing->SmearRho[1];
+
+    // Can override this to do one direction only.
    SigmaK = Zero();
    iLambda = Zero();

@@ -712,18 +888,38 @@ private:
    // Could get away with computing only one polarisation here
    // int mu= (smr/2) %Nd;
    // SigmaKprime_A has only one component
-    for (int mu = 0; mu < Nd; mu++)
+#if 0
+    BaseSmear(Cmu, GaugeK,mu,rho);
+    GaugeKmu = peekLorentz(GaugeK, mu);
+    SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
+    iQ = Ta(Cmu * adj(GaugeKmu));
+    this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
+    pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
+    pokeLorentz(iLambda, iLambda_mu, mu);
+    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
+#else
+    //    GaugeField C(grid);
+    //    this->StoutSmearing->BaseSmear(C, GaugeK);
+    //    for (int mu = 0; mu < Nd; mu++)
+    int mu =mmu;
+    BaseSmear(Cmu, GaugeK,mu,rho);
    {
-      Cmu = peekLorentz(C, mu);
+      // Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
      iQ = Ta(Cmu * adj(GaugeKmu));
      this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
      pokeLorentz(iLambda, iLambda_mu, mu);
+      std::cout << " mu "<<mu<<" SigmaKPrime_mu"<<norm2(SigmaKPrime_mu)<< " iLambda_mu " <<norm2(iLambda_mu)<<std::endl;
    }
-    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
-
+    //    GaugeField SigmaKcopy(grid);
+    //    SigmaKcopy = SigmaK;
+    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
+    //    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
+    //    SigmaKcopy = SigmaKcopy - SigmaK;
+    //    std::cout << " BaseSmearDerivative fast path error" <<norm2(SigmaKcopy)<<std::endl;
+#endif
    ////////////////////////////////////////////////////////////////////////////////////
    // propagate the rest of the force as identity map, just add back
    ////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -37,13 +37,14 @@ NAMESPACE_BEGIN(Grid);
 // Make these members of an Impl class for BC's.

 namespace PeriodicBC { 
-
+  //Out(x) = Link(x)*field(x+mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
 									   int mu,
 									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
+  //Out(x) = Link^dag(x-mu)*field(x-mu)
  template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link, 
 									    int mu,
 									    const Lattice<covariant> &field)
@@ -52,19 +53,19 @@ namespace PeriodicBC {
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-
+  //Out(x) = Link^dag(x-mu)
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
  {
    return Cshift(adj(Link), mu, -1);
  }
-
+  //Out(x) = Link(x)
  template<class gauge> Lattice<gauge>
  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
  {
    return Link;
  }
-
+  //Out(x) = Link(x+mu)
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
--- a/Grid/qcd/utils/GaugeGroup.h
+++ b/Grid/qcd/utils/GaugeGroup.h
@@ -0,0 +1,528 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/utils/GaugeGroup.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_UTIL_GAUGEGROUP_H
+#define QCD_UTIL_GAUGEGROUP_H
+
+// Important detail: nvcc requires all template parameters to have names.
+// This is the only reason why the second template parameter has a name.
+#define ONLY_IF_SU                                                       \
+  typename dummy_name = group_name,                                      \
+           typename named_dummy = std::enable_if_t <                                 \
+                          std::is_same<dummy_name, group_name>::value && \
+                      is_su<dummy_name>::value >
+
+#define ONLY_IF_Sp                                                       \
+  typename dummy_name = group_name,                                      \
+           typename named_dummy = std::enable_if_t <                                 \
+                          std::is_same<dummy_name, group_name>::value && \
+                      is_sp<dummy_name>::value >
+
+NAMESPACE_BEGIN(Grid);
+namespace GroupName {
+class SU {};
+class Sp {};
+}  // namespace GroupName
+
+// Compile-time predicates over the GroupName tags.
+// ONLY_IF_SU / ONLY_IF_Sp test ::value to enable group-specific members,
+// so each trait must expose a static boolean constant named value.
+
+// is_su<T>::value is false for a generic T ...
+template <typename group_name>
+struct is_su : std::false_type {};
+
+// ... and true for T = GroupName::SU.
+template <>
+struct is_su<GroupName::SU> : std::true_type {};
+
+// is_sp<T>::value is false for a generic T ...
+template <typename group_name>
+struct is_sp : std::false_type {};
+
+// ... and true for T = GroupName::Sp.
+template <>
+struct is_sp<GroupName::Sp> : std::true_type {};
+
+template <typename group_name>  // primary template: declared only, see the specialisations below
+constexpr int compute_adjoint_dimension(int ncolour);
+
+template <>
+constexpr int compute_adjoint_dimension<GroupName::SU>(int ncolour) {
+  return ncolour * ncolour - 1;  // SU tag: ncolour^2 - 1
+}
+
+template <>
+constexpr int compute_adjoint_dimension<GroupName::Sp>(int ncolour) {
+  return ncolour / 2 * (ncolour + 1);  // Sp tag: (ncolour/2)*(ncolour+1); the integer division is applied first
+}
+
+template <int ncolour, class group_name>
+class GaugeGroup {
+ public:
+  static const int Dimension = ncolour;
+  static const int AdjointDimension =
+      compute_adjoint_dimension<group_name>(ncolour);
+  static const int AlgebraDimension =
+      compute_adjoint_dimension<group_name>(ncolour);
+
+  template <typename vtype>
+  using iSU2Matrix = iScalar<iScalar<iMatrix<vtype, 2> > >;
+  template <typename vtype>
+  using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
+  template <typename vtype>
+  using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
+  template <typename vtype>
+  using iSUnAlgebraMatrix =
+    iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >;
+  static int su2subgroups(void) { return su2subgroups(group_name()); }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  // Types can be accessed as SU<2>::Matrix , SU<2>::vMatrix,
+  // SU<2>::LatticeMatrix etc...
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  typedef iGroupMatrix<Complex> Matrix;
+  typedef iGroupMatrix<ComplexF> MatrixF;
+  typedef iGroupMatrix<ComplexD> MatrixD;
+
+  typedef iGroupMatrix<vComplex> vMatrix;
+  typedef iGroupMatrix<vComplexF> vMatrixF;
+  typedef iGroupMatrix<vComplexD> vMatrixD;
+
+  // For the projectors to the algebra
+  // these should be real...
+  // keeping complex for consistency with the SIMD vector types
+  typedef iAlgebraVector<Complex> AlgebraVector;
+  typedef iAlgebraVector<ComplexF> AlgebraVectorF;
+  typedef iAlgebraVector<ComplexD> AlgebraVectorD;
+
+  typedef iAlgebraVector<vComplex> vAlgebraVector;
+  typedef iAlgebraVector<vComplexF> vAlgebraVectorF;
+  typedef iAlgebraVector<vComplexD> vAlgebraVectorD;
+
+  typedef Lattice<vMatrix> LatticeMatrix;
+  typedef Lattice<vMatrixF> LatticeMatrixF;
+  typedef Lattice<vMatrixD> LatticeMatrixD;
+  
+  typedef Lattice<vAlgebraVector> LatticeAlgebraVector;
+  typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
+  typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;
+   
+  typedef iSUnAlgebraMatrix<vComplex>  vAlgebraMatrix;
+  typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF;
+  typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD;
+
+  typedef Lattice<vAlgebraMatrix>  LatticeAlgebraMatrix;
+  typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF;
+  typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD;
+  
+
+  typedef iSU2Matrix<Complex> SU2Matrix;
+  typedef iSU2Matrix<ComplexF> SU2MatrixF;
+  typedef iSU2Matrix<ComplexD> SU2MatrixD;
+
+  typedef iSU2Matrix<vComplex> vSU2Matrix;
+  typedef iSU2Matrix<vComplexF> vSU2MatrixF;
+  typedef iSU2Matrix<vComplexD> vSU2MatrixD;
+
+  typedef Lattice<vSU2Matrix> LatticeSU2Matrix;
+  typedef Lattice<vSU2MatrixF> LatticeSU2MatrixF;
+  typedef Lattice<vSU2MatrixD> LatticeSU2MatrixD;
+
+  // Private implementation details are specified in the following files:
+  // Grid/qcd/utils/SUn.impl.h
+  // Grid/qcd/utils/Sp2n.impl.h
+  // The public part of the interface follows below and refers to these
+  // private member functions.
+
+#include <Grid/qcd/utils/SUn.impl.h>
+#include <Grid/qcd/utils/Sp2n.impl.h>
+
+ public:
+  template <class cplx>
+  static void generator(int lieIndex, iGroupMatrix<cplx> &ta) {
+    return generator(lieIndex, ta, group_name());
+  }
+
+  static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
+    return su2SubGroupIndex(i1, i2, su2_index, group_name());
+  }
+
+  static void testGenerators(void) { testGenerators(group_name()); }
+
+  static void printGenerators(void) {
+    for (int gen = 0; gen < AlgebraDimension; gen++) {
+      Matrix ta;
+      generator(gen, ta);
+      std::cout << GridLogMessage << "Nc = " << ncolour << " t_" << gen
+                << std::endl;
+      std::cout << GridLogMessage << ta << std::endl;
+    }
+  }
+
+  template <typename LatticeMatrixType>
+  static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
+                           double scale = 1.0) {
+    GridBase *grid = out.Grid();
+
+    typedef typename LatticeMatrixType::vector_type vector_type;
+
+    typedef iSinglet<vector_type> vTComplexType;
+
+    typedef Lattice<vTComplexType> LatticeComplexType;
+    typedef typename GridTypeMapper<
+        typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
+
+    LatticeComplexType ca(grid);
+    LatticeMatrixType lie(grid);
+    LatticeMatrixType la(grid);
+    ComplexD ci(0.0, scale);
+    MatrixType ta;
+
+    lie = Zero();
+
+    for (int a = 0; a < AlgebraDimension; a++) {
+      random(pRNG, ca);
+
+      ca = (ca + conjugate(ca)) * 0.5;
+      ca = ca - 0.5;
+
+      generator(a, ta);
+
+      la = ci * ca * ta;
+
+      lie = lie + la;  // e^{i la ta}
+    }
+    taExp(lie, out);
+  }
+
+  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG,
+                                                  LatticeMatrix &out,
+                                                  Real scale = 1.0) {
+    GridBase *grid = out.Grid();
+    LatticeReal ca(grid);
+    LatticeMatrix la(grid);
+    Complex ci(0.0, scale);
+    Matrix ta;
+
+    out = Zero();
+    for (int a = 0; a < AlgebraDimension; a++) {
+      gaussian(pRNG, ca);
+      generator(a, ta);
+      la = toComplex(ca) * ta;
+      out += la;
+    }
+    out *= ci;
+  }
+
+  static void FundamentalLieAlgebraMatrix(const LatticeAlgebraVector &h,
+                                          LatticeMatrix &out,
+                                          Real scale = 1.0) {
+    conformable(h, out);
+    GridBase *grid = out.Grid();
+    LatticeMatrix la(grid);
+    Matrix ta;
+
+    out = Zero();
+    for (int a = 0; a < AlgebraDimension; a++) {
+      generator(a, ta);
+      la = peekColour(h, a) * timesI(ta) * scale;
+      out += la;
+    }
+  }
+
+  // Projects a lattice matrix on its algebra components h_out (AlgebraDimension of them per site).
+  // Inverse operation: FundamentalLieAlgebraMatrix
+  static void projectOnAlgebra(LatticeAlgebraVector &h_out,
+                               const LatticeMatrix &in, Real scale = 1.0) {
+    conformable(h_out, in);
+    h_out = Zero();
+    Matrix Ta;
+
+    for (int a = 0; a < AlgebraDimension; a++) {
+      generator(a, Ta);
+      pokeColour(h_out, -2.0 * (trace(timesI(Ta) * in)) * scale, a);  // h_a = -2 * scale * tr(i T_a in)
+    }
+  }
+
+   
+  template <class vtype>
+  accelerator_inline static iScalar<vtype> ProjectOnGeneralGroup(const iScalar<vtype> &r) {
+    return ProjectOnGeneralGroup(r, group_name());
+  }
+
+  template <class vtype, int N>
+  accelerator_inline static iVector<vtype,N> ProjectOnGeneralGroup(const iVector<vtype,N> &r) {
+    return ProjectOnGeneralGroup(r, group_name());
+  }
+
+  template <class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
+  accelerator_inline static iMatrix<vtype,N> ProjectOnGeneralGroup(const iMatrix<vtype,N> &arg) {
+    return ProjectOnGeneralGroup(arg, group_name());
+  }
+
+  // Projects each link of U, in place, on the general groups U(N), Sp(2N)xZ2 i.e. determinant is allowed a complex phase.
+  template <int N,class vComplex_t>
+  static void ProjectOnGeneralGroup(Lattice<iVector<iScalar<iMatrix<vComplex_t, N> >, Nd> > &U) {
+    for (int mu = 0; mu < Nd; mu++) {
+      auto Umu = PeekIndex<LorentzIndex>(U, mu);
+      Umu = ProjectOnGeneralGroup(Umu);
+      PokeIndex<LorentzIndex>(U, Umu, mu);  // write the projected link back, as the gauge-field ProjectOnSpecialGroup does; without this U is left unchanged
+    }
+  }
+
+  template <int N,class vComplex_t>
+  static Lattice<iScalar<iScalar<iMatrix<vComplex_t, N> > > > ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vComplex_t, N> > > > &Umu) {
+    return ProjectOnGeneralGroup(Umu, group_name());
+  }
+
+  // Projects on SU(N), Sp(2N), with unit determinant, by first projecting on general group and then enforcing unit determinant
+  template <int N,class vComplex_t>
+  static void ProjectOnSpecialGroup(Lattice<iScalar<iScalar<iMatrix<vComplex_t, N> > > > &Umu) {
+       Umu = ProjectOnGeneralGroup(Umu);
+       auto det = Determinant(Umu);
+       det = conjugate(det);
+       // Multiply the last row (row N-1) by conj(det); this scales the determinant by conj(det).
+       for (int i = 0; i < N; i++) {
+           auto element = PeekIndex<ColourIndex>(Umu, N - 1, i);
+           element = element * det;
+           PokeIndex<ColourIndex>(Umu, element, N - 1, i);  // same row as the peek (was Nc - 1, wrong whenever N != Nc)
+       }
+   }
+
+  template <int N,class vComplex_t>    // reunitarise, resymplectify... previously ProjectSUn
+    static void ProjectOnSpecialGroup(Lattice<iVector<iScalar<iMatrix<vComplex_t, N> >, Nd> > &U) {
+      // Project each of the Nd Lorentz components of U in turn and write it back
+      for (int mu = 0; mu < Nd; mu++) {
+        auto Umu = PeekIndex<LorentzIndex>(U, mu);
+        ProjectOnSpecialGroup(Umu);
+        PokeIndex<LorentzIndex>(U, Umu, mu);
+      }
+    }
+    
+  template <typename GaugeField>
+  static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) {
+    typedef typename GaugeField::vector_type vector_type;
+    typedef iGroupMatrix<vector_type> vMatrixType;
+    typedef Lattice<vMatrixType> LatticeMatrixType;
+
+    LatticeMatrixType Umu(out.Grid());
+    LatticeMatrixType tmp(out.Grid());
+    for (int mu = 0; mu < Nd; mu++) {
+      //      LieRandomize(pRNG, Umu, 1.0);
+      //      PokeIndex<LorentzIndex>(out, Umu, mu);
+      gaussian(pRNG,Umu);
+      tmp = Ta(Umu);
+      taExp(tmp,Umu);
+      ProjectOnSpecialGroup(Umu);
+      //      ProjectSUn(Umu);
+      PokeIndex<LorentzIndex>(out, Umu, mu);
+    }
+  }
+  template <typename GaugeField>
+  static void TepidConfiguration(GridParallelRNG &pRNG, GaugeField &out) {
+    typedef typename GaugeField::vector_type vector_type;
+    typedef iGroupMatrix<vector_type> vMatrixType;
+    typedef Lattice<vMatrixType> LatticeMatrixType;
+
+    LatticeMatrixType Umu(out.Grid());
+    for (int mu = 0; mu < Nd; mu++) {
+      LieRandomize(pRNG, Umu, 0.01);
+      PokeIndex<LorentzIndex>(out, Umu, mu);
+    }
+  }
+    
+  template <typename GaugeField>
+  static void ColdConfiguration(GaugeField &out) {
+    typedef typename GaugeField::vector_type vector_type;
+    typedef iGroupMatrix<vector_type> vMatrixType;
+    typedef Lattice<vMatrixType> LatticeMatrixType;
+
+    LatticeMatrixType Umu(out.Grid());
+    Umu = 1.0;
+    for (int mu = 0; mu < Nd; mu++) {
+      PokeIndex<LorentzIndex>(out, Umu, mu);
+    }
+  }
+    
+  template <typename GaugeField>
+  static void ColdConfiguration(GridParallelRNG &pRNG, GaugeField &out) {
+    ColdConfiguration(out);
+  }
+
+  template <typename LatticeMatrixType>
+  static void taProj(const LatticeMatrixType &in, LatticeMatrixType &out) {
+    taProj(in, out, group_name());
+  }
+    
+  // Truncated Taylor series for the exponential: ex = sum_{k=0}^{12} x^k / k!
+  template <typename LatticeMatrixType>
+  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) {
+    typedef typename LatticeMatrixType::scalar_type ComplexType;
+
+    LatticeMatrixType power(x.Grid());  // running power x^k
+    RealD inv_factorial = 1.0;          // running 1/k!
+
+    power = x;
+    ex = power + ComplexType(1.0);      // k = 0 and k = 1 terms: 1+x
+
+    for (int k = 2; k <= 12; ++k) {     // 12th order exponentiation
+      inv_factorial = inv_factorial / RealD(k);
+      power = power * x;
+      ex = ex + power * inv_factorial;
+    }
+  }
+
+// Fills out(a,b), for every algebra index a, with components of `in` read from its colour matrix elements.
+// Ta are hermitian (?)
+// Anti herm is i Ta basis
+static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b)
+{
+  conformable(in, out);
+  GridBase *grid = out.Grid();
+  // Using Luchang's projection convention
+  //  2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a
+  autoView(out_v,out,AcceleratorWrite);
+  autoView(in_v,in,AcceleratorRead);
+  int N = ncolour;
+  int NNm1 = N * (N - 1);
+  int hNNm1= NNm1/2;
+  // Off-diagonal part: su2 subgroup su2Index fills rows 2*su2Index and 2*su2Index+1 of column b.
+  // NOTE(review): the two kernels pass different nsimd arguments (1 vs vComplex::Nsimd()) -- confirm this is intended.
+  for(int su2Index=0;su2Index<hNNm1;su2Index++){
+    int i1, i2;
+    su2SubGroupIndex(i1, i2, su2Index);
+    int ax = su2Index*2;
+    int ay = su2Index*2+1;
+    accelerator_for(ss,grid->oSites(),1,{
+	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
+	// trace( Ta x Ci in)
+	// Bet I need to move to real part with mult by -i
+	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
+	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
+      });
+  }
+  // Diagonal part: fills rows N(N-1) .. N(N-1)+N-2 of column b from the imaginary parts of the diagonal of `in`.
+  for(int diagIndex=0;diagIndex<N-1;diagIndex++){
+    int k = diagIndex + 1; // diagIndex starts from 0
+    int a = NNm1+diagIndex;
+    RealD scale = 1.0/sqrt(2.0*k*(k+1));
+    accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
+	auto tmp = in_v[ss]()()(0,0);
+	for(int i=1;i<k;i++){
+	  tmp=tmp+in_v[ss]()()(i,i);
+	}
+	tmp = tmp - in_v[ss]()()(k,k)*k;
+	out_v[ss]()()(a,b) =imag(tmp) * scale;
+      });
+    }
+}
+
+  
+};
+    
+template <int ncolour>
+using SU = GaugeGroup<ncolour, GroupName::SU>;
+
+template <int ncolour>
+using Sp = GaugeGroup<ncolour, GroupName::Sp>;
+
+typedef SU<2> SU2;
+typedef SU<3> SU3;
+typedef SU<4> SU4;
+typedef SU<5> SU5;
+
+typedef SU<Nc> FundamentalMatrices;
+    
+typedef Sp<2> Sp2;
+typedef Sp<4> Sp4;
+typedef Sp<6> Sp6;
+typedef Sp<8> Sp8;
+
+template <int N,class vComplex_t>
+static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplex_t, N> > > > &Umu)
+{
+    GaugeGroup<N,GroupName::SU>::ProjectOnSpecialGroup(Umu);
+}
+  
+template <int N,class vComplex_t>
+static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplex_t, N> >,Nd> > &U)
+{
+    GaugeGroup<N,GroupName::SU>::ProjectOnSpecialGroup(U);
+}
+    
+template <int N,class vComplex_t>
+static void ProjectSpn(Lattice<iScalar<iScalar<iMatrix<vComplex_t, N> > > > &Umu)
+{
+    GaugeGroup<N,GroupName::Sp>::ProjectOnSpecialGroup(Umu);
+}
+    
+template <int N,class vComplex_t>
+static void ProjectSpn(Lattice<iVector<iScalar<iMatrix<vComplex_t, N> >,Nd> > &U)
+{
+    GaugeGroup<N,GroupName::Sp>::ProjectOnSpecialGroup(U);
+}
+
+// Explicit specialisation for SU(3): after ProjectOnGroup, row 2 is overwritten with adj() of the cross product of rows 0 and 1.
+static void ProjectSU3(Lattice<iScalar<iScalar<iMatrix<vComplexD, 3> > > > &Umu)
+{
+  GridBase *grid = Umu.Grid();
+  const int x = 0;  // x, y, z label the three colour columns
+  const int y = 1;
+  const int z = 2;
+  // Reunitarise
+  Umu = ProjectOnGroup(Umu);
+  autoView(Umu_v, Umu, CpuWrite);
+  thread_for(ss, grid->oSites(), {
+    auto cm = Umu_v[ss];  // work on a local copy of the site matrix, stored back below
+    cm()()(2, x) = adj(cm()()(0, y) * cm()()(1, z) -
+                       cm()()(0, z) * cm()()(1, y));  // x= yz-zy
+    cm()()(2, y) = adj(cm()()(0, z) * cm()()(1, x) -
+                       cm()()(0, x) * cm()()(1, z));  // y= zx-xz
+    cm()()(2, z) = adj(cm()()(0, x) * cm()()(1, y) -
+                       cm()()(0, y) * cm()()(1, x));  // z= xy-yx
+    Umu_v[ss] = cm;
+  });
+}
+// Applies the single-link ProjectSU3 to each of the Nd Lorentz components of U and writes the result back.
+static void ProjectSU3(Lattice<iVector<iScalar<iMatrix<vComplexD, 3> >, Nd> > &U)
+{
+  // Reunitarise
+  for (int mu = 0; mu < Nd; mu++) {
+    auto Umu = PeekIndex<LorentzIndex>(U, mu);
+    Umu = ProjectOnGroup(Umu);  // NOTE(review): ProjectSU3(Umu) below also starts with ProjectOnGroup -- confirm this extra call is wanted
+    ProjectSU3(Umu);
+    PokeIndex<LorentzIndex>(U, Umu, mu);
+  }
+}
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/qcd/utils/GaugeGroupTwoIndex.h
+++ b/Grid/qcd/utils/GaugeGroupTwoIndex.h
@@ -0,0 +1,371 @@
+////////////////////////////////////////////////////////////////////////
+//
+// * Two index representation generators
+//
+// * Normalisation for the fundamental generators:
+//   trace ta tb = 1/2 delta_ab = T_F delta_ab
+//   T_F = 1/2  for SU(N) groups
+//
+//
+//   basis for NxN two-index (anti-)symmetric matrices,
+//   normalized to 1 (d_ij is the Kronecker delta):
+//
+//   (e^(ij))_{kl} = 1 / sqrt(2) (d_ik d_jl +/- d_jk d_il)
+//
+//   Then the generators are written as
+//
+//   (iT_a)^(ij)(lk) = i * ( tr[e^(ij)^dag e^(lk) T^transp_a] +
+//   tr[e^(lk)e^(ij)^dag T_a] )
+//
+//
+////////////////////////////////////////////////////////////////////////
+
+// Authors: David Preti, Guido Cossu
+
+#ifndef QCD_UTIL_GAUGEGROUPTWOINDEX_H
+#define QCD_UTIL_GAUGEGROUPTWOINDEX_H
+
+NAMESPACE_BEGIN(Grid);
+
+enum TwoIndexSymmetry { Symmetric = 1, AntiSymmetric = -1 };
+// Kronecker delta as a Real: 1 when a == b, 0 otherwise.
+constexpr inline Real delta(int a, int b) { return static_cast<Real>(a == b); }
+
+namespace detail {
+
+// Off-diagonal two-index basis matrices for Sp, selected at compile time by
+// the symmetry S; only the two specialisations below are defined.
+template <class cplx, int nc, TwoIndexSymmetry S>
+struct baseOffDiagonalSpHelper;
+
+// Antisymmetric specialisation.
+template <class cplx, int nc>
+struct baseOffDiagonalSpHelper<cplx, nc, AntiSymmetric> {
+  static const int ngroup = nc / 2;
+  static void baseOffDiagonalSp(int i, int j, iScalar<iScalar<iMatrix<cplx, nc> > > &eij) {
+    const RealD nrm = 1. / std::sqrt(2.0);
+    const bool paired = (i == ngroup + j);  // i sits exactly ngroup above j
+    eij = Zero();
+
+    if (paired && (1 <= j) && (j < ngroup)) {
+      // Fill the (k, k+ngroup) / (k+ngroup, k) slots: 1/root for k < j and
+      // -j/root for k == j, with the opposite sign in the mirrored slot.
+      const RealD root = sqrt(j * (j + 1));
+      const RealD lower = 1 / root;
+      for (int k = 0; k < j; k++) {
+        eij()()(k, k + ngroup) = lower;
+        eij()()(k + ngroup, k) = -lower;
+      }
+      const RealD last = -j / root;
+      eij()()(j, j + ngroup) = last;
+      eij()()(j + ngroup, j) = -last;
+    } else if (!paired) {
+      // Generic pair: antisymmetrised product of Kronecker deltas.
+      for (int col = 0; col < nc; col++)
+        for (int row = 0; row < nc; row++)
+          eij()()(row, col) =
+              delta(i, col) * delta(j, row) - delta(j, col) * delta(i, row);
+    }
+    // A paired (i, j) with j == 0 or j >= ngroup leaves eij at zero.
+    eij = eij * nrm;
+  }
+};
+
+// Symmetric specialisation: symmetrised product of Kronecker deltas.
+template <class cplx, int nc>
+struct baseOffDiagonalSpHelper<cplx, nc, Symmetric> {
+  static void baseOffDiagonalSp(int i, int j, iScalar<iScalar<iMatrix<cplx, nc> > > &eij) {
+    const RealD nrm = 1. / std::sqrt(2.0);
+    eij = Zero();
+    for (int col = 0; col < nc; col++)
+      for (int row = 0; row < nc; row++)
+        eij()()(row, col) =
+            delta(i, col) * delta(j, row) + delta(j, col) * delta(i, row);
+    eij = eij * nrm;
+  }
+};
+
+}   // closing detail namespace
+
+template <int ncolour, TwoIndexSymmetry S, class group_name>
+class GaugeGroupTwoIndex : public GaugeGroup<ncolour, group_name> {
+ public:
+  // The chosen convention is that we are taking ncolour to be N in SU<N> but 2N
+  // in Sp(2N). ngroup is equal to N for SU but 2N/2 = N for Sp(2N).
+  static_assert(std::is_same<group_name, GroupName::SU>::value or
+                    std::is_same<group_name, GroupName::Sp>::value,
+                "ngroup is only implemented for SU and Sp currently.");
+  static const int ngroup =
+      std::is_same<group_name, GroupName::SU>::value ? ncolour : ncolour / 2;
+  static const int Dimension =
+      (ncolour * (ncolour + S) / 2) + (std::is_same<group_name, GroupName::Sp>::value ? (S - 1) / 2 : 0);
+  static const int DimensionAS =
+      (ncolour * (ncolour - 1) / 2) + (std::is_same<group_name, GroupName::Sp>::value ? (- 1) : 0);
+  static const int DimensionS =
+      ncolour * (ncolour + 1) / 2;
+  static const int NumGenerators =
+      GaugeGroup<ncolour, group_name>::AlgebraDimension;
+
+  // Dimension x Dimension matrix type of the two-index representation.
+  template <typename vtype>
+  using iGroupTwoIndexMatrix = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
+
+  typedef iGroupTwoIndexMatrix<Complex> TIMatrix;
+  typedef iGroupTwoIndexMatrix<ComplexF> TIMatrixF;
+  typedef iGroupTwoIndexMatrix<ComplexD> TIMatrixD;
+
+  typedef iGroupTwoIndexMatrix<vComplex> vTIMatrix;
+  typedef iGroupTwoIndexMatrix<vComplexF> vTIMatrixF;
+  typedef iGroupTwoIndexMatrix<vComplexD> vTIMatrixD;
+
+  typedef Lattice<vTIMatrix> LatticeTwoIndexMatrix;
+  typedef Lattice<vTIMatrixF> LatticeTwoIndexMatrixF;
+  typedef Lattice<vTIMatrixD> LatticeTwoIndexMatrixD;
+
+  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
+      LatticeTwoIndexField;
+  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
+      LatticeTwoIndexFieldF;
+  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
+      LatticeTwoIndexFieldD;
+
+  // ncolour x ncolour matrix type, used for the basis elements e^(ij).
+  template <typename vtype>
+  using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
+
+  typedef iGroupMatrix<Complex> Matrix;
+  typedef iGroupMatrix<ComplexF> MatrixF;
+  typedef iGroupMatrix<ComplexD> MatrixD;
+
+private:
+  // Diagonal basis element: a single 1.0 at (d, d), d = Index - ncolour*(ncolour-1)/2.
+  template <class cplx>
+  static void baseDiagonal(int Index, iGroupMatrix<cplx> &eij) {
+    const int d = Index - ncolour * (ncolour - 1) / 2;
+    eij = Zero();
+    eij()()(d, d) = 1.0;
+  }
+
+  // SU overload (tag dispatch): eij(l,k) = (d_ik d_jl + S d_jk d_il) / sqrt(2).
+  template <class cplx>
+  static void baseOffDiagonal(int i, int j, iGroupMatrix<cplx> &eij, GroupName::SU) {
+    eij = Zero();
+    for (int k = 0; k < ncolour; k++)
+      for (int l = 0; l < ncolour; l++)
+        eij()()(l, k) =
+            delta(i, k) * delta(j, l) + S * delta(j, k) * delta(i, l);
+    RealD nrm = 1. / std::sqrt(2.0);
+    eij = eij * nrm;
+  }
+
+  // Sp overload (tag dispatch): forwards to detail::baseOffDiagonalSpHelper.
+  template <class cplx>
+  static void baseOffDiagonal(int i, int j, iGroupMatrix<cplx> &eij, GroupName::Sp) {
+    detail::baseOffDiagonalSpHelper<cplx, ncolour, S>::baseOffDiagonalSp(i, j, eij);
+  }
+
+public:
+  // Fills eij with basis element number Index, 0 <= Index < Dimension:
+  // returns (e)^(ij)_{kl} necessary for change of base U_F -> U_R
+  template <class cplx>
+  static void base(int Index, iGroupMatrix<cplx> &eij) {
+    assert(Index >= 0 && Index < Dimension);
+    eij = Zero();
+    // Linearisation of the two indices: a[n] = {i, j}, filled once on first use.
+    // NOTE(review): the lazy fill is not synchronised -- confirm the first call is single-threaded.
+    static int a[ncolour * (ncolour - 1) / 2][2];  // store the a <-> i,j
+    static bool filled = false;
+    if (!filled) {
+      int counter = 0;
+      for (int i = 1; i < ncolour; i++) {
+        for (int j = 0; j < i; j++) {
+          if (std::is_same<group_name, GroupName::Sp>::value) {
+            if (j==0 && i==ngroup+j && S==-1) {
+              // for Sp2n this element vanishes identically: skip it.
+              j = j+1;
+            }
+          }
+          a[counter][0] = i;
+          a[counter][1] = j;
+          counter++;
+        }
+      }
+      filled = true;
+    }
+    if (Index < ncolour*ncolour - DimensionS) {
+      baseOffDiagonal(a[Index][0], a[Index][1], eij, group_name());
+    } else {
+      baseDiagonal(Index, eij);
+    }
+  }
+
+  // Prints every basis matrix through GridLogMessage.
+  static void printBase(void) {
+    for (int gen = 0; gen < Dimension; gen++) {
+      Matrix tmp;
+      base(gen, tmp);
+      std::cout << GridLogMessage << "Nc = " << ncolour << " t_" << gen << std::endl;
+      std::cout << GridLogMessage << tmp << std::endl;
+    }
+  }
+
+  // Fills i2indTa with i*T_a (a = Index) in the two-index representation:
+  // entry (a,b) = i * tr[ (transpose(e_a T) + transpose(e_a) T) e_b ].
+  template <class cplx>
+  static void generator(int Index, iGroupTwoIndexMatrix<cplx> &i2indTa) {
+    Vector<iGroupMatrix<cplx> > ta(NumGenerators);
+    Vector<iGroupMatrix<cplx> > eij(Dimension);
+    iGroupMatrix<cplx> tmp;
+
+    for (int a = 0; a < NumGenerators; a++)
+      GaugeGroup<ncolour, group_name>::generator(a, ta[a]);
+    for (int a = 0; a < Dimension; a++) base(a, eij[a]);
+
+    for (int a = 0; a < Dimension; a++) {
+      tmp = transpose(eij[a]*ta[Index]) + transpose(eij[a]) * ta[Index];
+      for (int b = 0; b < Dimension; b++) {
+        Complex iTr = TensorRemove(timesI(trace(tmp * eij[b])));
+        i2indTa()()(a, b) = iTr;
+      }
+    }
+  }
+
+  // Prints all NumGenerators two-index generators through GridLogMessage.
+  static void printGenerators(void) {
+    for (int gen = 0; gen < NumGenerators; gen++) {
+      TIMatrix i2indTa;
+      generator(gen, i2indTa);
+      std::cout << GridLogMessage << "Nc = " << ncolour << " t_" << gen << std::endl;
+      std::cout << GridLogMessage << i2indTa << std::endl;
+    }
+  }
+
+  // Asserts that the generators are traceless, anti-hermitian and satisfy
+  // Tr[Ta*Tb] = delta(a,b)*(N +- 2)/2; the trace checks are two-sided.
+  static void testGenerators(void) {
+    TIMatrix i2indTa, i2indTb;
+    std::cout << GridLogMessage << "2IndexRep - Checking if traceless" << std::endl;
+    for (int a = 0; a < NumGenerators; a++) {
+      generator(a, i2indTa);
+      std::cout << GridLogMessage << a << std::endl;
+      assert(norm2(trace(i2indTa)) < 1.0e-6);
+    }
+    std::cout << GridLogMessage << std::endl;
+
+    std::cout << GridLogMessage << "2IndexRep - Checking if antihermitean" << std::endl;
+    for (int a = 0; a < NumGenerators; a++) {
+      generator(a, i2indTa);
+      std::cout << GridLogMessage << a << std::endl;
+      assert(norm2(adj(i2indTa) + i2indTa) < 1.0e-6);
+    }
+
+    std::cout << GridLogMessage << std::endl;
+    std::cout << GridLogMessage
+              << "2IndexRep - Checking Tr[Ta*Tb]=delta(a,b)*(N +- 2)/2" << std::endl;
+    for (int a = 0; a < NumGenerators; a++) {
+      generator(a, i2indTa);  // independent of b, so built once per a
+      for (int b = 0; b < NumGenerators; b++) {
+        generator(b, i2indTb);
+        // generator returns iTa, so we need a minus sign here
+        Complex Tr = -TensorRemove(trace(i2indTa * i2indTb));
+        std::cout << GridLogMessage << "a=" << a << "b=" << b << "Tr=" << Tr << std::endl;
+        // std::abs: a deviation of either sign must fail, not only a positive one
+        if (a == b) {
+          assert(std::abs(real(Tr) - ((ncolour + S * 2) * 0.5)) < 1e-8);
+        } else {
+          assert(std::abs(real(Tr)) < 1e-8);
+        }
+        assert(std::abs(imag(Tr)) < 1e-8);
+      }
+    }
+    std::cout << GridLogMessage << std::endl;
+  }
+
+  // out = scale * sum_a peekColour(h, a) * iT_a, with iT_a from generator(a, .).
+  static void TwoIndexLieAlgebraMatrix(
+      const typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h,
+      LatticeTwoIndexMatrix &out, Real scale = 1.0) {
+    conformable(h, out);
+    GridBase *grid = out.Grid();
+    LatticeTwoIndexMatrix la(grid);
+    TIMatrix i2indTa;
+
+    out = Zero();
+    for (int a = 0; a < NumGenerators; a++) {
+      generator(a, i2indTa);
+      la = peekColour(h, a) * i2indTa;
+      out += la;
+    }
+    out *= scale;
+  }
+
+  // Projects the algebra components of a lattice matrix (of dimension ncol*ncol -1)
+  static void projectOnAlgebra(
+      typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h_out,
+      const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
+    conformable(h_out, in);
+    h_out = Zero();
+    TIMatrix i2indTa;
+    Real coefficient = -2.0 / (ncolour + 2 * S) * scale;
+    // 2/(Nc +/- 2) for the normalization of the trace in the two index rep
+    for (int a = 0; a < NumGenerators; a++) {
+      generator(a, i2indTa);
+      pokeColour(h_out, real(trace(i2indTa * in)) * coefficient, a);
+    }
+  }
+
+  // a projector that keeps the generators stored to avoid the overhead of
+  // recomputing them
+  static void projector(
+      typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h_out,
+      const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
+    conformable(h_out, in);
+    // to store the generators
+    static std::vector<TIMatrix> i2indTa(NumGenerators);
+    h_out = Zero();
+    static bool precalculated = false;
+    if (!precalculated) {
+      precalculated = true;
+      for (int a = 0; a < NumGenerators; a++) generator(a, i2indTa[a]);
+    }
+
+    // 2/(Nc +/- 2) for the normalization of the trace in the two index rep
+    Real coefficient = -2.0 / (ncolour + 2 * S) * scale;
+    for (int a = 0; a < NumGenerators; a++) {
+      auto tmp = real(trace(i2indTa[a] * in)) * coefficient;
+      pokeColour(h_out, tmp, a);
+    }
+  }
+};
+
+template <int ncolour, TwoIndexSymmetry S>
+using SU_TwoIndex = GaugeGroupTwoIndex<ncolour, S, GroupName::SU>;
+
+// Convenience names for SU: Nc-based first, then fixed colour counts.
+using TwoIndexSymmMatrices = SU_TwoIndex<Nc, Symmetric>;
+using TwoIndexAntiSymmMatrices = SU_TwoIndex<Nc, AntiSymmetric>;
+
+using SU2TwoIndexSymm = SU_TwoIndex<2, Symmetric>;
+using SU3TwoIndexSymm = SU_TwoIndex<3, Symmetric>;
+using SU4TwoIndexSymm = SU_TwoIndex<4, Symmetric>;
+using SU5TwoIndexSymm = SU_TwoIndex<5, Symmetric>;
+
+using SU2TwoIndexAntiSymm = SU_TwoIndex<2, AntiSymmetric>;
+using SU3TwoIndexAntiSymm = SU_TwoIndex<3, AntiSymmetric>;
+using SU4TwoIndexAntiSymm = SU_TwoIndex<4, AntiSymmetric>;
+using SU5TwoIndexAntiSymm = SU_TwoIndex<5, AntiSymmetric>;
+
+template <int ncolour, TwoIndexSymmetry S>
+using Sp_TwoIndex = GaugeGroupTwoIndex<ncolour, S, GroupName::Sp>;
+
+using SpTwoIndexSymmMatrices = Sp_TwoIndex<Nc, Symmetric>;
+using SpTwoIndexAntiSymmMatrices = Sp_TwoIndex<Nc, AntiSymmetric>;
+
+using Sp2TwoIndexSymm = Sp_TwoIndex<2, Symmetric>;
+using Sp4TwoIndexSymm = Sp_TwoIndex<4, Symmetric>;
+
+using Sp4TwoIndexAntiSymm = Sp_TwoIndex<4, AntiSymmetric>;
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@@ -1,932 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/utils/SUn.h
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#ifndef QCD_UTIL_SUN_H
-#define QCD_UTIL_SUN_H
-
-NAMESPACE_BEGIN(Grid);
-
-template<int N, class Vec>
-Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
-{
-  GridBase *grid=Umu.Grid();
-  auto lvol = grid->lSites();
-  Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
-  typedef typename Vec::scalar_type scalar;
-  autoView(Umu_v,Umu,CpuRead);
-  autoView(ret_v,ret,CpuWrite);
-  thread_for(site,lvol,{
-    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
-    Coordinate lcoor;
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    iScalar<iScalar<iMatrix<scalar, N> > > Us;
-    peekLocalSite(Us, Umu_v, lcoor);
-    for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-	scalar tmp= Us()()(i,j);
-	ComplexD ztmp(real(tmp),imag(tmp));
-	EigenU(i,j)=ztmp;
-      }}
-    ComplexD detD  = EigenU.determinant();
-    typename Vec::scalar_type det(detD.real(),detD.imag());
-    pokeLocalSite(det,ret_v,lcoor);
-  });
-  return ret;
-}
-
-template<int N, class Vec>
-static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
-{
-  Umu      = ProjectOnGroup(Umu);
-  auto det = Determinant(Umu);
-
-  det = conjugate(det);
-
-  for(int i=0;i<N;i++){
-    auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
-    element = element * det;
-    PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
-  }
-}
-template<int N,class Vec>
-static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<Vec, N> >,Nd> > &U)
-{
-  GridBase *grid=U.Grid();
-  // Reunitarise
-  for(int mu=0;mu<Nd;mu++){
-    auto Umu = PeekIndex<LorentzIndex>(U,mu);
-    Umu      = ProjectOnGroup(Umu);
-    ProjectSUn(Umu);
-    PokeIndex<LorentzIndex>(U,Umu,mu);
-  }
-}
-
-template <int ncolour>
-class SU {
-public:
-  static const int Dimension = ncolour;
-  static const int AdjointDimension = ncolour * ncolour - 1;
-  static int su2subgroups(void) { return (ncolour * (ncolour - 1)) / 2; }
-
-  template <typename vtype>
-  using iSUnMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
-  template <typename vtype>
-  using iSU2Matrix = iScalar<iScalar<iMatrix<vtype, 2> > >;
-  template <typename vtype>
-  using iSUnAlgebraVector =
-    iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////
-  // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix,
-  // SU<2>::LatticeMatrix etc...
-  //////////////////////////////////////////////////////////////////////////////////////////////////
-  typedef iSUnMatrix<Complex> Matrix;
-  typedef iSUnMatrix<ComplexF> MatrixF;
-  typedef iSUnMatrix<ComplexD> MatrixD;
-
-  typedef iSUnMatrix<vComplex> vMatrix;
-  typedef iSUnMatrix<vComplexF> vMatrixF;
-  typedef iSUnMatrix<vComplexD> vMatrixD;
-
-  // For the projectors to the algebra
-  // these should be real...
-  // keeping complex for consistency with the SIMD vector types
-  typedef iSUnAlgebraVector<Complex> AlgebraVector;
-  typedef iSUnAlgebraVector<ComplexF> AlgebraVectorF;
-  typedef iSUnAlgebraVector<ComplexD> AlgebraVectorD;
-
-  typedef iSUnAlgebraVector<vComplex> vAlgebraVector;
-  typedef iSUnAlgebraVector<vComplexF> vAlgebraVectorF;
-  typedef iSUnAlgebraVector<vComplexD> vAlgebraVectorD;
-
-  typedef Lattice<vMatrix> LatticeMatrix;
-  typedef Lattice<vMatrixF> LatticeMatrixF;
-  typedef Lattice<vMatrixD> LatticeMatrixD;
-
-  typedef Lattice<vAlgebraVector> LatticeAlgebraVector;
-  typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
-  typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;
-
-  typedef iSU2Matrix<Complex> SU2Matrix;
-  typedef iSU2Matrix<ComplexF> SU2MatrixF;
-  typedef iSU2Matrix<ComplexD> SU2MatrixD;
-
-  typedef iSU2Matrix<vComplex> vSU2Matrix;
-  typedef iSU2Matrix<vComplexF> vSU2MatrixF;
-  typedef iSU2Matrix<vComplexD> vSU2MatrixD;
-
-  typedef Lattice<vSU2Matrix> LatticeSU2Matrix;
-  typedef Lattice<vSU2MatrixF> LatticeSU2MatrixF;
-  typedef Lattice<vSU2MatrixD> LatticeSU2MatrixD;
-
-  ////////////////////////////////////////////////////////////////////////
-  // There are N^2-1 generators for SU(N).
-  //
-  // We take a traceless hermitian generator basis as follows
-  //
-  // * Normalisation: trace ta tb = 1/2 delta_ab = T_F delta_ab
-  //   T_F = 1/2  for SU(N) groups
-  //
-  // * Off diagonal
-  //    - pairs of rows i1,i2 behaving like pauli matrices signma_x, sigma_y
-  //
-  //    - there are (Nc-1-i1) slots for i2 on each row [ x  0  x ]
-  //      direct count off each row
-  //
-  //    - Sum of all pairs is Nc(Nc-1)/2: proof arithmetic series
-  //
-  //      (Nc-1) + (Nc-2)+...  1      ==> Nc*(Nc-1)/2
-  //      1+ 2+          +   + Nc-1
-  //
-  //    - There are 2 x Nc (Nc-1)/ 2 of these = Nc^2 - Nc
-  //
-  //    - We enumerate the row-col pairs.
-  //    - for each row col pair there is a (sigma_x) and a (sigma_y) like
-  //    generator
-  //
-  //
-  //   t^a_ij = { in 0.. Nc(Nc-1)/2 -1} =>  1/2(delta_{i,i1} delta_{j,i2} +
-  //   delta_{i,i1} delta_{j,i2})
-  //   t^a_ij = { in Nc(Nc-1)/2 ... Nc(Nc-1) - 1} =>  i/2( delta_{i,i1}
-  //   delta_{j,i2} - i delta_{i,i1} delta_{j,i2})
-  //
-  // * Diagonal; must be traceless and normalised
-  //   - Sequence is
-  //   N  (1,-1,0,0...)
-  //   N  (1, 1,-2,0...)
-  //   N  (1, 1, 1,-3,0...)
-  //   N  (1, 1, 1, 1,-4,0...)
-  //
-  //   where 1/2 = N^2 (1+.. m^2)etc.... for the m-th diagonal generator
-  //   NB this gives the famous SU3 result for su2 index 8
-  //
-  //   N= sqrt(1/2 . 1/6 ) = 1/2 . 1/sqrt(3)
-  //
-  //   ( 1      )
-  //   (    1   ) / sqrt(3) /2  = 1/2 lambda_8
-  //   (      -2)
-  //
-  ////////////////////////////////////////////////////////////////////////
-  template <class cplx>
-  static void generator(int lieIndex, iSUnMatrix<cplx> &ta) {
-    // map lie index to which type of generator
-    int diagIndex;
-    int su2Index;
-    int sigxy;
-    int NNm1 = ncolour * (ncolour - 1);
-    if (lieIndex >= NNm1) {
-      diagIndex = lieIndex - NNm1;
-      generatorDiagonal(diagIndex, ta);
-      return;
-    }
-    sigxy = lieIndex & 0x1;  // even or odd
-    su2Index = lieIndex >> 1;
-    if (sigxy)
-      generatorSigmaY(su2Index, ta);
-    else
-      generatorSigmaX(su2Index, ta);
-  }
-  
-  template <class cplx>
-  static void generatorSigmaY(int su2Index, iSUnMatrix<cplx> &ta) {
-    ta = Zero();
-    int i1, i2;
-    su2SubGroupIndex(i1, i2, su2Index);
-    ta()()(i1, i2) = 1.0;
-    ta()()(i2, i1) = 1.0;
-    ta = ta * 0.5;
-  }
-  
-  template <class cplx>
-  static void generatorSigmaX(int su2Index, iSUnMatrix<cplx> &ta) {
-    ta = Zero();
-    cplx i(0.0, 1.0);
-    int i1, i2;
-    su2SubGroupIndex(i1, i2, su2Index);
-    ta()()(i1, i2) = i;
-    ta()()(i2, i1) = -i;
-    ta = ta * 0.5;
-  }
-
-  template <class cplx>
-  static void generatorDiagonal(int diagIndex, iSUnMatrix<cplx> &ta) {
-    // diag ({1, 1, ..., 1}(k-times), -k, 0, 0, ...)
-    ta = Zero();
-    int k = diagIndex + 1;                  // diagIndex starts from 0
-    for (int i = 0; i <= diagIndex; i++) {  // k iterations
-      ta()()(i, i) = 1.0;
-    }
-    ta()()(k, k) = -k;  // indexing starts from 0
-    RealD nrm = 1.0 / std::sqrt(2.0 * k * (k + 1));
-    ta = ta * nrm;
-  }
-
-
-
-  ////////////////////////////////////////////////////////////////////////
-  // Map a su2 subgroup number to the pair of rows that are non zero
-  ////////////////////////////////////////////////////////////////////////
-  static void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
-    assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
-
-    int spare = su2_index;
-    for (i1 = 0; spare >= (ncolour - 1 - i1); i1++) {
-      spare = spare - (ncolour - 1 - i1);  // remove the Nc-1-i1 terms
-    }
-    i2 = i1 + 1 + spare;
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////
-  // Pull out a subgroup and project on to real coeffs x pauli basis
-  //////////////////////////////////////////////////////////////////////////////////////////
-  template <class vcplx>
-  static void su2Extract(Lattice<iSinglet<vcplx> > &Determinant,
-                         Lattice<iSU2Matrix<vcplx> > &subgroup,
-                         const Lattice<iSUnMatrix<vcplx> > &source,
-                         int su2_index) {
-    GridBase *grid(source.Grid());
-    conformable(subgroup, source);
-    conformable(subgroup, Determinant);
-    int i0, i1;
-    su2SubGroupIndex(i0, i1, su2_index);
-
-    autoView( subgroup_v , subgroup,AcceleratorWrite);
-    autoView( source_v   , source,AcceleratorRead);
-    autoView( Determinant_v , Determinant,AcceleratorWrite);
-    accelerator_for(ss, grid->oSites(), 1, {
-
-      subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
-      subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
-      subgroup_v[ss]()()(1, 0) = source_v[ss]()()(i1, i0);
-      subgroup_v[ss]()()(1, 1) = source_v[ss]()()(i1, i1);
-
-      iSU2Matrix<vcplx> Sigma = subgroup_v[ss];
-
-      Sigma = Sigma - adj(Sigma) + trace(adj(Sigma));
-
-      subgroup_v[ss] = Sigma;
-
-      // this should be purely real
-      Determinant_v[ss] =
-	Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
-    });
-  }
-
-  //////////////////////////////////////////////////////////////////////////////////////////
-  // Set matrix to one and insert a pauli subgroup
-  //////////////////////////////////////////////////////////////////////////////////////////
-  template <class vcplx>
-  static void su2Insert(const Lattice<iSU2Matrix<vcplx> > &subgroup,
-                        Lattice<iSUnMatrix<vcplx> > &dest, int su2_index) {
-    GridBase *grid(dest.Grid());
-    conformable(subgroup, dest);
-    int i0, i1;
-    su2SubGroupIndex(i0, i1, su2_index);
-
-    dest = 1.0;  // start out with identity
-    autoView( dest_v , dest, AcceleratorWrite);
-    autoView( subgroup_v, subgroup, AcceleratorRead);
-    accelerator_for(ss, grid->oSites(),1,
-    {
-      dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
-      dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
-      dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
-      dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
-    });
-
-  }
-
-  ///////////////////////////////////////////////
-  // Generate e^{ Re Tr Staple Link} dlink
-  //
-  // *** Note Staple should be appropriate linear compbination between all
-  // staples.
-  // *** If already by beta pass coefficient 1.0.
-  // *** This routine applies the additional 1/Nc factor that comes after trace
-  // in action.
-  //
-  ///////////////////////////////////////////////
-  static void SubGroupHeatBath(GridSerialRNG &sRNG, GridParallelRNG &pRNG,
-			       RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
-			       LatticeMatrix &link,
-			       const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
-			       int su2_subgroup, int nheatbath, LatticeInteger &wheremask) 
-  {
-    GridBase *grid = link.Grid();
-
-    const RealD twopi = 2.0 * M_PI;
-
-    LatticeMatrix staple(grid);
-
-    staple = barestaple * (beta / ncolour);
-
-    LatticeMatrix V(grid);
-    V = link * staple;
-
-    // Subgroup manipulation in the lie algebra space
-    LatticeSU2Matrix u(grid);  // Kennedy pendleton "u" real projected normalised Sigma
-    LatticeSU2Matrix uinv(grid);
-    LatticeSU2Matrix ua(grid);  // a in pauli form
-    LatticeSU2Matrix b(grid);   // rotated matrix after hb
-
-    // Some handy constant fields
-    LatticeComplex ones(grid);
-    ones = 1.0;
-    LatticeComplex zeros(grid);
-    zeros = Zero();
-    LatticeReal rones(grid);
-    rones = 1.0;
-    LatticeReal rzeros(grid);
-    rzeros = Zero();
-    LatticeComplex udet(grid);  // determinant of real(staple)
-    LatticeInteger mask_true(grid);
-    mask_true = 1;
-    LatticeInteger mask_false(grid);
-    mask_false = 0;
-
-    /*
-      PLB 156 P393 (1985) (Kennedy and Pendleton)
-
-      Note: absorb "beta" into the def of sigma compared to KP paper; staple
-      passed to this routine has "beta" already multiplied in
-
-      Action linear in links h and of form:
-
-      beta S = beta  Sum_p (1 - 1/Nc Re Tr Plaq )
-
-      Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "
-
-      beta S = const - beta/Nc Re Tr h Sigma'
-      = const - Re Tr h Sigma
-
-      Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex
-      arbitrary.
-
-      Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j)  = h_i Sigma_j 2 delta_ij
-      Re Tr h Sigma = 2 h_j Re Sigma_j
-
-      Normalised re Sigma_j = xi u_j
-
-      With u_j a unit vector and U can be in SU(2);
-
-      Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)
-
-      4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
-      u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
-
-      xi = sqrt(Det)/2;
-
-      Write a= u h in SU(2); a has pauli decomp a_j;
-
-      Note: Product b' xi is unvariant because scaling Sigma leaves
-      normalised vector "u" fixed; Can rescale Sigma so b' = 1.
-    */
-
-    ////////////////////////////////////////////////////////
-    // Real part of Pauli decomposition
-    // Note a subgroup can project to zero in cold start
-    ////////////////////////////////////////////////////////
-    su2Extract(udet, u, V, su2_subgroup);
-
-    //////////////////////////////////////////////////////
-    // Normalising this vector if possible; else identity
-    //////////////////////////////////////////////////////
-    LatticeComplex xi(grid);
-
-    LatticeSU2Matrix lident(grid);
-
-    SU2Matrix ident = Complex(1.0);
-    SU2Matrix pauli1;
-    SU<2>::generator(0, pauli1);
-    SU2Matrix pauli2;
-    SU<2>::generator(1, pauli2);
-    SU2Matrix pauli3;
-    SU<2>::generator(2, pauli3);
-    pauli1 = timesI(pauli1) * 2.0;
-    pauli2 = timesI(pauli2) * 2.0;
-    pauli3 = timesI(pauli3) * 2.0;
-
-    LatticeComplex cone(grid);
-    LatticeReal adet(grid);
-    adet = abs(toReal(udet));
-    lident = Complex(1.0);
-    cone = Complex(1.0);
-    Real machine_epsilon = 1.0e-7;
-    u = where(adet > machine_epsilon, u, lident);
-    udet = where(adet > machine_epsilon, udet, cone);
-
-    xi = 0.5 * sqrt(udet);  // 4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
-    u = 0.5 * u *
-      pow(xi, -1.0);  //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
-
-    // Debug test for sanity
-    uinv = adj(u);
-    b = u * uinv - 1.0;
-    assert(norm2(b) < 1.0e-4);
-
-    /*
-      Measure: Haar measure dh has d^4a delta(1-|a^2|)
-      In polars:
-      da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2)
-      = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) +
-      r) )
-      = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) )
-
-      Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters
-      through xi
-      =  e^{2 xi (h.u)} dh
-      =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi
-      h2u2}.e^{2 xi h3u3} dh
-
-      Therefore for each site, take xi for that site
-      i) generate  |a0|<1 with dist
-      (1-a0^2)^0.5 e^{2 xi a0 } da0
-
-      Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc
-      factor in Chroma ]
-      A. Generate two uniformly distributed pseudo-random numbers R and R', R'',
-      R''' in the unit interval;
-      B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
-      C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
-      D. Set A = XC;
-      E. Let d  = X'+A;
-      F. If R'''^2 :> 1 - 0.5 d,  go back to A;
-      G. Set a0 = 1 - d;
-
-      Note that in step D setting B ~ X - A and using B in place of A in step E will
-      generate a second independent a 0 value.
-    */
-
-    /////////////////////////////////////////////////////////
-    // count the number of sites by picking "1"'s out of hat
-    /////////////////////////////////////////////////////////
-    Integer hit = 0;
-    LatticeReal rtmp(grid);
-    rtmp = where(wheremask, rones, rzeros);
-    RealD numSites = sum(rtmp);
-    RealD numAccepted;
-    LatticeInteger Accepted(grid);
-    Accepted = Zero();
-    LatticeInteger newlyAccepted(grid);
-
-    std::vector<LatticeReal> xr(4, grid);
-    std::vector<LatticeReal> a(4, grid);
-    LatticeReal d(grid);
-    d = Zero();
-    LatticeReal alpha(grid);
-
-    //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
-    xi = 2.0 *xi;
-    alpha = toReal(xi);
-
-    do {
-      // A. Generate two uniformly distributed pseudo-random numbers R and R',
-      // R'', R''' in the unit interval;
-      random(pRNG, xr[0]);
-      random(pRNG, xr[1]);
-      random(pRNG, xr[2]);
-      random(pRNG, xr[3]);
-
-      // B. Set X = - ln R/alpha, X' = -ln R'/alpha
-      xr[1] = -log(xr[1]) / alpha;
-      xr[2] = -log(xr[2]) / alpha;
-
-      // C. Set C = cos^2(2piR'')
-      xr[3] = cos(xr[3] * twopi);
-      xr[3] = xr[3] * xr[3];
-
-      LatticeReal xrsq(grid);
-
-      // D. Set A = XC;
-      // E. Let d  = X'+A;
-      xrsq = xr[2] + xr[1] * xr[3];
-
-      d = where(Accepted, d, xr[2] + xr[1] * xr[3]);
-
-      // F. If R'''^2 :> 1 - 0.5 d,  go back to A;
-      LatticeReal thresh(grid);
-      thresh = 1.0 - d * 0.5;
-      xrsq = xr[0] * xr[0];
-      LatticeInteger ione(grid);
-      ione = 1;
-      LatticeInteger izero(grid);
-      izero = Zero();
-
-      newlyAccepted = where(xrsq < thresh, ione, izero);
-      Accepted = where(newlyAccepted, newlyAccepted, Accepted);
-      Accepted = where(wheremask, Accepted, izero);
-
-      // FIXME need an iSum for integer to avoid overload on return type??
-      rtmp = where(Accepted, rones, rzeros);
-      numAccepted = sum(rtmp);
-
-      hit++;
-
-    } while ((numAccepted < numSites) && (hit < nheatbath));
-
-    // G. Set a0 = 1 - d;
-    a[0] = Zero();
-    a[0] = where(wheremask, 1.0 - d, a[0]);
-
-    //////////////////////////////////////////
-    //    ii) generate a_i uniform on two sphere radius (1-a0^2)^0.5
-    //////////////////////////////////////////
-
-    LatticeReal a123mag(grid);
-    a123mag = sqrt(abs(1.0 - a[0] * a[0]));
-
-    LatticeReal cos_theta(grid);
-    LatticeReal sin_theta(grid);
-    LatticeReal phi(grid);
-
-    random(pRNG, phi);
-    phi = phi * twopi;  // uniform in [0,2pi]
-    random(pRNG, cos_theta);
-    cos_theta = (cos_theta * 2.0) - 1.0;  // uniform in [-1,1]
-    sin_theta = sqrt(abs(1.0 - cos_theta * cos_theta));
-
-    a[1] = a123mag * sin_theta * cos(phi);
-    a[2] = a123mag * sin_theta * sin(phi);
-    a[3] = a123mag * cos_theta;
-
-    ua = toComplex(a[0]) * ident  + toComplex(a[1]) * pauli1 +
-         toComplex(a[2]) * pauli2 + toComplex(a[3]) * pauli3;
-
-    b = 1.0;
-    b = where(wheremask, uinv * ua, b);
-    su2Insert(b, V, su2_subgroup);
-
-    // mask the assignment back based on Accptance
-    link = where(Accepted, V * link, link);
-
-    //////////////////////////////
-    // Debug Checks
-    // SU2 check
-    LatticeSU2Matrix check(grid);  // rotated matrix after hb
-    u = Zero();
-    check = ua * adj(ua) - 1.0;
-    check = where(Accepted, check, u);
-    assert(norm2(check) < 1.0e-4);
-
-    check = b * adj(b) - 1.0;
-    check = where(Accepted, check, u);
-    assert(norm2(check) < 1.0e-4);
-
-    LatticeMatrix Vcheck(grid);
-    Vcheck = Zero();
-    Vcheck = where(Accepted, V * adj(V) - 1.0, Vcheck);
-    //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
-    assert(norm2(Vcheck) < 1.0e-4);
-
-    // Verify the link stays in SU(3)
-    //    std::cout<<GridLogMessage <<"Checking the modified link"<<std::endl;
-    Vcheck = link * adj(link) - 1.0;
-    assert(norm2(Vcheck) < 1.0e-4);
-    /////////////////////////////////
-  }
-
-  static void printGenerators(void) {
-    for (int gen = 0; gen < AdjointDimension; gen++) {
-      Matrix ta;
-      generator(gen, ta);
-      std::cout << GridLogMessage << "Nc = " << ncolour << " t_" << gen
-                << std::endl;
-      std::cout << GridLogMessage << ta << std::endl;
-    }
-  }
-
-
-
-  static void testGenerators(void) {
-    Matrix ta;
-    Matrix tb;
-    std::cout << GridLogMessage
-              << "Fundamental - Checking trace ta tb is 0.5 delta_ab"
-              << std::endl;
-    for (int a = 0; a < AdjointDimension; a++) {
-      for (int b = 0; b < AdjointDimension; b++) {
-        generator(a, ta);
-        generator(b, tb);
-        Complex tr = TensorRemove(trace(ta * tb));
-        std::cout << GridLogMessage << "(" << a << "," << b << ") =  " << tr
-                  << std::endl;
-        if (a == b) assert(abs(tr - Complex(0.5)) < 1.0e-6);
-        if (a != b) assert(abs(tr) < 1.0e-6);
-      }
-      std::cout << GridLogMessage << std::endl;
-    }
-    std::cout << GridLogMessage << "Fundamental - Checking if hermitian"
-              << std::endl;
-    for (int a = 0; a < AdjointDimension; a++) {
-      generator(a, ta);
-      std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(ta - adj(ta)) < 1.0e-6);
-    }
-    std::cout << GridLogMessage << std::endl;
-
-    std::cout << GridLogMessage << "Fundamental - Checking if traceless"
-              << std::endl;
-    for (int a = 0; a < AdjointDimension; a++) {
-      generator(a, ta);
-      Complex tr = TensorRemove(trace(ta));
-      std::cout << GridLogMessage << a << " " << std::endl;
-      assert(abs(tr) < 1.0e-6);
-    }
-    std::cout << GridLogMessage << std::endl;
-  }
-
-  // reunitarise??
-  template <typename LatticeMatrixType>
-  static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0) 
-  {
-    GridBase *grid = out.Grid();
-
-    typedef typename LatticeMatrixType::vector_type vector_type;
-
-    typedef iSinglet<vector_type> vTComplexType;
-
-    typedef Lattice<vTComplexType> LatticeComplexType;
-    typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
-
-    LatticeComplexType ca(grid);
-    LatticeMatrixType lie(grid);
-    LatticeMatrixType la(grid);
-    ComplexD ci(0.0, scale);
-    //    ComplexD cone(1.0, 0.0);
-    MatrixType ta;
-
-    lie = Zero();
-
-    for (int a = 0; a < AdjointDimension; a++) {
-      random(pRNG, ca);
-
-      ca = (ca + conjugate(ca)) * 0.5;
-      ca = ca - 0.5;
-
-      generator(a, ta);
-
-      la = ci * ca * ta;
-
-      lie = lie + la;  // e^{i la ta}
-
-    }
-    taExp(lie, out);
-  }
-
-  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG,
-                                                  LatticeMatrix &out,
-                                                  Real scale = 1.0) {
-    GridBase *grid = out.Grid();
-    LatticeReal ca(grid);
-    LatticeMatrix la(grid);
-    Complex ci(0.0, scale);
-    Matrix ta;
-
-    out = Zero();
-    for (int a = 0; a < AdjointDimension; a++) {
-      gaussian(pRNG, ca);
-      generator(a, ta);
-      la = toComplex(ca) * ta;
-      out += la;
-    }
-    out *= ci;
-  }
-
-  static void FundamentalLieAlgebraMatrix(const LatticeAlgebraVector &h,
-                                          LatticeMatrix &out,
-                                          Real scale = 1.0) {
-    conformable(h, out);
-    GridBase *grid = out.Grid();
-    LatticeMatrix la(grid);
-    Matrix ta;
-
-    out = Zero();
-    for (int a = 0; a < AdjointDimension; a++) {
-      generator(a, ta);
-      la = peekColour(h, a) * timesI(ta) * scale;
-      out += la;
-    }
-  }
-/*
- * Fundamental rep gauge xform
- */
-  template<typename Fundamental,typename GaugeMat>
-  static void GaugeTransformFundamental( Fundamental &ferm, GaugeMat &g){
-    GridBase *grid = ferm._grid;
-    conformable(grid,g._grid);
-    ferm = g*ferm;
-  }
-/*
- * Adjoint rep gauge xform
- */
-
-  template<typename Gimpl>
-  static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
-    GridBase *grid = Umu.Grid();
-    conformable(grid,g.Grid());
-
-    typename Gimpl::GaugeLinkField U(grid);
-    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
-
-    for(int mu=0;mu<Nd;mu++){
-      U= PeekIndex<LorentzIndex>(Umu,mu);
-      U = g*U*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
-      PokeIndex<LorentzIndex>(Umu,U,mu);
-    }
-  }
-  template<typename Gimpl>
-  static void GaugeTransform( std::vector<typename Gimpl::GaugeLinkField> &U, typename Gimpl::GaugeLinkField &g){
-    GridBase *grid = g.Grid();
-    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
-    for(int mu=0;mu<Nd;mu++){
-      U[mu] = g*U[mu]*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
-    }
-  }
-  template<typename Gimpl>
-  static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
-    LieRandomize(pRNG,g,1.0);
-    GaugeTransform<Gimpl>(Umu,g);
-  }
-
-  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
-  // inverse operation: FundamentalLieAlgebraMatrix
-  static void projectOnAlgebra(LatticeAlgebraVector &h_out, const LatticeMatrix &in, Real scale = 1.0) {
-    conformable(h_out, in);
-    h_out = Zero();
-    Matrix Ta;
-
-    for (int a = 0; a < AdjointDimension; a++) {
-      generator(a, Ta);
-      pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
-    }
-  }
-
-  template <typename GaugeField>
-  static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) {
-    typedef typename GaugeField::vector_type vector_type;
-    typedef iSUnMatrix<vector_type> vMatrixType;
-    typedef Lattice<vMatrixType> LatticeMatrixType;
-
-    LatticeMatrixType Umu(out.Grid());
-    LatticeMatrixType tmp(out.Grid());
-    for (int mu = 0; mu < Nd; mu++) {
-      //      LieRandomize(pRNG, Umu, 1.0);
-      //      PokeIndex<LorentzIndex>(out, Umu, mu);
-      gaussian(pRNG,Umu);
-      tmp = Ta(Umu);
-      taExp(tmp,Umu);
-      ProjectSUn(Umu);
-      PokeIndex<LorentzIndex>(out, Umu, mu);
-    }
-  }
-  template<typename GaugeField>
-  static void TepidConfiguration(GridParallelRNG &pRNG,GaugeField &out){
-    typedef typename GaugeField::vector_type vector_type;
-    typedef iSUnMatrix<vector_type> vMatrixType;
-    typedef Lattice<vMatrixType> LatticeMatrixType;
-
-    LatticeMatrixType Umu(out.Grid());
-    for(int mu=0;mu<Nd;mu++){
-      LieRandomize(pRNG,Umu,0.01);
-      PokeIndex<LorentzIndex>(out,Umu,mu);
-    }
-  }
-  template<typename GaugeField>
-  static void ColdConfiguration(GaugeField &out){
-    typedef typename GaugeField::vector_type vector_type;
-    typedef iSUnMatrix<vector_type> vMatrixType;
-    typedef Lattice<vMatrixType> LatticeMatrixType;
-
-    LatticeMatrixType Umu(out.Grid());
-    Umu=1.0;
-    for(int mu=0;mu<Nd;mu++){
-      PokeIndex<LorentzIndex>(out,Umu,mu);
-    }
-  }
-  template<typename GaugeField>
-  static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){
-    ColdConfiguration(out);
-  }
-
-  template<typename LatticeMatrixType>
-  static void taProj( const LatticeMatrixType &in,  LatticeMatrixType &out){
-    out = Ta(in);
-  }
-  template <typename LatticeMatrixType>
-  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) {
-    typedef typename LatticeMatrixType::scalar_type ComplexType;
-
-    LatticeMatrixType xn(x.Grid());
-    RealD nfac = 1.0;
-
-    xn = x;
-    ex = xn + ComplexType(1.0);  // 1+x
-
-    // Do a 12th order exponentiation
-    for (int i = 2; i <= 12; ++i) {
-      nfac = nfac / RealD(i);  // 1/2, 1/2.3 ...
-      xn = xn * x;             // x2, x3,x4....
-      ex = ex + xn * nfac;     // x2/2!, x3/3!....
-    }
-  }
-};
-
-template<int N>
-Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
-{
-  GridBase *grid=Umu.Grid();
-  auto lvol = grid->lSites();
-  Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
-  
-  autoView(Umu_v,Umu,CpuRead);
-  autoView(ret_v,ret,CpuWrite);
-  thread_for(site,lvol,{
-    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
-    Coordinate lcoor;
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
-    iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
-    peekLocalSite(Us, Umu_v, lcoor);
-    for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-	EigenU(i,j) = Us()()(i,j);
-      }}
-    Eigen::MatrixXcd EigenUinv = EigenU.inverse();
-    for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-	Ui()()(i,j) = EigenUinv(i,j);
-      }}
-    pokeLocalSite(Ui,ret_v,lcoor);
-  });
-  return ret;
-}
-// Explicit specialisation for SU(3).
-// Explicit specialisation for SU(3).
-static void
-ProjectSU3 (Lattice<iScalar<iScalar<iMatrix<vComplexD, 3> > > > &Umu)
-{
-  GridBase *grid=Umu.Grid();
-  const int x=0;
-  const int y=1;
-  const int z=2;
-  // Reunitarise
-  Umu = ProjectOnGroup(Umu);
-  autoView(Umu_v,Umu,CpuWrite);
-  thread_for(ss,grid->oSites(),{
-      auto cm = Umu_v[ss];
-      cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy
-      cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz
-      cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx
-      Umu_v[ss]=cm;
-  });
-}
-static void ProjectSU3(Lattice<iVector<iScalar<iMatrix<vComplexD, 3> >,Nd> > &U)
-{
-  GridBase *grid=U.Grid();
-  // Reunitarise
-  for(int mu=0;mu<Nd;mu++){
-    auto Umu = PeekIndex<LorentzIndex>(U,mu);
-    Umu      = ProjectOnGroup(Umu);
-    ProjectSU3(Umu);
-    PokeIndex<LorentzIndex>(U,Umu,mu);
-  }
-}
-
-typedef SU<2> SU2;
-typedef SU<3> SU3;
-typedef SU<4> SU4;
-typedef SU<5> SU5;
-
-
-typedef SU<Nc> FundamentalMatrices;
-
-NAMESPACE_END(Grid);
-#endif
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -0,0 +1,580 @@
+// This file is #included into the body of the class template definition of
+// GaugeGroup. So, imagine there to be
+//
+// template <int ncolour, class group_name>
+// class GaugeGroup {
+//
+// around it.
+//
+// Please note that the unconventional file extension makes sure that it
+// doesn't get found by the scripts/filelist during bootstrapping.
+
+private:
+
+template <ONLY_IF_SU>  // SU(N) overload, selected through the GroupName::SU tag argument
+static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; }  // Nc(Nc-1)/2 row pairs i1 < i2
+////////////////////////////////////////////////////////////////////////
+// There are N^2-1 generators for SU(N).
+//
+// We take a traceless hermitian generator basis as follows
+//
+// * Normalisation: trace ta tb = 1/2 delta_ab = T_F delta_ab
+//   T_F = 1/2  for SU(N) groups
+//
+// * Off diagonal
+//    - pairs of rows i1,i2 behaving like pauli matrices sigma_x, sigma_y
+//
+//    - there are (Nc-1-i1) slots for i2 on each row [ x  0  x ]
+//      direct count off each row
+//
+//    - Sum of all pairs is Nc(Nc-1)/2: proof arithmetic series
+//
+//      (Nc-1) + (Nc-2)+...  1      ==> Nc*(Nc-1)/2
+//      1+ 2+          +   + Nc-1
+//
+//    - There are 2 x Nc (Nc-1)/ 2 of these = Nc^2 - Nc
+//
+//    - We enumerate the row-col pairs.
+//    - for each row col pair there is a (sigma_x) and a (sigma_y) like
+//    generator
+//
+//
+//   t^a_ij = { in 0.. Nc(Nc-1)/2 -1} =>  1/2(delta_{i,i1} delta_{j,i2} +
+//   delta_{i,i2} delta_{j,i1})
+//   t^a_ij = { in Nc(Nc-1)/2 ... Nc(Nc-1) - 1} =>  i/2( delta_{i,i1}
+//   delta_{j,i2} - delta_{i,i2} delta_{j,i1})
+//
+// * Diagonal; must be traceless and normalised
+//   - Sequence is
+//   N  (1,-1,0,0...)
+//   N  (1, 1,-2,0...)
+//   N  (1, 1, 1,-3,0...)
+//   N  (1, 1, 1, 1,-4,0...)
+//
+//   where 1/2 = N^2 (1+.. m^2)etc.... for the m-th diagonal generator
+//   NB this gives the famous SU3 result for su2 index 8
+//
+//   N= sqrt(1/2 . 1/6 ) = 1/2 . 1/sqrt(3)
+//
+//   ( 1      )
+//   (    1   ) / sqrt(3) /2  = 1/2 lambda_8
+//   (      -2)
+//
+////////////////////////////////////////////////////////////////////////
+template <class cplx, ONLY_IF_SU>
+static void generator(int lieIndex, iGroupMatrix<cplx> &ta, GroupName::SU) {
+  // Indices below Nc(Nc-1) address the off-diagonal generators, two per
+  // SU(2) row pair; indices from Nc(Nc-1) upwards are diagonal generators.
+  const int numOffDiagonal = ncolour * (ncolour - 1);
+  if (lieIndex >= numOffDiagonal) {
+    generatorDiagonal(lieIndex - numOffDiagonal, ta);
+    return;
+  }
+  // Consecutive indices share a row pair: lieIndex >> 1 numbers the pair and
+  // the parity of lieIndex chooses which of its two generators is built.
+  const int pairIndex = lieIndex >> 1;
+  const bool isOdd = (lieIndex & 0x1) != 0;
+  if (isOdd) {
+    generatorSigmaY(pairIndex, ta);
+  } else {
+    generatorSigmaX(pairIndex, ta);
+  }
+}
+
+template <class cplx, ONLY_IF_SU>
+static void generatorSigmaY(int su2Index, iGroupMatrix<cplx> &ta) {
+  int row, col;  // the two colour rows picked out by this SU(2) subgroup
+  su2SubGroupIndex(row, col, su2Index);
+  ta = Zero();
+  ta()()(row, col) = 1.0;  // NOTE(review): despite the name, this fills the
+  ta()()(col, row) = 1.0;  // real symmetric off-diagonal pattern
+  ta = ta * 0.5;           // both entries end up as 1/2
+}
+
+template <class cplx, ONLY_IF_SU>
+static void generatorSigmaX(int su2Index, iGroupMatrix<cplx> &ta) {
+  int row, col;  // the two colour rows picked out by this SU(2) subgroup
+  su2SubGroupIndex(row, col, su2Index);
+  const cplx imagUnit(0.0, 1.0);
+  ta = Zero();
+  ta()()(row, col) = imagUnit;   // NOTE(review): despite the name, this fills the
+  ta()()(col, row) = -imagUnit;  // imaginary antisymmetric off-diagonal pattern
+  ta = ta * 0.5;                 // entries end up as +i/2 and -i/2
+}
+
+template <class cplx, ONLY_IF_SU>
+static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
+  // ta = nrm * diag(1, ..., 1 (k times), -k, 0, ...) with k = diagIndex + 1
+  const int k = diagIndex + 1;  // diagIndex counts from 0
+  ta = Zero();
+  for (int row = 0; row < k; row++) {  // k leading ones
+    ta()()(row, row) = 1.0;
+  }
+  ta()()(k, k) = -k;  // 0-based slot k cancels the k ones, so the trace is zero
+  const RealD nrm = 1.0 / std::sqrt(2.0 * k * (k + 1));  // gives trace(ta * ta) = 1/2
+  ta = ta * nrm;
+}
+
+////////////////////////////////////////////////////////////////////////
+// Map a su2 subgroup number to the pair of rows that are non zero
+////////////////////////////////////////////////////////////////////////
+static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
+  assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
+  // Row i1 hosts (Nc-1-i1) pairs; peel off whole rows until the index fits.
+  int remaining = su2_index;
+  for (i1 = 0; remaining >= (ncolour - 1 - i1); i1++) {
+    remaining -= (ncolour - 1 - i1);
+  }
+  i2 = i1 + 1 + remaining;  // partner row, always greater than i1
+}
+
+public:
+//////////////////////////////////////////////////////////////////////////////////////////
+// Pull out a subgroup and project on to real coeffs x pauli basis
+//////////////////////////////////////////////////////////////////////////////////////////
+template <class vcplx, ONLY_IF_SU>
+static void su2Extract(Lattice<iSinglet<vcplx> > &Determinant,       // out: per-site determinant of the projected block
+                       Lattice<iSU2Matrix<vcplx> > &subgroup,        // out: projected 2x2 block
+                       const Lattice<iGroupMatrix<vcplx> > &source,  // in: full colour matrix field
+                       int su2_index) {                              // selects the row pair (i0, i1)
+  GridBase *grid(source.Grid());
+  conformable(subgroup, source);
+  conformable(subgroup, Determinant);
+  int i0, i1;
+  su2SubGroupIndex(i0, i1, su2_index);
+
+  autoView(subgroup_v, subgroup, AcceleratorWrite);
+  autoView(source_v, source, AcceleratorRead);
+  autoView(Determinant_v, Determinant, AcceleratorWrite);
+  accelerator_for(ss, grid->oSites(), 1, {  // per site: copy the (i0, i1) 2x2 block of source
+    subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
+    subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
+    subgroup_v[ss]()()(1, 0) = source_v[ss]()()(i1, i0);
+    subgroup_v[ss]()()(1, 1) = source_v[ss]()()(i1, i1);
+
+    iSU2Matrix<vcplx> Sigma = subgroup_v[ss];
+    // Replace the block by Sigma - adj(Sigma) + trace(adj(Sigma))
+    Sigma = Sigma - adj(Sigma) + trace(adj(Sigma));
+
+    subgroup_v[ss] = Sigma;
+
+    // this should be purely real
+    Determinant_v[ss] =
+        Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
+  });
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Set matrix to one and insert a pauli subgroup
+//////////////////////////////////////////////////////////////////////////////////////////
+template <class vcplx, ONLY_IF_SU>
+static void su2Insert(const Lattice<iSU2Matrix<vcplx> > &subgroup,  // in: 2x2 block to embed
+                      Lattice<iGroupMatrix<vcplx> > &dest, int su2_index) {  // out: dest is overwritten
+  GridBase *grid(dest.Grid());
+  conformable(subgroup, dest);
+  int i0, i1;
+  su2SubGroupIndex(i0, i1, su2_index);  // rows/columns that receive the block
+
+  dest = 1.0;  // start out with identity
+  autoView(dest_v, dest, AcceleratorWrite);
+  autoView(subgroup_v, subgroup, AcceleratorRead);
+  accelerator_for(ss, grid->oSites(), 1, {  // per site: overwrite the four (i0, i1) entries
+    dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
+    dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
+    dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
+    dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
+  });
+}
+
+///////////////////////////////////////////////
+// Generate e^{ Re Tr Staple Link} dlink
+//
+// *** Note Staple should be appropriate linear combination between all
+// staples.
+// *** If already by beta pass coefficient 1.0.
+// *** This routine applies the additional 1/Nc factor that comes after trace
+// in action.
+//
+///////////////////////////////////////////////
+template <ONLY_IF_SU>
+static void SubGroupHeatBath(
+    GridSerialRNG &sRNG, GridParallelRNG &pRNG,  // sRNG is not referenced in this body
+    RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
+    LatticeMatrix &link,  // in/out: replaced by V * link on accepted sites
+    const LatticeMatrix &barestaple,  // staple sum; scaled by beta / ncolour below
+    int su2_subgroup, int nheatbath, LatticeInteger &wheremask) {
+  GridBase *grid = link.Grid();
+  // nheatbath caps the accept/reject sweeps; wheremask selects the sites to update.
+  const RealD twopi = 2.0 * M_PI;
+
+  LatticeMatrix staple(grid);
+
+  staple = barestaple * (beta / ncolour);
+
+  LatticeMatrix V(grid);
+  V = link * staple;
+
+  // Subgroup manipulation in the lie algebra space
+  LatticeSU2Matrix u(
+      grid);  // Kennedy pendleton "u" real projected normalised Sigma
+  LatticeSU2Matrix uinv(grid);
+  LatticeSU2Matrix ua(grid);  // a in pauli form
+  LatticeSU2Matrix b(grid);   // rotated matrix after hb
+
+  // Some handy constant fields (ones, zeros, mask_true, mask_false are not read again below)
+  LatticeComplex ones(grid);
+  ones = 1.0;
+  LatticeComplex zeros(grid);
+  zeros = Zero();
+  LatticeReal rones(grid);
+  rones = 1.0;
+  LatticeReal rzeros(grid);
+  rzeros = Zero();
+  LatticeComplex udet(grid);  // determinant of real(staple)
+  LatticeInteger mask_true(grid);
+  mask_true = 1;
+  LatticeInteger mask_false(grid);
+  mask_false = 0;
+
+  /*
+    PLB 156 P393 (1985) (Kennedy and Pendleton)
+
+    Note: absorb "beta" into the def of sigma compared to KP paper; staple
+    passed to this routine has "beta" already multiplied in
+
+    Action linear in links h and of form:
+
+    beta S = beta  Sum_p (1 - 1/Nc Re Tr Plaq )
+
+    Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "
+
+    beta S = const - beta/Nc Re Tr h Sigma'
+    = const - Re Tr h Sigma
+
+    Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex
+    arbitrary.
+
+    Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j)  = h_i Sigma_j 2 delta_ij
+    Re Tr h Sigma = 2 h_j Re Sigma_j
+
+    Normalised re Sigma_j = xi u_j
+
+    With u_j a unit vector and U can be in SU(2);
+
+    Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)
+
+    4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+    u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+
+    xi = sqrt(Det)/2;
+
+    Write a= u h in SU(2); a has pauli decomp a_j;
+
+    Note: Product b' xi is invariant because scaling Sigma leaves
+    normalised vector "u" fixed; Can rescale Sigma so b' = 1.
+  */
+
+  ////////////////////////////////////////////////////////
+  // Real part of Pauli decomposition
+  // Note a subgroup can project to zero in cold start
+  ////////////////////////////////////////////////////////
+  su2Extract(udet, u, V, su2_subgroup);
+
+  //////////////////////////////////////////////////////
+  // Normalising this vector if possible; else identity
+  //////////////////////////////////////////////////////
+  LatticeComplex xi(grid);
+
+  LatticeSU2Matrix lident(grid);
+
+  SU2Matrix ident = Complex(1.0);
+  SU2Matrix pauli1;
+  GaugeGroup<2, GroupName::SU>::generator(0, pauli1);
+  SU2Matrix pauli2;
+  GaugeGroup<2, GroupName::SU>::generator(1, pauli2);
+  SU2Matrix pauli3;
+  GaugeGroup<2, GroupName::SU>::generator(2, pauli3);
+  pauli1 = timesI(pauli1) * 2.0;
+  pauli2 = timesI(pauli2) * 2.0;
+  pauli3 = timesI(pauli3) * 2.0;
+
+  LatticeComplex cone(grid);
+  LatticeReal adet(grid);
+  adet = abs(toReal(udet));
+  lident = Complex(1.0);
+  cone = Complex(1.0);
+  Real machine_epsilon = 1.0e-7;  // sites with |det| at or below this fall back to identity / unit determinant
+  u = where(adet > machine_epsilon, u, lident);
+  udet = where(adet > machine_epsilon, udet, cone);
+
+  xi = 0.5 * sqrt(udet);        // 4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+  u = 0.5 * u * pow(xi, -1.0);  //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+
+  // Debug test for sanity
+  uinv = adj(u);
+  b = u * uinv - 1.0;
+  assert(norm2(b) < 1.0e-4);
+
+  /*
+    Measure: Haar measure dh has d^4a delta(1-|a^2|)
+    In polars:
+    da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2)
+    = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^2) - r)(sqrt(1-a0^2) +
+    r) )
+    = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^2) - r) )
+
+    Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta
+    enters through xi =  e^{2 xi (h.u)} dh =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2
+    xi h2u2}.e^{2 xi h3u3} dh
+
+    Therefore for each site, take xi for that site
+    i) generate  |a0|<1 with dist
+    (1-a0^2)^0.5 e^{2 xi a0 } da0
+
+    Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm];
+    hence 2.0/Nc factor in Chroma ] A. Generate two uniformly distributed
+    pseudo-random numbers R and R', R'', R''' in the unit interval; B. Set X =
+    -(ln R)/alpha, X' =-(ln R')/alpha; C. Set C = cos^2(2pi R"), with R"
+    another uniform random number in [0,1] ; D. Set A = XC; E. Let d  = X'+A;
+    F. If R'''^2 :> 1 - 0.5 d,  go back to A;
+    G. Set a0 = 1 - d;
+
+    Note that in step D setting B ~ X - A and using B in place of A in step E
+    will generate a second independent a 0 value.
+  */
+
+  /////////////////////////////////////////////////////////
+  // count the number of sites by picking "1"'s out of hat
+  /////////////////////////////////////////////////////////
+  Integer hit = 0;
+  LatticeReal rtmp(grid);
+  rtmp = where(wheremask, rones, rzeros);
+  RealD numSites = sum(rtmp);
+  RealD numAccepted;
+  LatticeInteger Accepted(grid);
+  Accepted = Zero();
+  LatticeInteger newlyAccepted(grid);
+
+  std::vector<LatticeReal> xr(4, grid);
+  std::vector<LatticeReal> a(4, grid);
+  LatticeReal d(grid);
+  d = Zero();
+  LatticeReal alpha(grid);
+
+  //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
+  xi = 2.0 * xi;
+  alpha = toReal(xi);
+
+  do {
+    // A. Generate two uniformly distributed pseudo-random numbers R and R',
+    // R'', R''' in the unit interval;
+    random(pRNG, xr[0]);
+    random(pRNG, xr[1]);
+    random(pRNG, xr[2]);
+    random(pRNG, xr[3]);
+
+    // B. Set X = - ln R/alpha, X' = -ln R'/alpha
+    xr[1] = -log(xr[1]) / alpha;
+    xr[2] = -log(xr[2]) / alpha;
+
+    // C. Set C = cos^2(2piR'')
+    xr[3] = cos(xr[3] * twopi);
+    xr[3] = xr[3] * xr[3];
+
+    LatticeReal xrsq(grid);
+
+    // D. Set A = XC;
+    // E. Let d  = X'+A;
+    xrsq = xr[2] + xr[1] * xr[3];  // overwritten below before it is read; d repeats the expression
+
+    d = where(Accepted, d, xr[2] + xr[1] * xr[3]);  // keep d on sites accepted in an earlier sweep
+
+    // F. If R'''^2 :> 1 - 0.5 d,  go back to A;
+    LatticeReal thresh(grid);
+    thresh = 1.0 - d * 0.5;
+    xrsq = xr[0] * xr[0];
+    LatticeInteger ione(grid);
+    ione = 1;
+    LatticeInteger izero(grid);
+    izero = Zero();
+
+    newlyAccepted = where(xrsq < thresh, ione, izero);
+    Accepted = where(newlyAccepted, newlyAccepted, Accepted);
+    Accepted = where(wheremask, Accepted, izero);  // sites outside wheremask never count as accepted
+
+    // FIXME need an iSum for integer to avoid overload on return type??
+    rtmp = where(Accepted, rones, rzeros);
+    numAccepted = sum(rtmp);
+
+    hit++;
+
+  } while ((numAccepted < numSites) && (hit < nheatbath));
+
+  // G. Set a0 = 1 - d;
+  a[0] = Zero();
+  a[0] = where(wheremask, 1.0 - d, a[0]);
+
+  //////////////////////////////////////////
+  //    ii) generate a_i uniform on two sphere radius (1-a0^2)^0.5
+  //////////////////////////////////////////
+
+  LatticeReal a123mag(grid);
+  a123mag = sqrt(abs(1.0 - a[0] * a[0]));
+
+  LatticeReal cos_theta(grid);
+  LatticeReal sin_theta(grid);
+  LatticeReal phi(grid);
+
+  random(pRNG, phi);
+  phi = phi * twopi;  // uniform in [0,2pi]
+  random(pRNG, cos_theta);
+  cos_theta = (cos_theta * 2.0) - 1.0;  // uniform in [-1,1]
+  sin_theta = sqrt(abs(1.0 - cos_theta * cos_theta));
+
+  a[1] = a123mag * sin_theta * cos(phi);
+  a[2] = a123mag * sin_theta * sin(phi);
+  a[3] = a123mag * cos_theta;
+
+  ua = toComplex(a[0]) * ident + toComplex(a[1]) * pauli1 +
+       toComplex(a[2]) * pauli2 + toComplex(a[3]) * pauli3;
+
+  b = 1.0;
+  b = where(wheremask, uinv * ua, b);
+  su2Insert(b, V, su2_subgroup);
+
+  // mask the assignment back based on Acceptance
+  link = where(Accepted, V * link, link);
+
+  //////////////////////////////
+  // Debug Checks
+  // SU2 check
+  LatticeSU2Matrix check(grid);  // scratch field for the unitarity checks
+  u = Zero();                    // u is reused as the zero field for masking
+  check = ua * adj(ua) - 1.0;
+  check = where(Accepted, check, u);
+  assert(norm2(check) < 1.0e-4);
+
+  check = b * adj(b) - 1.0;
+  check = where(Accepted, check, u);
+  assert(norm2(check) < 1.0e-4);
+
+  LatticeMatrix Vcheck(grid);
+  Vcheck = Zero();
+  Vcheck = where(Accepted, V * adj(V) - 1.0, Vcheck);
+  //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
+  assert(norm2(Vcheck) < 1.0e-4);
+
+  // Verify the updated link is still unitary (link * adj(link) = 1)
+  //    std::cout<<GridLogMessage <<"Checking the modified link"<<std::endl;
+  Vcheck = link * adj(link) - 1.0;
+  assert(norm2(Vcheck) < 1.0e-4);
+  /////////////////////////////////
+}
+
+template <ONLY_IF_SU>
+static void testGenerators(GroupName::SU) {
+  // Self-test of the fundamental generators; every property is enforced by assert().
+  Matrix genA, genB;
+  std::cout << GridLogMessage
+            << "Fundamental - Checking trace ta tb is 0.5 delta_ab"
+            << std::endl;
+  for (int ia = 0; ia < AdjointDimension; ia++) {
+    for (int ib = 0; ib < AdjointDimension; ib++) {
+      generator(ia, genA);
+      generator(ib, genB);
+      const Complex overlap = TensorRemove(trace(genA * genB));
+      std::cout << GridLogMessage << "(" << ia << "," << ib << ") =  "
+                << overlap << std::endl;
+      if (ia == ib) assert(abs(overlap - Complex(0.5)) < 1.0e-6);
+      else assert(abs(overlap) < 1.0e-6);
+    }
+    std::cout << GridLogMessage << std::endl;
+  }
+  std::cout << GridLogMessage << "Fundamental - Checking if hermitian"
+            << std::endl;
+  for (int ia = 0; ia < AdjointDimension; ia++) {
+    generator(ia, genA);
+    std::cout << GridLogMessage << ia << std::endl;
+    assert(norm2(genA - adj(genA)) < 1.0e-6);
+  }
+  std::cout << GridLogMessage << std::endl;
+
+  std::cout << GridLogMessage << "Fundamental - Checking if traceless"
+            << std::endl;
+  for (int ia = 0; ia < AdjointDimension; ia++) {
+    generator(ia, genA);
+    const Complex diagSum = TensorRemove(trace(genA));
+    std::cout << GridLogMessage << ia << " " << std::endl;
+    assert(abs(diagSum) < 1.0e-6);
+  }
+  std::cout << GridLogMessage << std::endl;
+}
+
+
+template <int N, class vtype>
+static Lattice<iScalar<iScalar<iMatrix<vtype, N> > > >
+ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vtype, N> > > > &Umu, GroupName::SU) {
+  return ProjectOnGroup(Umu);  // SU tag overload for a lattice of colour matrices: forwards to ProjectOnGroup
+}
+
+template <class vtype>
+accelerator_inline static iScalar<vtype> ProjectOnGeneralGroup(const iScalar<vtype> &r, GroupName::SU) {
+  return ProjectOnGroup(r);  // SU tag overload for an iScalar tensor: forwards to ProjectOnGroup
+}
+
+template <class vtype, int N>
+accelerator_inline static iVector<vtype,N> ProjectOnGeneralGroup(const iVector<vtype,N> &r, GroupName::SU) {
+  return ProjectOnGroup(r);  // SU tag overload for an iVector tensor: forwards to ProjectOnGroup
+}
+
+template <class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
+accelerator_inline static iMatrix<vtype,N> ProjectOnGeneralGroup(const iMatrix<vtype,N> &arg, GroupName::SU) {
+  return ProjectOnGroup(arg);  // SU tag overload for an iMatrix whose element type has TensorLevel 0
+}
+
+template <typename LatticeMatrixType>
+static void taProj(const LatticeMatrixType &in, LatticeMatrixType &out, GroupName::SU) {
+  out = Ta(in);  // SU tag overload: out is overwritten with Ta(in)
+}
+
+/*
+ * Fundamental rep gauge xform
+ */
+template<typename Fundamental,typename GaugeMat>
+static void GaugeTransformFundamental( Fundamental &ferm, GaugeMat &g){
+  // Left-multiply ferm by g in place; grids are compared through the public Grid() accessor.
+  conformable(ferm.Grid(),g.Grid());
+  ferm = g*ferm;
+}
+/*
+ * Adjoint rep gauge xform
+ */
+
+template<typename Gimpl>
+static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
+  GridBase *lattice = Umu.Grid();
+  conformable(lattice, g.Grid());
+  // Each direction becomes g * U_mu * CshiftLink(adj(g), mu, 1); Gimpl does the shift (BC-aware).
+  typename Gimpl::GaugeLinkField gDagger(lattice);
+  gDagger = adj(g);
+  typename Gimpl::GaugeLinkField link(lattice);
+  for (int dir = 0; dir < Nd; dir++) {
+    link = PeekIndex<LorentzIndex>(Umu, dir);
+    link = g * link * Gimpl::CshiftLink(gDagger, dir, 1);
+    PokeIndex<LorentzIndex>(Umu, link, dir);
+  }
+}
+template<typename Gimpl>
+static void GaugeTransform( std::vector<typename Gimpl::GaugeLinkField> &U, typename Gimpl::GaugeLinkField &g){
+  typename Gimpl::GaugeLinkField gDagger(g.Grid());
+  gDagger = adj(g);
+  for (int dir = 0; dir < Nd; dir++) {
+    U[dir] = g * U[dir] * Gimpl::CshiftLink(gDagger, dir, 1);  // in place per direction; shift is BC-aware
+  }
+}
+template<typename Gimpl>
+static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
+  LieRandomize(pRNG,g,1.0);      // fill g through LieRandomize with scale 1.0
+  GaugeTransform<Gimpl>(Umu,g);  // then transform Umu in place with that g
+}
+
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@@ -51,6 +51,10 @@ public:
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;

+
+  template <typename vtype>
+  using iSUnMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
+
  typedef Lattice<iScalar<iScalar<iVector<vComplex, Dimension> > > >  LatticeAdjVector;

  template <class cplx>
@@ -58,8 +62,8 @@ public:
    // returns i(T_Adj)^index necessary for the projectors
    // see definitions above
    iAdjTa = Zero();
-    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
-    typename SU<ncolour>::template iSUnMatrix<cplx> tmp;
+    Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
+    iSUnMatrix<cplx> tmp;

    // FIXME not very efficient to get all the generators everytime
    for (int a = 0; a < Dimension; a++) SU<ncolour>::generator(a, ta[a]);
@@ -67,8 +71,7 @@ public:
    for (int a = 0; a < Dimension; a++) {
      tmp = ta[a] * ta[Index] - ta[Index] * ta[a];
      for (int b = 0; b < (ncolour * ncolour - 1); b++) {
-        typename SU<ncolour>::template iSUnMatrix<cplx> tmp1 =
-	  2.0 * tmp * ta[b];  // 2.0 from the normalization
+        iSUnMatrix<cplx> tmp1 = 2.0 * tmp * ta[b];  // 2.0 from the normalization
        Complex iTr = TensorRemove(timesI(trace(tmp1)));
        //iAdjTa()()(b, a) = iTr;
        iAdjTa()()(a, b) = iTr;
@@ -134,8 +137,7 @@ public:

    for (int a = 0; a < Dimension; a++) {
      generator(a, iTa);
-      LatticeComplex tmp = real(trace(iTa * in)) * coefficient;
-      pokeColour(h_out, tmp, a);
+      pokeColour(h_out, real(trace(iTa * in)) * coefficient, a);
    }
  }

--- a/Grid/qcd/utils/SUnTwoIndex.h
+++ b/Grid/qcd/utils/SUnTwoIndex.h
@@ -1,273 +0,0 @@
-////////////////////////////////////////////////////////////////////////
-//
-// * Two index representation generators
-//
-// * Normalisation for the fundamental generators:
-//   trace ta tb = 1/2 delta_ab = T_F delta_ab
-//   T_F = 1/2  for SU(N) groups
-//
-//
-//   base for NxN two index (anti-symmetric) matrices
-//   normalized to 1 (d_ij is the kroenecker delta)
-//
-//   (e^(ij)_{kl} = 1 / sqrt(2) (d_ik d_jl +/- d_jk d_il)
-//
-//   Then the generators are written as
-//
-//   (iT_a)^(ij)(lk) = i * ( tr[e^(ij)^dag e^(lk) T^trasp_a] +
-//   tr[e^(lk)e^(ij)^dag T_a] )  //
-//   
-//
-////////////////////////////////////////////////////////////////////////
-
-// Authors: David Preti, Guido Cossu
-
-#ifndef QCD_UTIL_SUN2INDEX_H
-#define QCD_UTIL_SUN2INDEX_H
-
-
-NAMESPACE_BEGIN(Grid);
-
-enum TwoIndexSymmetry { Symmetric = 1, AntiSymmetric = -1 };
-
-inline Real delta(int a, int b) { return (a == b) ? 1.0 : 0.0; }
-
-template <int ncolour, TwoIndexSymmetry S>
-class SU_TwoIndex : public SU<ncolour> {
-public:
-  static const int Dimension = ncolour * (ncolour + S) / 2;
-  static const int NumGenerators = SU<ncolour>::AdjointDimension;
-
-  template <typename vtype>
-  using iSUnTwoIndexMatrix = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
-
-  typedef iSUnTwoIndexMatrix<Complex> TIMatrix;
-  typedef iSUnTwoIndexMatrix<ComplexF> TIMatrixF;
-  typedef iSUnTwoIndexMatrix<ComplexD> TIMatrixD;
-
-  typedef iSUnTwoIndexMatrix<vComplex> vTIMatrix;
-  typedef iSUnTwoIndexMatrix<vComplexF> vTIMatrixF;
-  typedef iSUnTwoIndexMatrix<vComplexD> vTIMatrixD;
-
-  typedef Lattice<vTIMatrix> LatticeTwoIndexMatrix;
-  typedef Lattice<vTIMatrixF> LatticeTwoIndexMatrixF;
-  typedef Lattice<vTIMatrixD> LatticeTwoIndexMatrixD;
-
-  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
-  LatticeTwoIndexField;
-  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
-  LatticeTwoIndexFieldF;
-  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
-  LatticeTwoIndexFieldD;
-
-  template <typename vtype>
-  using iSUnMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
-
-  typedef iSUnMatrix<Complex> Matrix;
-  typedef iSUnMatrix<ComplexF> MatrixF;
-  typedef iSUnMatrix<ComplexD> MatrixD;
-
-  template <class cplx>
-  static void base(int Index, iSUnMatrix<cplx> &eij) {
-    // returns (e)^(ij)_{kl} necessary for change of base U_F -> U_R
-    assert(Index < NumGenerators);
-    eij = Zero();
-
-    // for the linearisation of the 2 indexes 
-    static int a[ncolour * (ncolour - 1) / 2][2]; // store the a <-> i,j
-    static bool filled = false;
-    if (!filled) {
-      int counter = 0;
-      for (int i = 1; i < ncolour; i++) {
-        for (int j = 0; j < i; j++) {
-          a[counter][0] = i;
-          a[counter][1] = j;
-          counter++;
-        }
-      }
-      filled = true;
-    }
-
-    if (Index < ncolour * (ncolour - 1) / 2) {
-      baseOffDiagonal(a[Index][0], a[Index][1], eij);
-    } else {
-      baseDiagonal(Index, eij);
-    }
-  }
-
-  template <class cplx>
-  static void baseDiagonal(int Index, iSUnMatrix<cplx> &eij) {
-    eij = Zero();
-    eij()()(Index - ncolour * (ncolour - 1) / 2,
-            Index - ncolour * (ncolour - 1) / 2) = 1.0;
-  }
-
-  template <class cplx>
-  static void baseOffDiagonal(int i, int j, iSUnMatrix<cplx> &eij) {
-    eij = Zero();
-    for (int k = 0; k < ncolour; k++)
-      for (int l = 0; l < ncolour; l++)
-        eij()()(l, k) = delta(i, k) * delta(j, l) +
-	  S * delta(j, k) * delta(i, l);
-
-    RealD nrm = 1. / std::sqrt(2.0);
-    eij = eij * nrm;
-  }
-
-  static void printBase(void) {
-    for (int gen = 0; gen < Dimension; gen++) {
-      Matrix tmp;
-      base(gen, tmp);
-      std::cout << GridLogMessage << "Nc = " << ncolour << " t_" << gen
-                << std::endl;
-      std::cout << GridLogMessage << tmp << std::endl;
-    }
-  }
-
-  template <class cplx>
-  static void generator(int Index, iSUnTwoIndexMatrix<cplx> &i2indTa) {
-    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > ta(
-								ncolour * ncolour - 1);
-    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > eij(Dimension);
-    typename SU<ncolour>::template iSUnMatrix<cplx> tmp;
-    i2indTa = Zero();
-    
-    for (int a = 0; a < ncolour * ncolour - 1; a++)
-      SU<ncolour>::generator(a, ta[a]);
-    
-    for (int a = 0; a < Dimension; a++) base(a, eij[a]);
-
-    for (int a = 0; a < Dimension; a++) {
-      tmp = transpose(ta[Index]) * adj(eij[a]) + adj(eij[a]) * ta[Index];
-      for (int b = 0; b < Dimension; b++) {
-        typename SU<ncolour>::template iSUnMatrix<cplx> tmp1 =
-	  tmp * eij[b]; 
-        Complex iTr = TensorRemove(timesI(trace(tmp1)));
-        i2indTa()()(a, b) = iTr;
-      }
-    }
-  }
-
-  static void printGenerators(void) {
-    for (int gen = 0; gen < ncolour * ncolour - 1; gen++) {
-      TIMatrix i2indTa;
-      generator(gen, i2indTa);
-      std::cout << GridLogMessage << "Nc = " << ncolour << " t_" << gen
-                << std::endl;
-      std::cout << GridLogMessage << i2indTa << std::endl;
-    }
-  }
-
-  static void testGenerators(void) {
-    TIMatrix i2indTa, i2indTb;
-    std::cout << GridLogMessage << "2IndexRep - Checking if traceless"
-              << std::endl;
-    for (int a = 0; a < ncolour * ncolour - 1; a++) {
-      generator(a, i2indTa);
-      std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(trace(i2indTa)) < 1.0e-6);
-    }
-    std::cout << GridLogMessage << std::endl;
-
-    std::cout << GridLogMessage << "2IndexRep - Checking if antihermitean"
-              << std::endl;
-    for (int a = 0; a < ncolour * ncolour - 1; a++) {
-      generator(a, i2indTa);
-      std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(adj(i2indTa) + i2indTa) < 1.0e-6);
-    }
-
-    std::cout << GridLogMessage << std::endl;
-    std::cout << GridLogMessage
-              << "2IndexRep - Checking Tr[Ta*Tb]=delta(a,b)*(N +- 2)/2"
-              << std::endl;
-    for (int a = 0; a < ncolour * ncolour - 1; a++) {
-      for (int b = 0; b < ncolour * ncolour - 1; b++) {
-        generator(a, i2indTa);
-        generator(b, i2indTb);
-
-        // generator returns iTa, so we need a minus sign here
-        Complex Tr = -TensorRemove(trace(i2indTa * i2indTb));
-        std::cout << GridLogMessage << "a=" << a << "b=" << b << "Tr=" << Tr
-                  << std::endl;
-      }
-    }
-    std::cout << GridLogMessage << std::endl;
-  }
-
-  static void TwoIndexLieAlgebraMatrix(
-				       const typename SU<ncolour>::LatticeAlgebraVector &h,
-				       LatticeTwoIndexMatrix &out, Real scale = 1.0) {
-    conformable(h, out);
-    GridBase *grid = out.Grid();
-    LatticeTwoIndexMatrix la(grid);
-    TIMatrix i2indTa;
-
-    out = Zero();
-    for (int a = 0; a < ncolour * ncolour - 1; a++) {
-      generator(a, i2indTa);
-      la = peekColour(h, a) * i2indTa;
-      out += la;
-    }
-    out *= scale;
-  }
-
-  // Projects the algebra components 
-  // of a lattice matrix ( of dimension ncol*ncol -1 )
-  static void projectOnAlgebra(
-			       typename SU<ncolour>::LatticeAlgebraVector &h_out,
-			       const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
-    conformable(h_out, in);
-    h_out = Zero();
-    TIMatrix i2indTa;
-    Real coefficient = -2.0 / (ncolour + 2 * S) * scale;
-    // 2/(Nc +/- 2) for the normalization of the trace in the two index rep
-    for (int a = 0; a < ncolour * ncolour - 1; a++) {
-      generator(a, i2indTa);
-      auto tmp = real(trace(i2indTa * in)) * coefficient;
-      pokeColour(h_out, tmp, a);
-    }
-  }
-
-  // a projector that keeps the generators stored to avoid the overhead of
-  // recomputing them
-  static void projector(typename SU<ncolour>::LatticeAlgebraVector &h_out,
-                        const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
-    conformable(h_out, in);
-    // to store the generators
-    static std::vector<TIMatrix> i2indTa(ncolour * ncolour -1); 
-    h_out = Zero();
-    static bool precalculated = false;
-    if (!precalculated) {
-      precalculated = true;
-      for (int a = 0; a < ncolour * ncolour - 1; a++) generator(a, i2indTa[a]);
-    }
-
-    Real coefficient =
-      -2.0 / (ncolour + 2 * S) * scale;  // 2/(Nc +/- 2) for the normalization
-    // of the trace in the two index rep
-
-    for (int a = 0; a < ncolour * ncolour - 1; a++) {
-      auto tmp = real(trace(i2indTa[a] * in)) * coefficient;
-      pokeColour(h_out, tmp, a);
-    }
-  }
-};
-
-// Some useful type names
-typedef SU_TwoIndex<Nc, Symmetric> TwoIndexSymmMatrices;
-typedef SU_TwoIndex<Nc, AntiSymmetric> TwoIndexAntiSymmMatrices;
-
-typedef SU_TwoIndex<2, Symmetric> SU2TwoIndexSymm;
-typedef SU_TwoIndex<3, Symmetric> SU3TwoIndexSymm;
-typedef SU_TwoIndex<4, Symmetric> SU4TwoIndexSymm;
-typedef SU_TwoIndex<5, Symmetric> SU5TwoIndexSymm;
-
-typedef SU_TwoIndex<2, AntiSymmetric> SU2TwoIndexAntiSymm;
-typedef SU_TwoIndex<3, AntiSymmetric> SU3TwoIndexAntiSymm;
-typedef SU_TwoIndex<4, AntiSymmetric> SU4TwoIndexAntiSymm;
-typedef SU_TwoIndex<5, AntiSymmetric> SU5TwoIndexAntiSymm;
-
-NAMESPACE_END(Grid);
-
-#endif
--- a/Grid/qcd/utils/Sp2n.impl.h
+++ b/Grid/qcd/utils/Sp2n.impl.h
@@ -0,0 +1,317 @@
+// This file is #included into the body of the class template definition of
+// GaugeGroup. So, imagine there to be
+//
+// template <int ncolour, class group_name>
+// class GaugeGroup {
+//
+// around it.
+//
+// Please note that the unconventional file extension makes sure that it
+// doesn't get found by the scripts/filelist during bootstrapping.
+
+private:
+template <ONLY_IF_Sp>
+static int su2subgroups(GroupName::Sp) { const int nsp = ncolour/2; return (nsp * (nsp - 1)) / 2; }  // nsp(nsp-1)/2
+
+// Sp(2N) has N(2N+1) = 2N^2+N generators
+//
+// normalise the generators such that
+// Trace ( Ta Tb) = 1/2 delta_ab
+//
+// N generators in the cartan, 2N^2 off
+// off diagonal:
+//     there are 6 types named a,b,c,d and w,z
+//     abcd are N(N-1)/2 each while wz are N each
+
+template <class cplx, ONLY_IF_Sp>
+static void generator(int lieIndex, iGroupMatrix<cplx> &ta, GroupName::Sp) {
+  // Fill ta with the generator labelled by lieIndex.
+  //
+  // The lie index is mapped onto a type of generator (diagonal, abcd type,
+  // wz type) and onto an index counted from the start of that type's range:
+  //   [0, mod) a        | [mod, 2mod) b      | [2mod, 3mod) c | [3mod, wmod) d
+  //   [wmod, zmod) w    | [zmod, offdiag) z  | [offdiag, ...) diagonal
+  //
+  // The ranges are tested in the order diagonal, w, z, a, b, c, d; they do not
+  // overlap, so the order does not affect which helper is called.
+
+  const int nsp = ncolour/2;
+  // a,b,c,d are N(N-1)/2 each; N(N-1) is even, so the integer division is exact
+  const int mod = (nsp * (nsp - 1)) / 2;
+  const int offdiag = 2 * nsp * nsp;  // number of generators not in the cartan subalgebra
+  const int wmod = 4 * mod;           // first w-type index
+  const int zmod = wmod + nsp;        // first z-type index; w,z are N each
+
+  if (lieIndex >= offdiag) {  // diagonal type
+    const int diagIndex = lieIndex - offdiag;  // 0, ... ,N-1
+    generatorDiagtype(diagIndex, ta);
+    return;
+  }
+
+  if ((lieIndex >= wmod) && (lieIndex < zmod)) {  // w type
+    const int wIndex = lieIndex - wmod;  // 0, ... ,N-1
+    generatorWtype(wIndex, ta);
+    return;
+  }
+
+  if ((lieIndex >= zmod) && (lieIndex < offdiag)) {  // z type
+    const int zIndex = lieIndex - zmod;  // 0, ... ,N-1
+    generatorZtype(zIndex, ta);
+    return;
+  }
+
+  if (lieIndex < mod) {  // a type: 0, ... , mod-1
+    const int aIndex = lieIndex;
+    generatorAtype(aIndex, ta);
+    return;
+  }
+
+  if ((lieIndex >= mod) && (lieIndex < 2 * mod)) {  // b type: mod, ... , 2mod-1
+    const int bIndex = lieIndex - mod;
+    generatorBtype(bIndex, ta);
+    return;
+  }
+
+  if ((lieIndex >= 2 * mod) && (lieIndex < 3 * mod)) {  // c type: 2mod, ... , 3mod-1
+    const int cIndex = lieIndex - 2 * mod;
+    generatorCtype(cIndex, ta);
+    return;
+  }
+
+  if ((lieIndex >= 3 * mod) && (lieIndex < wmod)) {  // d type: 3mod, ... , 4mod-1 = wmod-1
+    const int dIndex = lieIndex - 3 * mod;
+    generatorDtype(dIndex, ta);
+    return;
+  }
+
+}  // end of generator
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorDiagtype(int diagIndex, iGroupMatrix<cplx> &ta) {
+  // Cartan generator: ta(i,i) = 1/2 and ta(i+N,i+N) = -1/2 for i = diagIndex, zero elsewhere
+
+  const int nsp = ncolour/2;
+  const RealD half = 1.0 / 2;
+  ta = Zero();
+
+  ta()()(diagIndex, diagIndex) = half;
+  ta()()(diagIndex + nsp, diagIndex + nsp) = -half;
+}
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorAtype(int aIndex, iGroupMatrix<cplx> &ta) {
+  // Real symmetric pair in the upper-left block, mirrored with opposite sign
+  // in the lower-right block:
+  //   ta(i,j) = ta(j,i) = -ta(i+N,j+N) = -ta(j+N,i+N) = 1 / 2 sqrt(2)
+  // where (i,j) is the index pair su2SubGroupIndex returns for aIndex
+  const int nsp = ncolour/2;
+  const RealD scale = 1 / (2 * std::sqrt(2));
+  int row, col;
+
+  ta = Zero();
+  su2SubGroupIndex(row, col, aIndex);
+  ta()()(row, col) = 1;
+  ta()()(col, row) = 1;
+  ta()()(row + nsp, col + nsp) = -1;
+  ta()()(col + nsp, row + nsp) = -1;
+  ta = ta * scale;
+}
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorBtype(int bIndex, iGroupMatrix<cplx> &ta) {
+  // Imaginary antisymmetric pair, repeated with the same sign in both diagonal blocks:
+  //   ta(i,j) = -ta(j,i) = ta(i+N,j+N) = -ta(j+N,i+N) = i / 2 sqrt(2)
+  // where (i,j) is the index pair su2SubGroupIndex returns for bIndex
+
+  const int nsp = ncolour/2;
+  cplx imag_unit(0.0, 1.0);
+  const RealD scale = 1 / (2 * std::sqrt(2));
+  int row, col;
+
+  ta = Zero();
+  su2SubGroupIndex(row, col, bIndex);
+  ta()()(row, col) = imag_unit;
+  ta()()(col, row) = -imag_unit;
+  ta()()(row + nsp, col + nsp) = imag_unit;
+  ta()()(col + nsp, row + nsp) = -imag_unit;
+
+  ta = ta * scale;
+}
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorCtype(int cIndex, iGroupMatrix<cplx> &ta) {
+  // Real entries in the two off-diagonal blocks:
+  //   ta(i,j+N) = ta(j,i+N) = ta(i+N,j) = ta(j+N,i) = 1 / 2 sqrt(2)
+  const int nsp = ncolour/2;
+  const RealD scale = 1 / (2 * std::sqrt(2));
+  int row, col;
+
+  ta = Zero();
+  su2SubGroupIndex(row, col, cIndex);
+  ta()()(row, col + nsp) = 1;
+  ta()()(col, row + nsp) = 1;
+  ta()()(row + nsp, col) = 1;
+  ta()()(col + nsp, row) = 1;
+
+  ta = ta * scale;
+}
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorDtype(int dIndex, iGroupMatrix<cplx> &ta) {
+  // Imaginary entries in the off-diagonal blocks, with opposite sign between the two:
+  //   ta(i,j+N) = ta(j,i+N) = -ta(i+N,j) = -ta(j+N,i) = i /  2 sqrt(2)
+  const int nsp = ncolour/2;
+  cplx imag_unit(0.0, 1.0);
+  const RealD scale = 1 / (2 * std::sqrt(2));
+  int row, col;
+
+  ta = Zero();
+  su2SubGroupIndex(row, col, dIndex);
+  ta()()(row, col + nsp) = imag_unit;
+  ta()()(col, row + nsp) = imag_unit;
+  ta()()(row + nsp, col) = -imag_unit;
+  ta()()(col + nsp, row) = -imag_unit;
+
+  ta = ta * scale;
+}
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorWtype(int wIndex, iGroupMatrix<cplx> &ta) {
+  // One real symmetric pair linking index wIndex to its partner wIndex+N:
+  //   ta(i,i+N) =  ta(i+N,i) = 1/2
+  const int nsp = ncolour/2;
+  const int partner = wIndex + nsp;
+  const RealD scale = 1.0 / 2;
+
+  ta = Zero();
+  ta()()(wIndex, partner) = 1;
+  ta()()(partner, wIndex) = 1;
+  ta = ta * scale;
+}
+
+template <class cplx, ONLY_IF_Sp>
+static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
+  // One imaginary antisymmetric pair linking index zIndex to its partner zIndex+N:
+  //   ta(i,i+N) = - ta(i+N,i) = i/2
+  const int nsp = ncolour/2;
+  const int partner = zIndex + nsp;
+  const RealD scale = 1.0 / 2;
+  cplx imag_unit(0.0, 1.0);
+
+  ta = Zero();
+  ta()()(zIndex, partner) = imag_unit;
+  ta()()(partner, zIndex) = -imag_unit;
+  ta = ta * scale;
+}
+
+////////////////////////////////////////////////////////////////////////
+// Map a su2 subgroup number to the pair of rows that are non zero
+////////////////////////////////////////////////////////////////////////
+template <ONLY_IF_Sp>
+static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
+  const int nsp = ncolour/2;
+  assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));
+  // Row i1 contributes (nsp-1-i1) pairs (i1,i2) with i2>i1; peel off whole rows first
+  int remaining = su2_index;
+  i1 = 0;
+  while (remaining >= (nsp - 1 - i1)) { remaining -= (nsp - 1 - i1); i1++; }
+  // what is left over is the offset of i2 beyond i1+1 within row i1
+  i2 = i1 + 1 + remaining;
+}
+
+static void testGenerators(GroupName::Sp) {  // prints each check and asserts on it (tolerance 1.0e-6)
+  Matrix ta;
+  Matrix tb;
+  std::cout << GridLogMessage
+            << "Fundamental - Checking trace ta tb is 0.5 delta_ab "
+            << std::endl;
+  for (int a = 0; a < AlgebraDimension; a++) {
+    generator(a, ta);  // depends on a only, so it is built once per outer iteration
+    for (int b = 0; b < AlgebraDimension; b++) {
+      generator(b, tb);
+      Complex tr = TensorRemove(trace(ta * tb));
+      std::cout << GridLogMessage << "(" << a << "," << b << ") =  " << tr
+                << std::endl;
+      if (a == b) assert(abs(tr - Complex(0.5)) < 1.0e-6);  // normalisation
+      if (a != b) assert(abs(tr) < 1.0e-6);                 // orthogonality
+    }
+  }
+  std::cout << GridLogMessage << std::endl;
+  std::cout << GridLogMessage << "Fundamental - Checking if hermitian"
+            << std::endl;
+  for (int a = 0; a < AlgebraDimension; a++) {
+    generator(a, ta);
+    std::cout << GridLogMessage << a << std::endl;
+    assert(norm2(ta - adj(ta)) < 1.0e-6);  // ta equals its own adjoint
+  }
+  std::cout << GridLogMessage << std::endl;
+  std::cout << GridLogMessage << "Fundamental - Checking if traceless"
+            << std::endl;
+  for (int a = 0; a < AlgebraDimension; a++) {
+    generator(a, ta);
+    Complex tr = TensorRemove(trace(ta));
+    std::cout << GridLogMessage << a << std::endl;
+    assert(abs(tr) < 1.0e-6);
+  }
+}
+
+template <int N>
+static Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > >
+ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu, GroupName::Sp) {
+  return ProjectOnSpGroup(Umu);  // Sp-tagged overload for a lattice of NxN matrices: forwards to ProjectOnSpGroup
+}
+// Sp-tagged overload for an iScalar tensor level: forwards to ProjectOnSpGroup
+template <class vtype>
+accelerator_inline static iScalar<vtype> ProjectOnGeneralGroup(const iScalar<vtype> &r, GroupName::Sp) {
+  return ProjectOnSpGroup(r);
+}
+// Sp-tagged overload for an iVector tensor level: forwards to ProjectOnSpGroup
+template <class vtype, int N>
+accelerator_inline static iVector<vtype,N> ProjectOnGeneralGroup(const iVector<vtype,N> &r, GroupName::Sp) {
+  return ProjectOnSpGroup(r);
+}
+// Sp-tagged overload for an iMatrix whose element type has TensorLevel 0: forwards to ProjectOnSpGroup
+template <class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
+accelerator_inline static iMatrix<vtype,N> ProjectOnGeneralGroup(const iMatrix<vtype,N> &arg, GroupName::Sp) {
+  return ProjectOnSpGroup(arg);
+}
+
+template <typename LatticeMatrixType>   
+static void taProj(const LatticeMatrixType &in, LatticeMatrixType &out, GroupName::Sp) {
+  out = SpTa(in);  // Sp-tagged overload: out is overwritten with SpTa applied to in
+}
+
+public:
+
+template <ONLY_IF_Sp>
+static void Omega(LatticeColourMatrixD &in) {
+  const int nsp=ncolour/2;
+  LatticeColourMatrixD OmegaLatt(in.Grid());
+  LatticeColourMatrixD identity(in.Grid());
+  ColourMatrix Omega;  // single-site matrix, filled in below
+  // Overwrites in with a lattice field built from the constant matrix Omega
+  OmegaLatt = Zero();
+  Omega = Zero();
+  identity = 1.;
+  // Omega gets +1 at (i, nsp+i) and -1 at (nsp+i, i) for i = 0 .. nsp-1
+  for (int i = 0; i < nsp; i++) {
+    Omega()()(i, nsp + i) = 1.;
+    Omega()()(nsp + i, i) = -1;
+  }
+  OmegaLatt = OmegaLatt + (identity * Omega);  // OmegaLatt was zeroed above, so this adds identity * Omega to zero
+  in = OmegaLatt;
+}
+
+template <ONLY_IF_Sp, class vtype, int N>
+static void Omega(iScalar<iScalar<iMatrix<vtype, N> > > &in) {
+  // Overwrite in with the matrix carrying +1 at (i, i+nsp) and -1 at (i+nsp, i)
+  // for i = 0 .. nsp-1 (nsp = ncolour/2) and zero everywhere else
+  const int half = ncolour/2;
+
+  iScalar<iScalar<iMatrix<vtype, N> > > form;
+  form = Zero();
+  for (int row = 0; row < half; row++) {
+    form()()(row, half + row) = 1.;
+    form()()(half + row, row) = -1;
+  }
+  in = form;
+}
--- a/Grid/qcd/utils/Utils.h
+++ b/Grid/qcd/utils/Utils.h
@@ -8,9 +8,9 @@
 #include <Grid/qcd/utils/ScalarObjs.h>

 // Include representations
-#include <Grid/qcd/utils/SUn.h>
+#include <Grid/qcd/utils/GaugeGroup.h>
 #include <Grid/qcd/utils/SUnAdjoint.h>
-#include <Grid/qcd/utils/SUnTwoIndex.h>
+#include <Grid/qcd/utils/GaugeGroupTwoIndex.h>

 // All-to-all contraction kernels that touch the 
 // internal lattice structure
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -290,7 +290,7 @@ public:
  }
 */
  //////////////////////////////////////////////////
-  // the sum over all staples on each site
+  // the sum over all nu-oriented staples for nu != mu on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

@@ -300,6 +300,10 @@ public:
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
+    Staple(staple, U, mu);
+  }
+
+  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
    staple = Zero();

    for (int nu = 0; nu < Nd; nu++) {
@@ -335,6 +339,203 @@ public:
    }
  }

+  /////////////
+  //Staples for each direction mu, summed over nu != mu
+  //staple: output staples for each mu (Nd)
+  //U: link array (Nd)
+  /////////////
+  static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
+    assert(staple.size() == Nd); assert(U.size() == Nd);
+    for(int dir=0;dir<Nd;dir++){ Staple(staple[dir], U, dir); } //one Staple call per output direction
+  }
+
+
+  //A workspace class allowing reuse of the stencil
+  class WilsonLoopPaddedStencilWorkspace{
+    std::unique_ptr<GeneralLocalStencil> stencil; //built lazily by getStencil
+    size_t nshift = 0; //number of shifts in the current stencil; 0 until one has been generated
+
+    //(Re)build the stencil on padded_grid from the shifts supplied by the derived class
+    void generateStencil(GridBase* padded_grid){
+      double t0 = usecond();
+      //Generate shift arrays
+      std::vector<Coordinate> shifts = this->getShifts();
+      nshift = shifts.size();
+      
+      double t1 = usecond();
+      //Generate local stencil
+      stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
+      double t2 = usecond();
+      std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;   
+    }
+  public:
+    //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
+    const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
+      assert(pcell.depth >= this->paddingDepth()); //the cell must be padded at least as deep as this stencil reaches
+      if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
+      return *stencil;
+    }
+    size_t Nshift() const{ return nshift; } //size of the shift list used for the last generated stencil
+    
+    virtual std::vector<Coordinate> getShifts() const = 0; //shift list defining the stencil points
+    virtual int paddingDepth() const = 0; //padding depth required
+    
+    virtual ~WilsonLoopPaddedStencilWorkspace(){}
+  };
+
+  //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces
+  class WilsonLoopPaddedWorkspace{
+    std::vector<WilsonLoopPaddedStencilWorkspace*> stencil_wk; //owned: deleted in the destructor
+    std::unique_ptr<PaddedCell> pcell; //built on first use, rebuilt when the unpadded grid changes
+
+    void generatePcell(GridBase* unpadded_grid){ //padding depth is the largest any registered stencil requires
+      assert(stencil_wk.size());
+      int max_depth = 0;
+      for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
+      GridCartesian* cart_grid = dynamic_cast<GridCartesian*>(unpadded_grid);
+      assert(cart_grid != nullptr); //the cast yields null for any other grid type; stop here instead of handing null to PaddedCell
+      pcell.reset(new PaddedCell(max_depth, cart_grid));
+    }
+  public:
+    //Add a stencil definition. This should be done before the first call to retrieve a stencil object.
+    //Takes ownership of the pointer
+    void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
+      assert(!pcell);
+      stencil_wk.push_back(stencil);
+    }
+    const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){
+      assert(stencil_idx < stencil_wk.size()); //stencil_idx counts stencils in the order they were added
+      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
+      return stencil_wk[stencil_idx]->getStencil(*pcell);
+    }      
+    const PaddedCell & getPaddedCell(GridBase* unpadded_grid){
+      if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
+      return *pcell;
+    }
+    
+    ~WilsonLoopPaddedWorkspace(){
+      for(auto &s : stencil_wk) delete s;
+    }
+  };
+
+  //A workspace class allowing reuse of the stencil
+  class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
+  public:
+    //Six shifts per ordered pair (mu, nu != mu), mu outermost: three for each of the two staple terms
+    std::vector<Coordinate> getShifts() const override{
+      std::vector<Coordinate> result;
+      for(int mu=0;mu<Nd;mu++){
+        for(int nu=0;nu<Nd;nu++){
+          if(nu == mu) continue;
+          Coordinate origin(Nd,0);
+          Coordinate fwd_mu(Nd,0);        fwd_mu[mu]=1;
+          Coordinate fwd_nu(Nd,0);        fwd_nu[nu]=1;
+          Coordinate bwd_nu(Nd,0);        bwd_nu[nu]=-1;
+          Coordinate bwd_nu_fwd_mu(Nd,0); bwd_nu_fwd_mu[nu]=-1; bwd_nu_fwd_mu[mu]=1;
+
+          //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
+          result.push_back(origin);
+          result.push_back(fwd_nu);
+          result.push_back(fwd_mu);
+
+          //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
+          result.push_back(bwd_nu);
+          result.push_back(bwd_nu);
+          result.push_back(bwd_nu_fwd_mu);
+        }
+      }
+      return result;
+    }
+
+    int paddingDepth() const override{ return 1; }
+  };
+
+  //Padded cell implementation of the staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
+    StaplePaddedAllWorkspace wk; //local workspace, so the stencil is generated afresh on every call to this overload
+    StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell)); //forward to the overload taking a precomputed stencil
+  }
+  
+  //Padded cell implementation of the staple method for all mu, summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  //gStencil: the precomputed generalized local stencil for the staple
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil)
+  {
+    double t0 = usecond();
+    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
+    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); //links must live on the last grid held by Cell
+    assert(Cell.depth >= 1);
+    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
+    //Stencil entries per mu: assumes gStencil lists its points mu-major with equal counts per mu, as StaplePaddedAllWorkspace::getShifts does
+    int shift_mu_off = gStencil._npoints/Nd;
+    
+    //Open views to padded gauge links and keep open over mu loop
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); //host-side array of the Nd views
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize); //device copy of that array, indexed by direction inside the kernel
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+    
+    GaugeMat gStaple(ggrid); //staple on the padded grid, reused for every mu
+
+    int outer_off = 0; //index of the first stencil entry belonging to the current mu
+    for(int mu=0;mu<Nd;mu++){
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+	
+	accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	    stencil_ss = Zero();
+	    int off = outer_off;
+	    //each nu != mu consumes six consecutive stencil entries: three per staple term
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){	  
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+      		//first term: U_nu * adj(U_mu) * adj(U_nu), each link read at its own stencil point
+		stencil_ss = stencil_ss + U2 * U1 * U0;
+
+		e = gStencil_v.GetEntry(off++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(off++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		//second term: adj(U_nu) * adj(U_mu) * U_nu, each link read at its own stencil point
+		stencil_ss = stencil_ss + U2 * U1 * U0;
+	      }
+	    }
+		
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
+	  }
+	  );
+      } //ensure views are all closed!
+      
+      staple[mu] = Cell.Extract(gStaple); //presumably strips the padding again - confirm against PaddedCell::Extract
+      outer_off += shift_mu_off; //advance to the stencil entries of the next mu
+    }//mu loop
+
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose(); //close the link views opened before the mu loop
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t1=usecond();
+    
+    std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;   
+  }
+
+   
  //////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, upper part
  //////////////////////////////////////////////////
@@ -707,18 +908,14 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * Cshift(U, mu, 1);
+    U2 = U * Gimpl::CshiftLink(U, mu, 1);
  }

  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy does not work nicely with Gparity. (could
-  // do,
-  // but need to track two deep where cross boundary and apply a conjugation).
-  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
-  // so .
+  // Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
-                                  std::vector<GaugeMat> &U, int mu) {
+  static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
+                                  const std::vector<GaugeMat> &U, int mu) {

    Stap = Zero();

@@ -732,9 +929,9 @@ public:

        // Up staple    ___ ___
        //             |       |
-        tmp = Cshift(adj(U[nu]), nu, -1);
+        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = Cshift(tmp, mu, -2);
+        tmp = Gimpl::CshiftLink(tmp, mu, -2);

        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);

@@ -742,14 +939,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));

        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //

-        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);

        //              ___ ___
        //             |___    |
@@ -758,7 +955,7 @@ public:

        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
+        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
        ;

        //       --
@@ -766,10 +963,10 @@ public:
        //
        //      |  |

-        tmp = Cshift(adj(U2[nu]), nu, -2);
+        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * Cshift(tmp, nu, 2);
-        Stap += Cshift(tmp, mu, 1);
+        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);

        //      |  |
        //
@@ -778,25 +975,12 @@ public:

        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = Cshift(tmp, nu, -2);
-        Stap += Cshift(tmp, mu, 1);
+        tmp = Gimpl::CshiftLink(tmp, nu, -2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);
      }
    }
  }

-  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
-    RectStapleUnoptimised(Stap, Umu, mu);
-  }
-  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
-                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
-                         int mu) {
-    if (Gimpl::isPeriodicGaugeField()) {
-      RectStapleOptimised(Stap, U2, U, mu);
-    } else {
-      RectStapleUnoptimised(Stap, Umu, mu);
-    }
-  }
-
  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
    GridBase *grid = Umu.Grid();
@@ -895,6 +1079,288 @@ public:
    }
  }

+  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { // thin wrapper: forwards all three arguments to RectStapleUnoptimised
+    RectStapleUnoptimised(Stap, Umu, mu);
+  }
+  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, // Umu is accepted but not used by this overload
+                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
+                         int mu) {
+    RectStapleOptimised(Stap, U2, U, mu); // unconditional: this overload contains no boundary-condition branch
+  }
+  //////////////////////////////////////////////////////
+  // Compute the rectangular staple for every direction mu by calling RectStapleOptimised once per mu
+  // Stap: output array of staples, one per direction; must already hold Nd entries (asserted)
+  // U:    input gauge links, one field per direction; must hold Nd entries (asserted)
+  /////////////////////////////////////////////////////
+  static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
+    assert(Stap.size() == Nd); assert(U.size() == Nd);
+    std::vector<GaugeMat> U2(Nd,U[0].Grid()); // Nd temporaries on the same grid as U[0]
+    for(int mu=0;mu<Nd;mu++) RectStapleDouble(U2[mu], U[mu], mu); // fill U2[mu] from U[mu] (presumably the doubled link -- see RectStapleDouble)
+    for(int mu=0;mu<Nd;mu++) RectStapleOptimised(Stap[mu], U2, U, mu); // separate loop: each call is handed the complete U2 vector
+  }
+
+  // Workspace describing the rectangular-staple stencil (shift table and padding depth) so the stencil can be reused
+  class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
+  public:
+    std::vector<Coordinate> getShifts() const override{ // 30 shifts per ordered (mu, nu!=mu) pair: 6 terms x 5 links, mu outermost
+      std::vector<Coordinate> shifts;
+      for (int mu = 0; mu < Nd; mu++){
+	for (int nu = 0; nu < Nd; nu++) {
+	  if (nu != mu) {
+	    auto genShift = [&](int mushift,int nushift){ // displacement of mushift along mu and nushift along nu, zero elsewhere
+	      Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
+	    };
+
+	    //Term 1 (shifts pushed rightmost link first): tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+	    shifts.push_back(genShift(0,0));
+	    shifts.push_back(genShift(0,+1));
+	    shifts.push_back(genShift(+1,+1));
+	    shifts.push_back(genShift(+2,0));
+	    shifts.push_back(genShift(+1,0));
+
+	    //Term 2: tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(0,-1)); // same site as the previous entry: two different links are read at x-nu
+	    shifts.push_back(genShift(+1,-1));
+	    shifts.push_back(genShift(+2,-1));
+	    shifts.push_back(genShift(+1,0));
+
+	    //Term 3: tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+	    shifts.push_back(genShift(-1,0));
+	    shifts.push_back(genShift(-1,-1));
+	    shifts.push_back(genShift(-1,-1)); // same site as the previous entry: two different links are read at x-mu-nu
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(+1,-1));
+
+	    //Term 4: tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+	    shifts.push_back(genShift(-1,0));
+	    shifts.push_back(genShift(-1,0)); // same site as the previous entry: two different links are read at x-mu
+	    shifts.push_back(genShift(-1,+1));
+	    shifts.push_back(genShift(0,+1));
+	    shifts.push_back(genShift(+1,0));
+
+	    //Term 5: tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+	    shifts.push_back(genShift(0,0));
+	    shifts.push_back(genShift(0,+1));
+	    shifts.push_back(genShift(0,+2));
+	    shifts.push_back(genShift(+1,+1));
+	    shifts.push_back(genShift(+1,0));
+
+	    //Term 6: tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+	    shifts.push_back(genShift(0,-1));
+	    shifts.push_back(genShift(0,-2));
+	    shifts.push_back(genShift(0,-2)); // same site as the previous entry: two different links are read at x-2nu
+	    shifts.push_back(genShift(+1,-2));
+	    shifts.push_back(genShift(+1,-1));
+	  }
+	}
+      }
+      return shifts;
+    }
+
+    int paddingDepth() const override{ return 2; } // the largest shift component pushed above has magnitude 2
+  }; 
+
+  //Padded cell implementation of the rectangular staple for all mu, each summed over nu != mu
+  //staple: output staple for each mu, summed over nu != mu (Nd)
+  //U_padded: the gauge link fields padded out using the PaddedCell class
+  //Cell: the padded cell class
+  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
+    RectStaplePaddedAllWorkspace wk; // local workspace: nothing is kept between calls of this overload
+    RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell)); // delegate to the overload that takes an explicit stencil
+  }
+  
+  //Padded cell implementation of the rectangular staple for all mu, each summed over nu != mu
+  //staple: output, one field per mu (Nd entries, asserted); staple[mu] is assigned Cell.Extract of the padded-grid result
+  //U_padded: the gauge link fields padded out using the PaddedCell class (Nd entries, on the grid Cell.grids.back(); both asserted)
+  //Cell: the padded cell class; its depth must be at least 2 (asserted)
+  //gStencil: the stencil; its points are consumed in the order produced by RectStaplePaddedAllWorkspace::getShifts
+  static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
+    double t0 = usecond(); // start timestamp for the performance log line at the end
+    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
+    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
+    assert(Cell.depth >= 2);
+    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
+
+    size_t nshift = gStencil._npoints;
+    int mu_off_delta = nshift / Nd; // stencil points per mu (assumes the points are grouped by mu in Nd equal runs)
+    
+    //Open views to padded gauge links and keep open over mu loop
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); // host staging array holding one view per direction
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize); // byte copy of the view array; the kernel indexes Ug_dirs_v[dir][site]
+
+    GaugeMat gStaple(ggrid); //temp staple object on padded grid
+
+    int offset = 0; // index of the first stencil point belonging to the current mu
+    for(int mu=0; mu<Nd; mu++){
+
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+
+	accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	    stencil_ss = Zero();
+	    int s=offset; // running stencil point index; advanced once per link read below
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){
+		//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+	    
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0; // U0 is the rightmost factor of the commented product, U4 the leftmost
+
+		//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;
+
+		//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+		//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
+		e = gStencil_v.GetEntry(s++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(s++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(s++,ss);
+		U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+
+		stencil_ss = stencil_ss + U4*U3*U2*U1*U0;   
+
+	      }
+	    }
+	    coalescedWrite(gStaple_v[ss],stencil_ss); // store the sum over nu != mu for this site
+	  }
+	  );
+	offset += mu_off_delta; // advance to the first stencil point of the next mu
+      }//kernel/view scope
+
+      staple[mu] = Cell.Extract(gStaple);    
+    }//mu loop
+  
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose(); // close the views opened before the mu loop
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t1 = usecond();
+    
+    std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;   
+  }
+
+  //A workspace for reusing the PaddedCell and GeneralLocalStencil objects across calls to StapleAndRectStapleAll
+  class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
+  public:
+    StapleAndRectStapleAllWorkspace(){
+      this->addStencil(new StaplePaddedAllWorkspace);     // fetched as getStencil(0,...) by StapleAndRectStapleAll (presumably indexed in registration order)
+      this->addStencil(new RectStaplePaddedAllWorkspace); // fetched as getStencil(1,...) by StapleAndRectStapleAll
+    }
+  };     
+    
+  //////////////////////////////////////////////////////
+  //Compute the 1x1 and 1x2 staples for all orientations, using a workspace that lives only for this call
+  //Stap : Array of staples (Nd)
+  //RectStap: Array of rectangular staples (Nd)
+  //U: Gauge links in each direction (Nd)
+  /////////////////////////////////////////////////////
+  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
+    StapleAndRectStapleAllWorkspace wk; // local workspace: pass one explicitly to the other overload to reuse it
+    StapleAndRectStapleAll(Stap,RectStap,U,wk);
+  }
+  
+  //////////////////////////////////////////////////////
+  //Compute the 1x1 and 1x2 staples for all orientations from one padded copy of the gauge links
+  //Stap : Array of staples (Nd)
+  //RectStap: Array of rectangular staples (Nd)
+  //U: Gauge links in each direction (Nd)
+  //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
+  /////////////////////////////////////////////////////
+  static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
+#if 0
+    StapleAll(Stap, U); // disabled alternative: the two staple sets computed by the separate non-padded routines
+    RectStapleAll(RectStap, U);
+#else
+    double t0 = usecond();
+
+    GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid()); // NOTE(review): result is not null-checked before use
+    const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);
+        
+    CshiftImplGauge<Gimpl> cshift_impl;
+    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
+    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl); // the padded links are built once and shared by both staple calls
+    double t1 = usecond();
+    StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) ); // stencil 0: registered first in the workspace constructor
+    double t2 = usecond();
+    RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid)); // stencil 1: registered second
+    double t3 = usecond();
+    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
+#endif
+  }
+
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -43,7 +43,7 @@ class GeneralLocalStencilView {
  int                               _npoints; // Move to template param?
  GeneralStencilEntry*  _entries_p;

-  accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) { 
+  accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const { // entries are laid out site-major: _npoints consecutive entries per osite
    return & this->_entries_p[point+this->_npoints*osite]; 
  }

@@ -79,60 +79,60 @@ public:
    this->_entries.resize(npoints* osites);
    this->_entries_p = &_entries[0];

+    thread_for(site, osites, {
+	Coordinate Coor;
+	Coordinate NbrCoor;

-    Coordinate Coor;
-    Coordinate NbrCoor;
-    for(Integer site=0;site<osites;site++){
-      for(Integer ii=0;ii<npoints;ii++){
-	Integer lex = site*npoints+ii;
-	GeneralStencilEntry SE;
-	////////////////////////////////////////////////
-	// Outer index of neighbour Offset calculation
-	////////////////////////////////////////////////
-	grid->oCoorFromOindex(Coor,site);
-	for(int d=0;d<Coor.size();d++){
-	  int rd = grid->_rdimensions[d];
-	  NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
+	for(Integer ii=0;ii<npoints;ii++){
+	  Integer lex = site*npoints+ii;
+	  GeneralStencilEntry SE;
+	  ////////////////////////////////////////////////
+	  // Outer index of neighbour Offset calculation
+	  ////////////////////////////////////////////////
+	  grid->oCoorFromOindex(Coor,site);
+	  for(int d=0;d<Coor.size();d++){
+	    int rd = grid->_rdimensions[d];
+	    NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
+	  }
+	  SE._offset      = grid->oIndexReduced(NbrCoor);
+
+	  ////////////////////////////////////////////////
+	  // Inner index permute calculation
+	  // Simpler version using icoor calculation
+	  ////////////////////////////////////////////////
+	  SE._permute =0;
+	  for(int d=0;d<Coor.size();d++){
+
+	    int fd = grid->_fdimensions[d];
+	    int rd = grid->_rdimensions[d];
+	    int ly = grid->_simd_layout[d];
+
+	    assert((ly==1)||(ly==2));
+
+	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
+	    int x = Coor[d];                // x in [0... rd-1] as an oSite 
+
+	    int permute_dim  = grid->PermuteDim(d);
+	    int permute_slice=0;
+	    if(permute_dim){    
+	      int  num = shift%rd; // Slice within dest osite cell of slice zero
+	      int wrap = shift/rd; // Number of osite local volume cells crossed through
+	      // x+num < rd dictates whether we are in same permute state as slice 0
+	      if ( x< rd-num ) permute_slice=wrap;
+	      else             permute_slice=(wrap+1)%ly;
+	    }
+	    if ( permute_slice ) {
+	      int ptype       =grid->PermuteType(d);
+	      uint8_t mask    =0x1<<ptype;
+	      SE._permute    |= mask;
+	    }
+	  }	
+	  ////////////////////////////////////////////////
+	  // Store in look up table
+	  ////////////////////////////////////////////////
+	  this->_entries[lex] = SE;
 	}
-	SE._offset      = grid->oIndexReduced(NbrCoor);
-
-	////////////////////////////////////////////////
-	// Inner index permute calculation
-	// Simpler version using icoor calculation
-	////////////////////////////////////////////////
-	SE._permute =0;
-	for(int d=0;d<Coor.size();d++){
-
-	  int fd = grid->_fdimensions[d];
-	  int rd = grid->_rdimensions[d];
-	  int ly = grid->_simd_layout[d];
-
-	  assert((ly==1)||(ly==2));
-
-	  int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
-	  int x = Coor[d];                // x in [0... rd-1] as an oSite 
-
-	  int permute_dim  = grid->PermuteDim(d);
-	  int permute_slice=0;
-	  if(permute_dim){    
-	    int  num = shift%rd; // Slice within dest osite cell of slice zero
-	    int wrap = shift/rd; // Number of osite local volume cells crossed through
-                                  // x+num < rd dictates whether we are in same permute state as slice 0
-	    if ( x< rd-num ) permute_slice=wrap;
-	    else             permute_slice=(wrap+1)%ly;
-	  }
-	  if ( permute_slice ) {
-	    int ptype       =grid->PermuteType(d);
-	    uint8_t mask    =0x1<<ptype;
-	    SE._permute    |= mask;
-	  }
-	}	
-	////////////////////////////////////////////////
-	// Store in look up table
-	////////////////////////////////////////////////
-	this->_entries[lex] = SE;
-      }
-    }      
+      });
  }
  
 };
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -32,6 +32,7 @@

 #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
+#include <Grid/stencil/GeneralLocalStencil.h>

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -73,6 +73,16 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
    return vec;
  }
 }
+//'perm_mask' acts as a bitmask: for each d in [0,nd) with bit d set, permute(.,.,d) is applied in turn; 'lane' is unused in this variant
+template<class vobj> accelerator_inline
+vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
+{
+  auto obj = vec, tmp = vec;
+  for (int d=0;d<nd;d++)
+    if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;} // copy back into tmp so that successive permutes compose
+  return obj;
+}
+
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
@@ -83,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 {
  vstream(vec, extracted);
 }
-#else
+#else //==GRID_SIMT


 //#ifndef GRID_SYCL
@@ -166,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
+typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd())) // per-lane variant: picks the source lane, no data is permuted
+{
+  int plane = lane;
+  for (int d=0;d<nd;d++)
+    plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane; // bit d of the mask flips the lane index by Nsimd/2^(d+1)
+  return extractLane(plane,vec);
+}
+template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  insertLane(lane,vec,extracted);
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@@ -66,13 +66,61 @@ template<class vtype,int N> accelerator_inline iMatrix<vtype,N> Ta(const iMatrix
  return ret;
 }

+template<class vtype> accelerator_inline iScalar<vtype> SpTa(const iScalar<vtype>&r) // scalar tensor level: apply SpTa to the wrapped object
+{
+  iScalar<vtype> ret;
+  ret._internal = SpTa(r._internal);
+  return ret;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> SpTa(const iVector<vtype,N>&r) // vector tensor level: apply SpTa to each of the N components
+{
+  iVector<vtype,N> ret;
+  for(int i=0;i<N;i++){
+    ret._internal[i] = SpTa(r._internal[i]);
+  }
+  return ret;
+}
+template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
+accelerator_inline iMatrix<vtype,N> SpTa(const iMatrix<vtype,N> &arg)
+{
+  // Generalises Ta to Sp2n (N = 2n; the matrix is handled as four N/2 x N/2 blocks)
+  // Applies the following projections
+  // P_{antihermitian} P_{antihermitian-Sp-algebra} P_{traceless}
+  // where the ordering matters (the rightmost projection is applied first)
+  // P_{traceless} subtracts trace(arg)/N
+  // P_{antihermitian-Sp-algebra} provides the block structure of the algebra based on U = exp(T) i.e. anti-hermitian generators
+  // P_{antihermitian} does (in - adj(in)) / 2
+  iMatrix<vtype,N> ret(arg);
+  double factor = (1.0/(double)N);
+  vtype nrm;
+  nrm = 0.5;
+    
+  ret = arg - (trace(arg)*factor);
+    
+  for(int c1=0;c1<N/2;c1++)
+  {
+      for(int c2=0;c2<N/2;c2++)
+      {
+          ret._internal[c1][c2] = nrm*(conjugate(ret._internal[c1+N/2][c2+N/2]) + ret._internal[c1][c2]); // new[up-left] = old[up-left]+old*[down-right]
+          ret._internal[c1][c2+N/2] = nrm*(ret._internal[c1][c2+N/2] - conjugate(ret._internal[c1+N/2][c2])); // new[up-right] = old[up-right]-old*[down-left]
+      }
+      for(int c2=N/2;c2<N;c2++)
+      {
+          ret._internal[c1+N/2][c2-N/2] = -conjugate(ret._internal[c1][c2]);  //  reconstructs lower blocks
+          ret._internal[c1+N/2][c2] = conjugate(ret._internal[c1][c2-N/2]);   //  from upper blocks
+      }
+  }
+    
+  ret = (ret - adj(ret))*0.5; // final anti-hermitian projection
+
+  return ret;
+}

 /////////////////////////////////////////////// 
 // ProjectOnGroup function for scalar, vector, matrix 
 // Projects on orthogonal, unitary group
 /////////////////////////////////////////////// 

-
 template<class vtype> accelerator_inline iScalar<vtype> ProjectOnGroup(const iScalar<vtype>&r)
 {
  iScalar<vtype> ret;
@@ -90,10 +138,12 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
 template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
 accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
 {
+  typedef typename iMatrix<vtype,N>::scalar_type scalar;
  // need a check for the group type?
  iMatrix<vtype,N> ret(arg);
  vtype nrm;
  vtype inner;
+  scalar one(1.0);
  for(int c1=0;c1<N;c1++){

    // Normalises row c1
@@ -102,7 +152,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);

    nrm = sqrt(inner);
-    nrm = 1.0/nrm;
+    nrm = one/nrm;
    for(int c2=0;c2<N;c2++)
      ret._internal[c1][c2]*= nrm;
      
@@ -127,7 +177,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);

    nrm = sqrt(inner);
-    nrm = 1.0/nrm;
+    nrm = one/nrm;
    for(int c2=0;c2<N;c2++)
      ret._internal[c1][c2]*= nrm;
  }
@@ -135,6 +185,85 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
  return ret;
 }

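+// Sp(2n) counterparts of the group projection above
+
+// Ta cannot be defined here for Sp2n because it needs the generators from the Sp class;
+// it is defined in the gauge impl types (NOTE(review): SpTa earlier in this file is a generator-free variant -- confirm which is meant)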
+
+template<class vtype> accelerator_inline iScalar<vtype> ProjectOnSpGroup(const iScalar<vtype>&r) // scalar tensor level: project the wrapped object
+{
+  iScalar<vtype> ret;
+  ret._internal = ProjectOnSpGroup(r._internal);
+  return ret;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnSpGroup(const iVector<vtype,N>&r) // vector tensor level: project each of the N components
+{
+  iVector<vtype,N> ret;
+  for(int i=0;i<N;i++){
+    ret._internal[i] = ProjectOnSpGroup(r._internal[i]);
+  }
+  return ret;
+}
+
+
+// Project onto Sp(2n), with N = 2n: Gram-Schmidt on the upper N/2 rows, then each lower row is rebuilt from its upper partner
+template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
+accelerator_inline iMatrix<vtype,N> ProjectOnSpGroup(const iMatrix<vtype,N> &arg)
+{
+  // need a check for the group type?
+  iMatrix<vtype,N> ret(arg);
+  vtype nrm;
+  vtype inner;
+  
+  for(int c1=0;c1<N/2;c1++)
+  {
+      
+    for (int b=0; b<c1; b++)                  // remove the b-rows (and their partners b+N/2) from U_c1
+    {
+      decltype(ret._internal[b][b]*ret._internal[b][b]) pr;
+      decltype(ret._internal[b][b]*ret._internal[b][b]) prn;
+      zeroit(pr);
+      zeroit(prn);
+          
+      for(int c=0; c<N; c++)
+      {
+        pr += conjugate(ret._internal[c1][c])*ret._internal[b][c];        // <U_c1 | U_b >
+        prn += conjugate(ret._internal[c1][c])*ret._internal[b+N/2][c];   // <U_c1 | U_{b+N/2} >
+      }
+       
+
+      for(int c=0; c<N; c++)
+      {
+        ret._internal[c1][c] -= (conjugate(pr) * ret._internal[b][c] + conjugate(prn) * ret._internal[b+N/2][c] );    //  U_c1 -= (  <U_c1 | U_b > U_b + <U_c1 | U_{b+N/2} > U_{b+N/2}  )
+      }
+    }
+    
+    zeroit(inner);
+    for(int c2=0;c2<N;c2++)
+    {
+      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+    }
+      
+    nrm = sqrt(inner);
+    nrm = 1.0/nrm; // NOTE(review): ProjectOnGroup in this file divides a scalar_type 'one' instead of the literal -- confirm whether this should match
+    for(int c2=0;c2<N;c2++)
+    {
+      ret._internal[c1][c2]*= nrm;
+    }
+      
+    for(int c2=0;c2<N/2;c2++)
+    {
+      ret._internal[c1+N/2][c2+N/2] = conjugate(ret._internal[c1][c2]);          // down right in the new matrix = (up-left)* of the old matrix
+    }
+      
+    for(int c2=N/2;c2<N;c2++)
+    {
+      ret._internal[c1+N/2][c2-N/2] = -conjugate(ret._internal[c1][c2]);     // down left in the new matrix = -(up-right)* of the old
+    }
+  }
+  return ret;
+}
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@@ -53,7 +53,6 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
 }


-
 // Specialisation: Cayley-Hamilton exponential for SU(3)
 #if 0
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
--- a/Grid/tensors/Tensor_trace.h
+++ b/Grid/tensors/Tensor_trace.h
@@ -69,6 +69,35 @@ accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(t
  }
  return ret;
 }
+////////////////////////////
+// Fast path traceProduct: base case for operands that are not Grid tensors -- the plain product
+////////////////////////////
+template<class S1 , class S2, IfNotGridTensor<S1> = 0, IfNotGridTensor<S2> = 0>
+accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2)
+  -> decltype(arg1*arg2)
+{
+  return arg1*arg2;
+}
+
+template<class vtype,class rtype,int N > // matrix level: sums traceProduct(arg1[i][j], arg2[j][i]) over i,j without forming the product matrix
+accelerator_inline auto traceProduct(const iMatrix<vtype,N> &arg1,const iMatrix<rtype,N> &arg2) -> iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))>
+{
+  iScalar<decltype( trace(arg1._internal[0][0]*arg2._internal[0][0] )) > ret;
+  zeroit(ret._internal);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<N;j++){
+    ret._internal=ret._internal+traceProduct(arg1._internal[i][j],arg2._internal[j][i]); // note the transposed index on arg2
+  }}
+  return ret;
+}
+
+template<class vtype,class rtype > // scalar tensor level: recurse into the wrapped objects
+accelerator_inline auto traceProduct(const iScalar<vtype> &arg1,const iScalar<rtype> &arg2) -> iScalar<decltype(trace(arg1._internal*arg2._internal))>
+{
+  iScalar<decltype(trace(arg1._internal*arg2._internal))> ret;
+  ret._internal=traceProduct(arg1._internal,arg2._internal);
+  return ret;
+}

 NAMESPACE_END(Grid);

--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -34,9 +34,12 @@ NAMESPACE_BEGIN(Grid);

  // These are the Grid tensors
  template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; };
-  template<class T>        struct isGridTensor<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T>        struct isGridTensor<iScalar<T> >   : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iVector<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iMatrix<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
+
+  template <typename T>  using IfGridTensor    = Invoke<std::enable_if<isGridTensor<T>::value, int> >; // enable_if on isGridTensor<T>; used in template parameter lists as "IfGridTensor<T> = 0"
+  template <typename T>  using IfNotGridTensor = Invoke<std::enable_if<!isGridTensor<T>::value, int> >; // the complement: enabled when T is not a Grid tensor

  // Traits to identify scalars
  template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; };
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -147,7 +147,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
-    hipGetDeviceProperties(&gpu_props[i], i);
+    auto r=hipGetDeviceProperties(&gpu_props[i], i);
    hipDeviceProp_t prop; 
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -405,7 +405,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 #define accelerator_barrier(dummy)				\
  {								\
-    hipStreamSynchronize(computeStream);			\
+    auto r=hipStreamSynchronize(computeStream);			\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -438,19 +438,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

-inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);}; // the HIP return value is stored in r but not inspected
+inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; // same call as acceleratorFreeShared; return value not inspected
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} // argument order here is (from,to); hipMemcpy is passed (to,from)
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} // argument order here is (from,to); hipMemcpy is passed (to,from)
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);} // the HIP return value is stored in r but not inspected

 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream); // issued on copyStream; the return value is stored in r but not inspected
 }
-inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
+inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); }; // synchronises copyStream, the stream used by the asynchronous device-to-device copy

 #endif

@@ -575,4 +575,11 @@ accelerator_inline void acceleratorFence(void)
  return;
 }

+inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
+{
+  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
+  acceleratorCopySynchronise();
+}
+
+
 NAMESPACE_END(Grid);
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@@ -54,15 +54,16 @@ int main(int argc, char **argv)
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 12;
+  MD.MDsteps = 24;
  MD.trajL   = 1.0;

  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 0;
+  HMCparams.StartTrajectory  = 104;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  HMCparams.StartingType     =std::string("HotStart");
+  //  HMCparams.StartingType     =std::string("HotStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

@@ -87,6 +88,7 @@ int main(int argc, char **argv)
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
+
  //////////////////////////////////////////////

  const int Ls      = 16;
@@ -134,7 +136,6 @@ int main(int argc, char **argv)
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
-  ActionLevel<HMCWrapper::Field> Level3(4);

  ////////////////////////////////////
  // Strange action
@@ -191,7 +192,7 @@ int main(int argc, char **argv)
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
-  if( ApplySmearing ) Level2.push_back(&Jacobian);
+  if( ApplySmearing ) Level1.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;


@@ -200,7 +201,7 @@ int main(int argc, char **argv)
  /////////////////////////////////////////////////////////////
  //  GaugeAction.is_smeared = ApplySmearing;
  GaugeAction.is_smeared = true;
-  Level3.push_back(&GaugeAction);
+  Level2.push_back(&GaugeAction);

  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
@@ -210,10 +211,11 @@ int main(int argc, char **argv)


  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
-
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
-  TheHMC.TheAction.push_back(Level3);
+
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
+  TheHMC.initializeGaugeFieldAndRNGs(U);

  TheHMC.Run(SmearingPolicy); // for smearing

--- a/HMC/FTHMC2p1f_3GeV.cc
+++ b/HMC/FTHMC2p1f_3GeV.cc
@@ -0,0 +1,226 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
+#include <Grid/qcd/smearing/JacobianAction.h>
+
+using namespace Grid;
+
+int main(int argc, char **argv)
+{
+  std::cout << std::setprecision(12);
+  
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionD FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 24;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 0;
+  HMCparams.Trajectories     = 200;
+  HMCparams.NoMetropolisUntil=  20;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  //  HMCparams.StartingType     =std::string("HotStart");
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.saveSmeared   = true;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  //////////////////////////////////////////////
+
+  const int Ls      = 12;
+  Real beta         = 2.37;
+  Real light_mass   = 0.0047;
+  Real strange_mass = 0.0186;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; // Scale factor one, Shamir
+  RealD c   = 0.0;
+
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 1.0e-2;
+  OFRp.hi       = 64;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-10;
+  OFRp.degree   = 14;
+  OFRp.precision= 40;
+
+  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+  LatticeGaugeField Uhot(GridPtr);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+
+  bool ApplySmearing = true;
+  
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 CG,
+	 CG, CG,
+	 CG, CG, 
+	 OFRp, false);
+
+  EOFA.is_smeared = ApplySmearing;
+  Level1.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Quotients[h]->is_smeared = ApplySmearing;
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // lnDetJacobianAction
+  /////////////////////////////////////////////////////////////
+  double rho = 0.1;  // smearing parameter
+  int Nsmear = 1;    // number of smearing levels
+  int Nstep  = 8*Nsmear;    // number of smearing steps - must be multiple of 2Nd
+  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
+  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
+  if( ApplySmearing ) Level1.push_back(&Jacobian);
+  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  GaugeAction.is_smeared = ApplySmearing;
+  Level2.push_back(&GaugeAction);
+
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+
+
+  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
+  TheHMC.initializeGaugeFieldAndRNGs(U);
+
+  TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/HMC2p1f_3GeV.cc
+++ b/HMC/HMC2p1f_3GeV.cc
@@ -0,0 +1,226 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
+#include <Grid/qcd/smearing/JacobianAction.h>
+
+using namespace Grid;
+
+int main(int argc, char **argv)
+{
+  std::cout << std::setprecision(12);
+  
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef MobiusFermionD FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 24;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 0;
+  HMCparams.Trajectories     = 200;
+  HMCparams.NoMetropolisUntil=  20;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  //  HMCparams.StartingType     =std::string("HotStart");
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.saveSmeared   = true;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  //////////////////////////////////////////////
+
+  const int Ls      = 12;
+  Real beta         = 2.37;
+  Real light_mass   = 0.0047;
+  Real strange_mass = 0.0186;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD b   = 1.0; // Scale factor one, Shamir
+  RealD c   = 0.0;
+
+  OneFlavourRationalParams OFRp;
+  OFRp.lo       = 1.0e-2;
+  OFRp.hi       = 64;
+  OFRp.MaxIter  = 10000;
+  OFRp.tolerance= 1.0e-10;
+  OFRp.degree   = 14;
+  OFRp.precision= 40;
+
+  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+  LatticeGaugeField Uhot(GridPtr);
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+
+  bool ApplySmearing = false;
+  
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 CG,
+	 CG, CG,
+	 CG, CG, 
+	 OFRp, false);
+
+  EOFA.is_smeared = ApplySmearing;
+  Level1.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
+    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+  }
+
+  for(int h=0;h<n_hasenbusch+1;h++){
+    Quotients[h]->is_smeared = ApplySmearing;
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // lnDetJacobianAction
+  /////////////////////////////////////////////////////////////
+  double rho = 0.1;  // smearing parameter
+  int Nsmear = 1;    // number of smearing levels
+  int Nstep  = 8*Nsmear;    // number of smearing steps - must be multiple of 2Nd
+  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
+  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
+  if( ApplySmearing ) Level1.push_back(&Jacobian);
+  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  GaugeAction.is_smeared = ApplySmearing;
+  Level2.push_back(&GaugeAction);
+
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
+  std::cout << GridLogMessage << " ************************************************"<< std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+  std::cout << GridLogMessage <<  std::endl;
+
+
+  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
+  TheHMC.initializeGaugeFieldAndRNGs(U);
+
+  TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
@@ -0,0 +1,350 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+
+  Grid_init(&argc, &argv);
+
+  CartesianCommunicator::BarrierWorld();
+  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
+  
+  int threads = GridThread::GetThreads();
+
+   // Typedefs to simplify notation
+  typedef WilsonImplD FermionImplPolicy;
+  typedef MobiusFermionD FermionAction;
+  typedef MobiusEOFAFermionD FermionEOFAAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  MD.name    = std::string("Force Gradient");
+  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  // MD.name    = std::string("MinimumNorm2");
+  // TrajL = 2
+  // 4/2 => 0.6 dH
+  // 3/3 => 0.8 dH .. depth 3, slower
+  //MD.MDsteps =  4;
+  MD.MDsteps =  3;
+  MD.trajL   = 0.5;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 1077;
+  HMCparams.Trajectories     = 1;
+  HMCparams.NoMetropolisUntil=  0;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  //  HMCparams.StartingType     =std::string("ColdStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_DDHMC_lat";
+  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+  std::cout << "loaded NERSC checpointer"<<std::endl;
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 12;
+  RealD M5  = 1.8;
+  RealD b   = 1.5;
+  RealD c   = 0.5;
+  Real beta         = 2.13;
+  //  Real light_mass   = 5.4e-4;
+  Real light_mass     = 7.8e-4;
+  Real light_mass_dir = 0.01;
+  Real strange_mass = 0.0362;
+  Real pv_mass      = 1.0;
+  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
+  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
+
+  int SP_iters=9000;
+  
+  RationalActionParams OFRp; // Up/down
+  OFRp.lo       = 6.0e-5;
+  OFRp.hi       = 90.0;
+  OFRp.inv_pow  = 2;
+  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
+  OFRp.action_tolerance= 1.0e-8;
+  OFRp.action_degree   = 18;
+  OFRp.md_tolerance= 1.0e-7;
+  OFRp.md_degree   = 14;
+  //  OFRp.degree   = 20; converges
+  //  OFRp.degree   = 16;
+  OFRp.precision= 80;
+  OFRp.BoundsCheckFreq=0;
+  std::vector<RealD> ActionTolByPole({
+      //      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      3.0e-7,1.0e-7,1.0e-8,1.0e-8,
+      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-8,1.0e-8
+    });
+  std::vector<RealD> MDTolByPole({
+      //      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence even more
+      //      1.0e-6,3.0e-7,1.0e-7,1.0e-7,
+      1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence
+      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-8,1.0e-8
+    });
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
+
+  ////////////////////////////////////////////////////////////////
+  // Domain decomposed
+  ////////////////////////////////////////////////////////////////
+  Coordinate latt4  = GridPtr->GlobalDimensions();
+  Coordinate mpi    = GridPtr->ProcessorGrid();
+  Coordinate shm;
+
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+  
+  Coordinate CommDim(Nd);
+  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
+
+  Coordinate NonDirichlet(Nd+1,0);
+  Coordinate Dirichlet(Nd+1,0);
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  //Dirichlet[1] = 0;
+  //Dirichlet[2] = 0;
+  //Dirichlet[3] = 0;
+
+  // 
+  Coordinate Block4(Nd);
+  Block4[0] = Dirichlet[1];
+  Block4[1] = Dirichlet[2];
+  Block4[2] = Dirichlet[3];
+  Block4[3] = Dirichlet[4];
+
+  int Width=4;
+  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplD::Field>(Block4,Width));
+
+  //////////////////////////
+  // Fermion Grids
+  //////////////////////////
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD  U(GridPtr); U=Zero();
+
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
+  TheHMC.initializeGaugeFieldAndRNGs(U);
+  std::cout << "loaded NERSC gauge field"<<std::endl;
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  FermionAction::ImplParams ParamsDir(boundary);
+
+  Params.dirichlet=NonDirichlet;
+  ParamsDir.dirichlet=Dirichlet;
+  ParamsDir.partialDirichlet=0;
+  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
+
+  //  double StoppingCondition = 1e-14;
+  //  double MDStoppingCondition = 1e-9;
+  double StoppingCondition = 1e-8;
+  double MDStoppingCondition = 1e-8;
+  double MDStoppingConditionLoose = 1e-8;
+  double MDStoppingConditionStrange = 1e-8;
+  double MaxCGIterations = 300000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(3);
+  ActionLevel<HMCWrapper::Field> Level3(15);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
+  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
+
+  // Probably dominates the force - back to EOFA.
+  OneFlavourRationalParams SFRp;
+  SFRp.lo       = 0.1;
+  SFRp.hi       = 25.0;
+  SFRp.MaxIter  = 10000;
+  SFRp.tolerance= 1.0e-8;
+  SFRp.mdtolerance= 2.0e-6;
+  SFRp.degree   = 12;
+  SFRp.precision= 50;
+  
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
+  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
+  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
+
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG, 
+	 ActionCG, ActionCG,
+	 DerivativeCG, DerivativeCG,
+	 SFRp, true);
+  Level2.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+  std::vector<int> dirichlet_den;
+  std::vector<int> dirichlet_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
+  }
+
+  for(int h=0;h<n_hasenbusch;h++){
+    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
+  }
+  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+  
+  std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
+
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  std::vector<LinearOperatorD *> LinOpD;
+  
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage
+	      << " 2f quotient Action ";
+    std::cout << "det D("<<light_den[h]<<")";
+    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
+    std::cout << "/ det D("<<light_num[h]<<")";
+    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
+    std::cout << std::endl;
+
+    FermionAction::ImplParams ParamsNum(boundary);
+    FermionAction::ImplParams ParamsDen(boundary);
+    
+    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
+    else                      ParamsNum.dirichlet = NonDirichlet;
+
+    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
+    else                      ParamsDen.dirichlet = NonDirichlet;
+
+    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    else                      ParamsNum.partialDirichlet = 0;
+
+    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    else                      ParamsDen.partialDirichlet = 0;
+    
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
+
+    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
+
+    double conv  = MDStoppingCondition;
+    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
+    
+    if(h!=0) {
+      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
+    } else {
+      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+    }
+  }
+  for(int h=0;h<Bdys.size();h++){
+    Bdys[h]->SetTolerances(ActionTolByPole,MDTolByPole);
+  }
+  int nquo=Quotients.size();
+  Level1.push_back(Bdys[0]);
+  Level1.push_back(Bdys[1]);
+  Level2.push_back(Quotients[0]);
+  for(int h=1;h<nquo-1;h++){
+    Level2.push_back(Quotients[h]);
+  }
+  Level2.push_back(Quotients[nquo-1]);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+  /////////////////////////////////////////////////////////////
+
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
@@ -343,7 +343,7 @@ int main(int argc, char **argv) {
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.1;
-  SFRp.hi       = 25.0;
+  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-5;
  SFRp.mdtolerance= 2.0e-4;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -128,7 +128,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
-#if 1
+#if 0
      RealD delta=1.e-4;
      std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
@@ -180,7 +180,7 @@ int main(int argc, char **argv) {
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  14;
+  MD.MDsteps =  12;
  MD.trajL   = 0.5;

  HMCparameters HMCparams;
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);

@@ -218,15 +218,14 @@ int main(int argc, char **argv) {
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
-  Real beta         = 2.13;
+  RealD beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
+  //  Real light_mass     = 7.8e-3;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
-  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
-  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
+  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated

  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
@@ -277,20 +276,20 @@ int main(int argc, char **argv) {

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-9;
-  double MDStoppingCondition = 1e-8;
-  double MDStoppingConditionLoose = 1e-8;
-  double MDStoppingConditionStrange = 1e-8;
-  double MaxCGIterations = 300000;
+  double StoppingCondition = 1e-14;
+  double MDStoppingCondition = 1e-9;
+  double MDStoppingConditionLoose = 1e-9;
+  double MDStoppingConditionStrange = 1e-9;
+  double MaxCGIterations = 50000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);

  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
-  //  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(1);
-  ActionLevel<HMCWrapper::Field> Level3(15);
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
+  ActionLevel<HMCWrapper::Field> Level3(4);

  ////////////////////////////////////
  // Strange action
@@ -300,11 +299,11 @@ int main(int argc, char **argv) {

  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.1;
+  SFRp.lo       = 0.8;
  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-8;
-  SFRp.mdtolerance= 2.0e-6;
+  SFRp.tolerance= 1.0e-12;
+  SFRp.mdtolerance= 1.0e-9;
  SFRp.degree   = 10;
  SFRp.precision= 50;
  
@@ -355,8 +354,10 @@ int main(int argc, char **argv) {
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
-	 ActionCGL, ActionCGR,
-	 DerivativeCGL, DerivativeCGR,
+	 //	 ActionCGL, ActionCGR,
+	 //	 DerivativeCGL, DerivativeCGR,
+	 ActionCG, ActionCG,
+	 DerivativeCG, DerivativeCG,
 	 SFRp, true);
  Level2.push_back(&EOFA);

@@ -443,13 +444,14 @@ int main(int argc, char **argv) {
  }
  int nquo=Quotients.size();
  for(int h=0;h<nquo;h++){
-    Level2.push_back(Quotients[h]);
+    Level1.push_back(Quotients[h]);
  }

  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
@@ -0,0 +1,268 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+
+  std::cout << " Grid Initialise "<<std::endl;
+  
+  Grid_init(&argc, &argv);
+
+  CartesianCommunicator::BarrierWorld();
+  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
+  
+  int threads = GridThread::GetThreads();
+
+   // Typedefs to simplify notation
+  typedef WilsonImplD FermionImplPolicy;
+  typedef MobiusFermionD FermionAction;
+  typedef MobiusEOFAFermionD FermionEOFAAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  typedef WilsonImplF FermionImplPolicyF;
+  typedef MobiusFermionF FermionActionF;
+  typedef MobiusEOFAFermionF FermionEOFAActionF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  MD.name    = std::string("Force Gradient");
+  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  //  MD.name    = std::string("MinimumNorm2");
+  // TrajL = 2
+  // 4/2 => 0.6 dH
+  // 3/3 => 0.8 dH .. depth 3, slower
+  //MD.MDsteps =  4;
+  MD.MDsteps =  8;
+  MD.trajL   = 0.5;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 1077;
+  HMCparams.Trajectories     = 20;
+  HMCparams.NoMetropolisUntil=  0;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_HMC_lat";
+  CPparams.rng_prefix    = "ckpoint_HMC_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+  std::cout << "loaded NERSC checpointer"<<std::endl;
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 12;
+  RealD M5  = 1.8;
+  RealD b   = 1.5;
+  RealD c   = 0.5;
+  RealD beta         = 2.13;
+  //  Real light_mass   = 5.4e-4;
+  Real light_mass     = 7.8e-4;
+  //  Real light_mass     = 7.8e-3;
+  Real strange_mass = 0.0362;
+  Real pv_mass      = 1.0;
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
+  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
+
+  ////////////////////////////////////////////////////////////////
+  // Domain decomposed
+  ////////////////////////////////////////////////////////////////
+  Coordinate latt4  = GridPtr->GlobalDimensions();
+  Coordinate mpi    = GridPtr->ProcessorGrid();
+  Coordinate shm;
+
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+
+  //////////////////////////
+  // Fermion Grids
+  //////////////////////////
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD  U(GridPtr); U=Zero();
+
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
+  TheHMC.initializeGaugeFieldAndRNGs(U);
+  std::cout << "loaded NERSC gauge field"<<std::endl;
+
+  // These lines are unnecessary if BC are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+
+  //  double StoppingCondition = 1e-14;
+  //  double MDStoppingCondition = 1e-9;
+  double StoppingCondition = 1e-14;
+  double MDStoppingCondition = 1e-9;
+  double MDStoppingConditionLoose = 1e-9;
+  double MDStoppingConditionStrange = 1e-9;
+  double MaxCGIterations = 50000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
+  ActionLevel<HMCWrapper::Field> Level3(4);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
+  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
+
+  // Probably dominates the force - back to EOFA.
+  OneFlavourRationalParams SFRp;
+  SFRp.lo       = 0.8;
+  SFRp.hi       = 30.0;
+  SFRp.MaxIter  = 10000;
+  SFRp.tolerance= 1.0e-12;
+  SFRp.mdtolerance= 1.0e-9;
+  SFRp.degree   = 10;
+  SFRp.precision= 50;
+  
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
+  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
+  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
+  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
+
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG, 
+	 ActionCG, ActionCG,
+	 DerivativeCG, DerivativeCG,
+	 SFRp, true);
+  Level2.push_back(&EOFA);
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass); 
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]);
+  }
+
+  for(int h=0;h<n_hasenbusch;h++){
+    light_num.push_back(hasenbusch[h]);
+  }
+  light_num.push_back(pv_mass);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionAction *> Denominators;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+  
+  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
+
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  std::vector<LinearOperatorD *> LinOpD;
+  
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage
+	      << " 2f quotient Action ";
+    std::cout << "det D("<<light_den[h]<<")";
+    std::cout << "/ det D("<<light_num[h]<<")";
+    std::cout << std::endl;
+
+    FermionAction::ImplParams ParamsNum(boundary);
+    FermionAction::ImplParams ParamsDen(boundary);
+    
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
+
+    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
+
+    double conv  = MDStoppingCondition;
+    if (h<3) conv= MDStoppingConditionLoose; // Relax on the first three factors (h=0,1,2); NOTE(review): conv is not passed to the action built below
+    
+    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG,CG));
+  }
+  int nquo=Quotients.size();
+  for(int h=0;h<nquo;h++){
+    Level1.push_back(Quotients[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+  /////////////////////////////////////////////////////////////
+
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/MPI_benchmark/bench2.pbs
+++ b/MPI_benchmark/bench2.pbs
@@ -0,0 +1,22 @@
+#!/bin/bash
+#PBS -q EarlyAppAccess
+#PBS -l select=2
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+# Batch job that runs the halo_mpi benchmark through the gpu_tile_compact.sh wrapper (see CMD below)
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+# Run from the directory the job was submitted from
+cd $PBS_O_WORKDIR
+
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12         # Number of MPI ranks per node
+NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+# NOTE: NNODES/NRANKS/NDEPTH/NTHREADS/NTOTRANKS are not referenced below; the launch hard-codes 2 ranks, 1 per node, --mpi 2.1.1.1
+CMD="mpiexec -np 2 -ppn 1  -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
+$CMD
--- a/MPI_benchmark/compile-command
+++ b/MPI_benchmark/compile-command
@@ -0,0 +1 @@
+mpicxx  -fsycl halo_mpi.cc -o halo_mpi
--- a/MPI_benchmark/gpu_tile_compact.sh
+++ b/MPI_benchmark/gpu_tile_compact.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Launch wrapper: picks per-rank GPU/tile and NUMA settings from PALS_LOCAL_RANKID, then runs "$@" under numactl
+export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+# Look up this rank's entry in each of the 12-element tables above
+export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+  
+# ZE_AFFINITY_MASK is set to <gpu_id>.<tile_id> for this rank
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+# Alternative settings, currently disabled:
+#unset EnableWalkerPartition
+#export EnableImplicitScaling=0
+#export GRID_MPICH_NIC_BIND=$NIC
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+# -m: allocate memory on NUMA node PNUMA ; -N: run on the CPUs of NUMA node NUMA
+numactl -m $PNUMA -N $NUMA  "$@"
--- a/MPI_benchmark/halo_mpi.cc
+++ b/MPI_benchmark/halo_mpi.cc
@@ -0,0 +1,333 @@
+#include <cassert>
+#include <complex>
+#include <memory>
+#include <vector>
+#include <algorithm>
+#include <array>
+#include <string>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <ctime>
+#include <sys/time.h>
+
+#include <mpi.h>
+
+/**************************************************************
+ * GPU - GPU memory cartesian halo exchange benchmark
+ * Config: what is the target
+ **************************************************************
+ */
+#undef ACC_CUDA
+#undef  ACC_HIP
+#define  ACC_SYCL
+#undef  ACC_NONE
+
+/**************************************************************
+ * Some MPI globals
+ **************************************************************
+ */
+MPI_Comm WorldComm;
+MPI_Comm WorldShmComm;
+
+int WorldSize;
+int WorldRank;
+
+int WorldShmSize;
+int WorldShmRank;
+
+/**************************************************************
+ * Allocate buffers on the GPU, SYCL needs an init call and context
+ **************************************************************
+ */
+#ifdef ACC_CUDA
+#include <cuda.h>
+void acceleratorInit(void){}
+void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  assert(err==cudaSuccess);
+  return ptr;
+}
+void acceleratorFreeDevice(void *ptr){  cudaFree(ptr);}
+#endif
+#ifdef ACC_HIP
+#include <hip/hip_runtime.h>
+void acceleratorInit(void){}
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
+  }
+  return ptr;
+};
+inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
+#endif
+#ifdef ACC_SYCL
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+cl::sycl::queue *theAccelerator;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+#if 1
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theAccelerator = new sycl::queue (selectedDevice);
+#else
+  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v  };
+  theAccelerator = new sycl::queue (selectedDevice);
+#endif
+  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
+  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
+}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
+#endif
+#ifdef ACC_NONE
+void acceleratorInit(void){}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+#endif
+
+
+/**************************************************************
+ * Microsecond timer
+ **************************************************************
+ */
+inline double usecond(void) {
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
+}
+/**************************************************************
+ * Benchmark(L,cart_geom,use_device,ncall): for each dimension d with cart_geom[d]>1, time ncall
+ * pairs of face-sized MPI_Sendrecv exchanges (shift +1 then -1); rank 0 prints L, bytes and rate.
+ **************************************************************/
+void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
+{
+  int64_t words = 3*4*2; // doubles per site (12 complex numbers)
+  int64_t face;
+  int Nd=cart_geom.size();
+  
+  /**************************************************************
+   * L^(Nd-1) sites per face, 12 complex per site
+   * Allocate one send and one receive buffer per direction (2*Nd of each)
+   **************************************************************
+   */
+  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
+  // The full L^Nd volume is never needed here: only faces are exchanged
+
+  
+  std::vector<void *> send_bufs;
+  std::vector<void *> recv_bufs;
+  size_t bytes = face*words*sizeof(double);
+  assert(bytes <= (size_t)0x7fffffff); // MPI_Sendrecv takes an int count of MPI_CHAR
+
+  if ( use_device ) {
+    for(int d=0;d<2*Nd;d++){
+      send_bufs.push_back(acceleratorAllocDevice(bytes));
+      recv_bufs.push_back(acceleratorAllocDevice(bytes));
+    }
+  } else {
+    for(int d=0;d<2*Nd;d++){
+      send_bufs.push_back(malloc(bytes));
+      recv_bufs.push_back(malloc(bytes));
+    }
+  }
+  /*********************************************************
+   * Build cartesian communicator (periodic in every dimension, no rank reordering)
+   *********************************************************
+   */
+  int ierr;
+  int rank;
+  std::vector<int> coor(Nd);
+  MPI_Comm communicator;
+  std::vector<int> periodic(Nd,1);
+  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
+  MPI_Comm_rank(communicator,&rank);
+  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
+
+  static int reported; // report the rank placement on the first call only
+  if ( ! reported ) { 
+    printf("World Rank %d Shm Rank %d CartCoor",WorldRank,WorldShmRank);
+    for(int d=0;d<Nd;d++) printf(" %d",coor[d]);  printf("\n"); fflush(stdout); // loop over Nd: coor has Nd entries, not always 4
+    reported =1 ;
+  }
+  /*********************************************************
+   * Perform halo exchanges, one timed sweep per partitioned dimension
+   *********************************************************
+   */
+  for(int d=0;d<Nd;d++){
+    if ( cart_geom[d]>1 ) {
+      int from,to;
+
+      // Synchronise before starting the clock so t0 does not include waiting for late ranks
+      MPI_Barrier(communicator);
+      double t0=usecond();
+      for(int n=0;n<ncall;n++){
+
+        void *xmit = (void *)send_bufs[d];
+        void *recv = (void *)recv_bufs[d];
+
+        ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
+        assert(ierr==0);
+
+        ierr=MPI_Sendrecv(xmit,(int)bytes,MPI_CHAR,to,rank,
+                          recv,(int)bytes,MPI_CHAR,from, from,
+                          communicator,MPI_STATUS_IGNORE);
+        assert(ierr==0);
+
+        xmit = (void *)send_bufs[Nd+d];
+        recv = (void *)recv_bufs[Nd+d];
+
+        ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
+        assert(ierr==0);
+
+        ierr=MPI_Sendrecv(xmit,(int)bytes,MPI_CHAR,to,rank,
+                          recv,(int)bytes,MPI_CHAR,from, from,
+                          communicator,MPI_STATUS_IGNORE);
+        assert(ierr==0);
+      }
+      MPI_Barrier(communicator);
+
+      double t1=usecond();
+      
+      double dbytes    = bytes*WorldShmSize;
+      double xbytes    = dbytes*2.0*ncall;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      if ( ! WorldRank ) {
+        printf("\t%12ld\t %12ld %16.0lf\n",(long)L,(long)bytes,bidibytes/(t1-t0)); fflush(stdout); // bytes per microsecond
+      }
+    }
+  }
+  /*********************************************************
+   * Free memory and the cartesian communicator
+   *********************************************************
+   */
+  if ( use_device ) {
+    for(int d=0;d<2*Nd;d++){
+      acceleratorFreeDevice(send_bufs[d]);
+      acceleratorFreeDevice(recv_bufs[d]);
+    }
+  } else {
+    for(int d=0;d<2*Nd;d++){
+      free(send_bufs[d]);
+      free(recv_bufs[d]);
+    }
+  }
+  MPI_Comm_free(&communicator); // a new communicator is created on every call; release it
+}
+
+/**************************************
+ * Command line junk
+ **************************************/
+
+std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
+{
+  char ** itr = std::find(begin, end, option);
+  if (itr != end && ++itr != end) {
+    std::string payload(*itr);
+    return payload;
+  }
+  return std::string("");
+}
+bool CmdOptionExists(char** begin, char** end, const std::string& option)
+{
+  return std::find(begin, end, option) != end;
+}
+void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
+{
+  vec.resize(0);
+  std::stringstream ss(str);
+  int i;
+  while (ss >> i){
+    vec.push_back(i);
+    if(std::ispunct(ss.peek()))
+      ss.ignore();
+  }
+  return;
+}
+/**************************************
+ * Main program: parse --mpi, then sweep L=16..64 with host buffers and with device buffers
+ **************************************/
+int main(int argc, char **argv)
+{
+  std::string arg;
+
+  acceleratorInit();
+
+  MPI_Init(&argc,&argv);
+
+  WorldComm = MPI_COMM_WORLD;
+  
+  MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); // its size is reported as ranks-per-node below
+
+  MPI_Comm_rank(WorldComm     ,&WorldRank);
+  MPI_Comm_size(WorldComm     ,&WorldSize);
+
+  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
+  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
+
+  if ( WorldSize/WorldShmSize > 2) {
+    printf("This benchmark is meant to run on at most two nodes only\n");
+  }
+
+  auto mpi =std::vector<int>({1,1,1,1});
+
+  if( CmdOptionExists(argv,argv+argc,"--mpi") ){
+    arg = CmdOptionPayload(argv,argv+argc,"--mpi");
+    CmdOptionIntVector(arg,mpi);
+  } else {
+    if( !WorldRank ) printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
+    MPI_Finalize(); exit(EXIT_FAILURE); // usage error: shut MPI down cleanly and report failure
+  }
+
+  if( !WorldRank ) {
+    printf("***********************************\n");
+    printf("%d ranks\n",WorldSize); 
+    printf("%d ranks-per-node\n",WorldShmSize);
+    printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
+    printf("Cartesian layout: ");
+    for(int d=0;d<mpi.size();d++){
+      printf("%d ",mpi[d]);
+    }
+    printf("\n");fflush(stdout);
+    printf("***********************************\n");
+  }
+
+  
+  if( !WorldRank ) {
+    printf("=========================================================\n");
+    printf("= Benchmarking HOST memory MPI performance               \n");
+    printf("=========================================================\n");fflush(stdout);
+    printf("= L\t pkt bytes\t MB/s           \n");
+    printf("=========================================================\n");fflush(stdout);
+  }
+
+  for(int L=16;L<=64;L+=4){
+    Benchmark(L,mpi,false,100); // host (malloc) buffers, 100 exchanges per measurement
+  }  
+
+  if( !WorldRank ) {
+    printf("=========================================================\n");
+    printf("= Benchmarking DEVICE memory MPI performance             \n");
+    printf("=========================================================\n");fflush(stdout);
+  }
+  for(int L=16;L<=64;L+=4){
+    Benchmark(L,mpi,true,100); // device buffers from acceleratorAllocDevice
+  }  
+
+  if( !WorldRank ) {
+    printf("=========================================================\n");
+    printf("= DONE   \n");
+    printf("=========================================================\n");
+  }
+  MPI_Finalize();
+}
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  GaugeField Umu(UGrid);
  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
+  //  SU<Nc>::ColdConfiguration(Umu);
  UmuCopy=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

@@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet)
    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();
+      std::cout<<GridLogMessage << "RESULT" << std::endl;
+      //      std::cout << result<<std::endl;
+      std::cout << norm2(result)<<std::endl;
+      std::cout<<GridLogMessage << "REF" << std::endl;
+      std::cout << norm2(ref)<<std::endl;
+      std::cout<<GridLogMessage << "ERR" << std::endl;
+      std::cout << norm2(err)<<std::endl;
+      FGrid->Barrier();
      exit(-1);
    }
    assert (n2e< 1.0e-4 );
--- a/configure.ac
+++ b/configure.ac
@@ -41,7 +41,7 @@ AC_PROG_RANLIB

 ############### Get compiler informations
 AC_LANG([C++])
-AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
+AX_CXX_COMPILE_STDCXX(17,noext,mandatory)
 AX_COMPILER_VENDOR
 AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
      [vendor of C++ compiler that will compile the code])
@@ -191,10 +191,28 @@ case ${ac_Nc} in
        AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);;
    5)
        AC_DEFINE([Config_Nc],[5],[Gauge group Nc]);;
+    8)
+        AC_DEFINE([Config_Nc],[8],[Gauge group Nc]);;
    *)
      AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
 esac

+############### Symplectic group (sets Sp2n_config to 1|0 and the BUILD_SP automake conditional)
+AC_ARG_ENABLE([Sp],
+    [AS_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
+    [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])
+
+AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" = "yesX" ])
+
+case ${ac_ENABLE_SP} in
+   yes)
+        have_sp2n=true;  AC_DEFINE([Sp2n_config],[1],[gauge group Sp2n]);;
+   no)
+        have_sp2n=false; AC_DEFINE([Sp2n_config],[0],[gauge group SUn]);;
+    *)
+        AC_MSG_ERROR(["--enable-Sp is either yes or no"]);;
+esac
+
 ############### FP16 conversions
 AC_ARG_ENABLE([sfw-fp16],
    [AS_HELP_STRING([--enable-sfw-fp16=yes|no],[enable software fp16 comms])],
@@ -737,7 +755,7 @@ case ${ac_TIMERS} in
 esac

 ############### Chroma regression test
-AC_ARG_ENABLE([chroma],[AS_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
+AC_ARG_ENABLE([chroma],[AS_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++14 ])],ac_CHROMA=yes,ac_CHROMA=no)

 case ${ac_CHROMA} in
     yes|no)
@@ -819,6 +837,7 @@ FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; els
 LIME (ILDG support)         : `if test "x$have_lime" = xtrue; then echo yes; else echo no; fi`
 HDF5                        : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
 build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
+Sp2n                        : ${ac_ENABLE_SP}
 ----- BUILD FLAGS -------------------------------------
 CXXFLAGS:
 `echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
@@ -847,6 +866,7 @@ AC_CONFIG_FILES(tests/lanczos/Makefile)
 AC_CONFIG_FILES(tests/smearing/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(tests/testu01/Makefile)
+AC_CONFIG_FILES(tests/sp2n/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(examples/Makefile)
 AC_OUTPUT
--- a/documentation/Grid.pdf
+++ b/documentation/Grid.pdf
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -10,9 +10,8 @@ For first time setup of the Xcode and Grid build environment on Mac OS, you will

 1. Install Xcode and the Xcode command-line utilities
 2. Set Grid environment variables
-3. Install and build Open MPI ***optional***
-4. Install and build Grid pre-requisites
-5. Install, Configure and Build Grid
+3. Install and build Grid pre-requisites
+4. Install, Configure and Build Grid

 Apple's [Xcode website][Xcode] is the go-to reference for 1, and the definitive reference for 4 and 5 is the [Grid Documentation][GridDoc].

@@ -92,60 +91,33 @@ launchctl setenv GridPkg /opt/local</string>
 </plist>
 ```

-## 3. Install and build Open MPI -- ***optional***
-
-Download the latest version of [Open MPI][OMPI] version 3.1 (I used 3.1.5) and build it like so:
-
-[OMPI]: https://www.open-mpi.org/software/ompi/v3.1/
-
-    ../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre/bin
-    make -j 4 all install
-
-***Note the `/bin` at the end of the prefix - this is required. As a quirk of the OpenMPI installer, `--prefix` must point to the `bin` subdirectory, with other files installed in `$GridPre/include`, `$GridPre/lib`, `$GridPre/share`, etc.***
-
-Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may wish to download GNU fortran (e.g. MacPorts ``gfortran`` package) and add the following to your configure invocation:
-
-    F77=gfortran FC=gfortran
-
-## 4. Install and build Grid pre-requisites
+## 3. Install and build Grid pre-requisites

 To simplify the installation of **Grid pre-requisites**, you can use your favourite package manager, e.g.:

-### 1. [MacPorts][MacPorts]
+### 3.1. [MacPorts][MacPorts]

 [MacPorts]: https://www.macports.org "MacPorts package manager"

 Install [MacPorts][MacPorts] if you haven't done so already, and then install packages with:

-    sudo port install <portname>
+    sudo port install openmpi git-flow-avh gmp hdf5 mpfr fftw-3-single lapack wget autoconf automake bison cmake gawk libomp

-These are the `portname`s for mandatory Grid libraries:
+On a Mac without GPUs:

-* git-flow-avh
-* gmp
-* hdf5
-* mpfr
+    sudo port install OpenBLAS +native

-and these are the `portname`s for optional Grid libraries:
+To make GNU `sha256sum` available under its usual name (it is installed as `gsha256sum`):

-* fftw-3-single
-* lapack
-* doxygen
-* OpenBLAS
+    pushd /opt/local/bin; sudo ln -s gsha256sum sha256sum; popd 

-***Please update this list with any packages I've missed! ... and double-check whether OpenBLAS is really for Grid. NB: lapack doesn't seem to work. Should it be scalapack?***
+These `port`s are not strictly necessary, but they are helpful:

-### 2. [Homebrew][Homebrew]
+    sudo port install gnuplot gsl h5utils nasm rclone texinfo tree xorg-server

-[Homebrew]: https://brew.sh "Homebrew package manager"
+***Please update this list with any packages I've missed!***

-Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:
-
-    sudo brew install <packagename>
-
-The same packages are available as from MacPorts.
-
-### Install LIME ***optional***
+#### Install LIME

 There isn't currently a port for [C-LIME][C-LIME], so download the source and then build it:

@@ -154,9 +126,19 @@ There isn't currently a port for [C-LIME][C-LIME], so download the source and th
    ../configure CC=clang --prefix=$GridPre
    make -j 4 all install

-## 5. Install, Configure and Build Grid
+### 3.2. [Homebrew][Homebrew]

-### 5.1 Install Grid
+[Homebrew]: https://brew.sh "Homebrew package manager"
+
+Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:
+
+    brew install <packagename>
+
+I don't use Homebrew, so I'm not sure what the equivalent Homebrew package names are. **Please update this section if you know.**
+
+## 4. Install, Configure and Build Grid
+
+### 4.1 Install Grid

 [Grid]: https://github.com/paboyle/Grid

@@ -174,7 +156,7 @@ or

 depending on how many times you like to enter your password.

-### 5.2 Configure Grid
+### 4.2 Configure Grid

 The Xcode build system supports multiple configurations for each project, by default: `Debug` and `Release`, but more configurations can be defined. We will create separate Grid build directories for each configuration, using the Grid **Autoconf** build system to make each configuration. NB: it is **not** necessary to run `make install` on them once they are built (IDE features such as *jump to definition* will work better of you don't).

@@ -198,7 +180,7 @@ Debug configuration with MPI:

    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug

-### 5.3 Build Grid
+### 4.3 Build Grid

 Each configuration must be built before they can be used. You can either:

--- a/documentation/manual.rst
+++ b/documentation/manual.rst
@@ -2778,47 +2778,81 @@ and there are associated reconstruction routines for assembling four spinors fro

 These ca

-
-SU(N)
+Gauge Group
 --------
+A generic Nc qcd/utils/GaugeGroup.h is provided. This defines a template class that can be specialised to different gauge groups::

-A generic Nc qcd/utils/SUn.h is provided. This defines a template class::
+  template <int ncolour, class group_name>
+  class GaugeGroup {...}

-  template <int ncolour> class SU ;
+Supported groups are SU(N) and Sp(2N). The group can be specified through the GroupName namespace::

-The most important external methods are::
+  namespace GroupName {
+  class SU {};
+  class Sp {};
+  }
+
+A simpler interface is achieved by aliasing the GaugeGroup class with a specific group::
+
+  template <int ncolour>
+  using SU = GaugeGroup<ncolour, GroupName::SU>;
+
+  template <int ncolour>
+  using Sp = GaugeGroup<ncolour, GroupName::Sp>;
+  
+Specific aliases are then defined::
+
+  typedef SU<2> SU2;
+  typedef SU<3> SU3;
+  typedef SU<4> SU4;
+  typedef SU<5> SU5;
+  typedef Sp<2> Sp2;
+  typedef Sp<4> Sp4;
+  typedef Sp<6> Sp6;
+  typedef Sp<8> Sp8;
+
+Some methods are common to both gauge groups. Common external methods are::

-  static void printGenerators(void) ;
  template <class cplx>  static void generator(int lieIndex, iSUnMatrix<cplx> &ta) ;
+  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG, LatticeMatrix &out, Real scale = 1.0) ;
+  static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) ;
+  static void TepidConfiguration(GridParallelRNG &pRNG,GaugeField &out);
+  static void ColdConfiguration(GaugeField &out);
+  static void taProj( const LatticeMatrixType &in,  LatticeMatrixType &out);
+  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) ;
+  static void printGenerators(void) ;
+   
+Where the gauge groups need different implementations of these methods, this is achieved by overloading on the group tag. For example::
+
+  template <typename LatticeMatrixType> //  shared interface for the traceless-antihermitian projection
+  static void taProj(const LatticeMatrixType &in, LatticeMatrixType &out) {
+    taProj(in, out, group_name());
+  }
+  
+  template <typename LatticeMatrixType> //  overload for SU(N): simply performs Ta
+  static void taProj(const LatticeMatrixType &in, LatticeMatrixType &out, GroupName::SU) {
+    out = Ta(in);
+  }
+  
+  template <typename LatticeMatrixType> //  overload for Sp(2N): must use a modified Ta function
+  static void taProj(const LatticeMatrixType &in, LatticeMatrixType &out, GroupName::Sp) {
+    out = SpTa(in);
+  }
+
+Gauge Group: SU(N)
+------------------
+The specialisation of GaugeGroup to SU(N), formally part of qcd/utils/GaugeGroup.h, is found in the file qcd/utils/SUn.impl.
+It contains methods that are only implemented for SU(N), and specialisations of shared methods to the special unitary group.
+
+Public methods are::

  static void SubGroupHeatBath(GridSerialRNG &sRNG, GridParallelRNG &pRNG, RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
                               LatticeMatrix &link,
 			       const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
 			       int su2_subgroup, int nheatbath, LatticeInteger &wheremask);
-
-  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG,
-                                                  LatticeMatrix &out,
-                                                  Real scale = 1.0) ;
  static void GaugeTransform( GaugeField &Umu, GaugeMat &g)
  static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g);

-  static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) ;
-  static void TepidConfiguration(GridParallelRNG &pRNG,GaugeField &out);
-  static void ColdConfiguration(GaugeField &out);
-
-  static void taProj( const LatticeMatrixType &in,  LatticeMatrixType &out);
-  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) ;
-
-  static int su2subgroups(void) ; // returns how many subgroups
-
-
-Specific instantiations are defined::
-
-	 typedef SU<2> SU2;
-	 typedef SU<3> SU3;
-	 typedef SU<4> SU4;
-	 typedef SU<5> SU5;
-
 For example, Quenched QCD updating may be run as (tests/core/Test_quenched_update.cc)::

  for(int sweep=0;sweep<1000;sweep++){
@@ -2857,6 +2891,16 @@ For example, Quenched QCD updating may be run as (tests/core/Test_quenched_updat
    }
  }

+Gauge Group: Sp(2N)
+-------------------
+The specialisation of GaugeGroup to Sp(2N), formally part of qcd/utils/GaugeGroup.h, is found in the file qcd/utils/Sp(2N).impl.
+It contains methods that are only implemented for Sp(2N), and specialisations of shared methods to the symplectic group.
+
+External methods are::
+
+  static void Omega(LatticeColourMatrixD &in) // Symplectic matrix left invariant by Sp(2N)
+
+Generation of Sp(2N) gauge fields is only supported via HMC.

 Space time grids
 ----------------
--- a/m4/ax_cxx_compile_stdcxx.m4
+++ b/m4/ax_cxx_compile_stdcxx.m4
--- a/m4/ax_cxx_compile_stdcxx_14.m4
+++ b/m4/ax_cxx_compile_stdcxx_14.m4
@@ -0,0 +1,34 @@
+# =============================================================================
+#  https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_14.html
+# =============================================================================
+#
+# SYNOPSIS
+#
+#   AX_CXX_COMPILE_STDCXX_14([ext|noext], [mandatory|optional])
+#
+# DESCRIPTION
+#
+#   Check for baseline language coverage in the compiler for the C++14
+#   standard; if necessary, add switches to CXX and CXXCPP to enable
+#   support.
+#
+#   This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX
+#   macro with the version set to C++14.  The two optional arguments are
+#   forwarded literally as the second and third argument respectively.
+#   Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for
+#   more information.  If you want to use this macro, you also need to
+#   download the ax_cxx_compile_stdcxx.m4 file.
+#
+# LICENSE
+#
+#   Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 5
+
+AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX])
+AC_DEFUN([AX_CXX_COMPILE_STDCXX_14], [AX_CXX_COMPILE_STDCXX([14], [$1], [$2])])
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -15,6 +15,8 @@ STAG_FERMION_FILES=`  find . -name '*.cc' -path '*/instantiation/*' -path '*/ins
 GP_FERMION_FILES=`    find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Gparity*' `
 ADJ_FERMION_FILES=`   find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonAdj*' `
 TWOIND_FERMION_FILES=`find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonTwoIndex*'`
+SP_FERMION_FILES=`find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/SpWilsonImpl*'`
+SP_TWOIND_FERMION_FILES=`find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/SpWilsonTwo*'`

 HPPFILES=`find . -type f -name '*.hpp'`
 echo HFILES=$HFILES $HPPFILES > Make.inc
@@ -27,13 +29,14 @@ echo STAG_FERMION_FILES=$STAG_FERMION_FILES   >> Make.inc
 echo GP_FERMION_FILES=$GP_FERMION_FILES   >> Make.inc
 echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES   >> Make.inc
 echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES   >> Make.inc
+echo SP_FERMION_FILES=$SP_FERMION_FILES >> Make.inc
+echo SP_TWOIND_FERMION_FILES=$SP_TWOIND_FERMION_FILES >> Make.inc

 # tests Make.inc
 cd $home/tests
 dirs=`find . -type d -not -path '*/\.*'`
 for subdir in $dirs; do
    cd $home/tests/$subdir
-    pwd
    TESTS=`ls T*.cc`
    TESTLIST=`echo ${TESTS} | sed s/.cc//g `
    PREF=`[ $subdir = '.' ] && echo noinst || echo EXTRA`
--- a/systems/Aurora/benchmarks/bench.pbs
+++ b/systems/Aurora/benchmarks/bench.pbs
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+#PBS -q EarlyAppAccess
+#PBS -l select=1
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+HDIR=/home/paboyle/
+#module use /soft/testing/modulefiles/
+#module load intel-UMD23.05.25593.11/23.05.25593.11
+#module load tools/pti-gpu  
+#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
+#export PATH=$HDIR/tools/bin:$PATH
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+echo Jobid: $PBS_JOBID
+echo Running on host `hostname`
+echo Running on nodes `cat $PBS_NODEFILE`
+
+echo NODES
+cat $PBS_NODEFILE
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12         # Number of MPI ranks per node
+NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+echo "NUM_NODES=${NNODES}  TOTAL_RANKS=${NTOTRANKS}  RANKS_PER_NODE=${NRANKS}  THREADS_PER_RANK=${OMP_NUM_THREADS}"
+echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
+
+    
+#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+#	     ./gpu_tile_compact.sh \
+#	./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
+#	--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_memory_bandwidth --mpi 1.1.2.6 --grid 32.24.32.192 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+$CMD
--- a/systems/Aurora/benchmarks/bench2.pbs
+++ b/systems/Aurora/benchmarks/bench2.pbs
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=2
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+#export OMP_PROC_BIND=spread
+#export OMP_NUM_THREADS=3
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+#echo Jobid: $PBS_JOBID
+#echo Running on host `hostname`
+#echo Running on nodes `cat $PBS_NODEFILE`
+
+#echo NODES
+#cat $PBS_NODEFILE
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12         # Number of MPI ranks per node
+NDEPTH=3          # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+
+CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 1-to-1.comms.hmem0
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 1-to-1.comms.hmem1
+
+
+CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 2-to-2.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+$CMD | tee 2-to-2.comms.hmem0
+
+CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 3-to-3.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 3-to-3.comms.hmem0
+
+
+CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact4a.sh \
+	./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 4-to-4.comms.hmem1.nic-affinity
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+$CMD | tee 4-to-4.comms.hmem0
+
+#mpiexec -np 1 --ppn 1 -d 1 numactl -H | tee numa.log
+
+CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 6-to-6.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+$CMD | tee 6-to-6.comms.hmem0
+
+
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+	     ./gpu_tile_compact.sh \
+	./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 12-to-12.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+$CMD | tee 12-to-12.comms.hmem0
--- a/systems/Aurora/benchmarks/gpu_tile_compact.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
+export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+  
+#export GRID_MPICH_NIC_BIND=$NIC
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+
+unset EnableWalkerPartition
+export EnableImplicitScaling=0
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+
+numactl -m $NUMA -N $NUMAP  "$@"
--- a/systems/Aurora/benchmarks/gpu_tile_compact4.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact4.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Per-rank launch wrapper: looks up this rank's NUMA node, CPU node, NIC, GPU
+# and tile in the tables below, indexed by the node-local rank
+# ($PALS_LOCAL_RANKID), sets ZE_AFFINITY_MASK to that GPU.tile pair, then runs
+# the wrapped command ("$@") under numactl.
+
+# One table entry per local rank (8 entries: GPUs 0,1,3,4 x tiles 0,1).
+export  NUMA_MAP=(2 2 3 3  2 2  3 3  )
+export  PROC_MAP=(0 0 1 1  0 0  1 1  )
+export  NIC_MAP=(0 0  4 4  1 1  5 5  )
+export  GPU_MAP=(0 1  3 4  0 1  3 4  )
+export TILE_MAP=(0 0  0 0  1 1  1 1  )
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+  
+# $NIC is computed above, but the export that would apply it is commented out.
+#export GRID_MPICH_NIC_BIND=$NIC
+
+unset EnableWalkerPartition
+export EnableImplicitScaling=0
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
+
+# PROC_MAP must be indexed per rank: a bare $PROC_MAP expands to element 0
+# only, which would bind every rank to CPU node 0 regardless of its NUMA node.
+numactl -m $NUMA -N ${PROC_MAP[$PALS_LOCAL_RANKID]}  "$@"
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -0,0 +1,16 @@
+TOOLS=$HOME/tools
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi-auto \
+	--enable-accelerator-cshift \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=icpx \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+
--- a/systems/Aurora/proxies.sh
+++ b/systems/Aurora/proxies.sh
@@ -0,0 +1,9 @@
+export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
+export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
+export http_proxy=http://proxy.alcf.anl.gov:3128
+export https_proxy=http://proxy.alcf.anl.gov:3128
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+git config --global http.proxy http://proxy.alcf.anl.gov:3128
+module use /soft/modulefiles
+module load intel_compute_runtime/release/agama-devel-682.22
+ 
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -0,0 +1,12 @@
+#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0  # left disabled: this file does not force a device selection
+
+module use /soft/modulefiles  # Environment setup: add the /soft/modulefiles tree to the module search path
+module load intel_compute_runtime/release/agama-devel-682.22  # loads one specific, pinned compute-runtime module version
+
+export HTTP_PROXY=http://proxy.alcf.anl.gov:3128  # the same proxy URL is exported under both upper- and lower-case variable names (next three lines)
+export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
+export http_proxy=http://proxy.alcf.anl.gov:3128
+export https_proxy=http://proxy.alcf.anl.gov:3128
+#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1  # left disabled in this file
+git config --global http.proxy http://proxy.alcf.anl.gov:3128  # side effect: writes the proxy into the user's global git config each time this script runs
+ 
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	5bfa88be85	Aurora MPI standalone benchmake and options that work well	2024-02-06 16:28:40 +00:00
Peter Boyle	2a0d75bac2	Aurora files	2023-12-21 23:20:17 +00:00
Peter Boyle	f48298ad4e	Bug fix	2023-12-11 20:57:02 -05:00
root	645e47c1ba	Config for Ampere Altra ARM	2023-12-08 16:17:56 -05:00
Peter Boyle	d1d9827263	Integrator logging update	2023-12-08 12:14:00 -05:00
Peter Boyle	14643c0aab	SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512)	2023-12-04 15:45:57 -05:00
Peter Boyle	b77a9b8947	SDDC compiles starting	2023-11-30 14:31:51 -05:00
Peter Boyle	7d077fe493	Frontier compiel	2023-11-09 13:58:44 -05:00
Peter Boyle	51051df62c	3GeV run setup	2023-10-16 20:49:52 +03:00
Peter Boyle	33097681b9	FTHMC compiled and merged to develop	2023-10-14 00:42:55 +03:00
Peter Boyle	07e4900218	FTHMC commit	2023-10-13 18:21:57 +03:00
Peter Boyle	36ab567d67	FTHMC 3 Gev	2023-10-13 18:21:57 +03:00
Peter Boyle	e19171523b	FTHMC Status at lattice conference commit	2023-10-13 18:21:56 +03:00
Peter Boyle	9626a2c7c0	Asynch handling	2023-10-13 18:21:56 +03:00
Peter Boyle	e936f5b80b	IfGridTensor shorthand	2023-10-13 18:21:56 +03:00
Peter Boyle	ffc0639cb9	Running in HMC tests	2023-10-13 18:21:56 +03:00
Peter Boyle	c5b43b322c	traceProduct eliminates non-contributing intermediate terms	2023-10-13 18:21:56 +03:00
Peter Boyle	c9c4576237	Improved frontier cshift	2023-10-13 18:21:56 +03:00
Peter Boyle	6d0c2de399	Deprecate teh PVC directory and make a PVC-OEM generic PVC target with no queueing system dependency -- just interactive scripts	2023-10-03 17:04:20 +00:00
Peter Boyle	7786ea9921	Bug fix in script	2023-10-03 09:58:44 -07:00
Peter Boyle	d93eac7b1c	Performance regressed and is OK in icpx 2023.2	2023-10-03 15:53:14 +00:00
Peter Boyle	afc316f501	Rename headers	2023-10-02 16:25:11 -04:00
Peter Boyle	f14bfd5c1b	Relocate sub includes	2023-10-02 16:23:38 -04:00
Peter Boyle	c5f1420dea	Merge remote-tracking branch 'LupoA/develop' into LupoA-develop	2023-10-02 16:22:35 -04:00
Peter Boyle	018e6da872	Merge pull request #440 from giltirn/feature/paddedcellgauge Feature/paddedcellgauge	2023-10-02 10:00:42 -04:00
Peter Boyle	b77bccfac2	Merge pull request #444 from mmphys/feature/docX Update doc complete list of Macports needed to build Grid on a fresh Mac	2023-10-02 09:57:11 -04:00
Peter Boyle	80359e0d49	Bland SYCL compile	2023-09-26 13:20:27 -07:00
Peter Boyle	3d437c5cc4	Making SYCL happy	2023-09-26 13:19:42 -07:00
Michael Marshall	bd56c95a6f	Update documentation with complete list of Macports needed to build Grid on a fresh Mac	2023-07-14 13:50:06 +01:00
chillenzer	dbd8bb49dc	Merge pull request #32 from LupoA/sp2n/develop Sp2n/develop	2023-07-04 15:23:43 +00:00
Julian Lenz	3a29af0ce4	Fixed linker error	2023-07-04 16:08:44 +01:00
Julian Lenz	f7b79cdd45	Added test for ProjectSpn	2023-07-03 18:00:32 +01:00
Alessandro Lupo	075b9d22d0	adjoint rep implemented as 2indx symmetric	2023-07-02 13:58:31 +01:00
Alessandro Lupo	b92428f05f	better test	2023-07-02 13:34:03 +01:00
Alessandro Lupo	34b11864b6	prettiest tests	2023-07-02 13:25:57 +01:00
Christopher Kelly	1dfaa08afb	The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher	2023-06-28 15:11:24 -04:00
Christopher Kelly	f44dce390f	Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding Fixed const correctness on PaddedCell methods Fixed compile issues on Crusher Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance Optimized kernel for StaplePadded Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings	2023-06-27 14:58:10 -04:00
Christopher Kelly	bb71e9a96a	Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links	2023-06-27 11:23:30 -04:00
Christopher Kelly	6f6844ccf1	Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array Modified plaq+rectangle gauge actions to use the above Added a test code to confirm the above changes	2023-06-26 15:48:47 -04:00
Christopher Kelly	4c6613d72c	Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs Removed optimized staple only being used for periodic gauge BCs; it is now always used	2023-06-26 10:20:23 -04:00
Alessandro Lupo	559257bbe9	better documentation and filelist names	2023-06-23 16:16:48 +01:00
Alessandro Lupo	cff1f8d3b8	rm unused variables and formatting	2023-06-23 16:04:18 +01:00
Alessandro Lupo	f27d2083cd	adjustments in SUn and Sp2n impl	2023-06-23 15:34:08 +01:00
Christopher Kelly	36cc9c524f	Threaded the constructor of GeneralLocalStencil	2023-06-23 09:57:38 -04:00
Alessandro Lupo	2822487450	rm unncessary line	2023-06-23 14:55:23 +01:00
Alessandro Lupo	e07fafe46a	minor adjustments to twoindex	2023-06-23 12:18:04 +01:00
Alessandro Lupo	063d290bd8	missing function	2023-06-23 11:11:20 +01:00
Alessandro Lupo	4e6194d92a	Avoid code duplication in ProjectSUn	2023-06-23 11:03:50 +01:00
Alessandro Lupo	de30c4e22a	minor improvements	2023-06-23 10:49:41 +01:00
Christopher Kelly	4241c7d4a3	Imported coalescedReadGeneralPermute GPU implementation from Christoph Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed Fixed compile issue with pointer cast in padded staple code Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple	2023-06-21 16:01:01 -04:00
Christopher Kelly	7b11075102	The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs) Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs	2023-06-20 17:09:56 -04:00
Christopher Kelly	abc658dca5	Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation	2023-06-20 16:14:25 -04:00
Alessandro Lupo	2372275b2c	Merge pull request #36 from LupoA/sp2n/gpu-bugfix Sp2n/gpu bugfix [close #30]	2023-06-20 13:46:00 +01:00
chillenzer	ef736e8aa4	Merge pull request #35 from LupoA/sp2n/enableSp consistent enable sp config flag	2023-06-20 10:41:09 +00:00
Julian Lenz	5e539e2d54	Forgot some follow-ups on changed signature	2023-06-18 12:37:51 +01:00
Julian Lenz	96773f5254	Apparently forgot to remove one Lattice version	2023-06-18 12:21:39 +01:00
Alessandro Lupo	d80df09f3b	consistent enable sp config flag	2023-06-16 19:16:46 +01:00
Julian Lenz	621e612c30	Fix non-zero ret on device bug	2023-06-16 16:27:49 +01:00
Julian Lenz	8c3792721b	ClangFormat	2023-06-16 15:58:23 +01:00
Julian Lenz	c95bbd3948	Remove accelerated lattice version	2023-06-16 15:50:26 +01:00
Julian Lenz	e28ab7a732	Re-included instantiations for symmetric 2Index AS Sp	2023-06-16 14:20:37 +01:00
Alessandro Lupo	c797cbe737	deal with post-merge trauma	2023-06-16 14:20:37 +01:00
Alessandro Lupo	e09dfbf1c2	definetely the right merge upstream/develop	2023-06-16 14:19:46 +01:00
Julian Lenz	116d90b0ee	First attempt on #30	2023-06-15 15:09:37 +01:00
Julian Lenz	b0646ca187	Remove some unused variables	2023-06-15 15:09:09 +01:00
chillenzer	4895ff260e	Merge pull request #28 from LupoA/sp2n/config compile sp2n fermion impl only if declared at config time	2023-06-09 13:07:48 +00:00
Alessandro Lupo	470d93006a	compile sp2n fermion impl only if declared at config time	2023-06-07 12:53:33 +01:00
chillenzer	2f3d03f188	Merge pull request #27 from LupoA/sp2n/documentation documentation for gaugegroup and sp2n	2023-06-01 16:42:27 +00:00
Alessandro Lupo	8db7c23bee	improve documentation	2023-06-01 17:39:10 +01:00
chillenzer	69dc5172dc	Merge pull request #26 from LupoA/sp2n/irreps Sp2n/irreps	2023-06-01 16:28:15 +00:00
Julian Lenz	fd72eb6546	Merge branch 'sp2n/algorithm' into sp2n/irreps	2023-06-01 17:24:01 +01:00
Alessandro Lupo	b405767569	make private methods private	2023-05-26 17:02:16 +01:00
Alessandro Lupo	fe88a0c12f	cleaner twoindex class, cleaner tests	2023-05-26 16:55:30 +01:00
Alessandro Lupo	e61a9ed2b4	partial revert	2023-05-26 13:54:26 +01:00
Alessandro Lupo	de8daa3824	group is SUn by default	2023-05-26 13:44:41 +01:00
Alessandro Lupo	3a50fb29cb	directly call sp helper	2023-05-26 13:28:47 +01:00
Alessandro Lupo	6647d2656f	rm unnecessary specialisation	2023-05-26 12:27:22 +01:00
Alessandro Lupo	a6f4dbeb6d	remove redundant template parameter	2023-05-26 12:13:40 +01:00
Alessandro Lupo	92a282f2d8	Merge pull request #24 from LupoA/sp2n/fix_static_assert_symmetric Move static_assert inside of function	2023-05-26 11:13:50 +01:00
Alessandro Lupo	ca2fd9fc7b	documentation for gaugegroup and sp2n	2023-05-25 18:40:54 +01:00
Alessandro Lupo	be1a4f5860	implement TwoIndexSymm for sp2n	2023-05-22 17:21:03 +01:00
Alessandro Lupo	5897b93dd4	debug tests, fix dimension	2023-05-22 13:42:21 +01:00
Alessandro Lupo	af091e0881	DimensionHelper for 2index irreps	2023-05-21 16:56:06 +01:00
Alessandro Lupo	3c1e5e9517	Merge pull request #25 from LupoA/sp2n/unify_representations Sp2n/unify representations [close #3]	2023-05-21 14:55:27 +01:00
Alessandro Lupo	85b2cb7a8a	changing some hardcoded SUn lines	2023-05-21 14:50:28 +01:00
Julian Lenz	b8bdc2eefb	Unified two index representations	2023-05-18 18:36:29 +01:00
Julian Lenz	0078826ff1	Move static_assert inside of function	2023-05-18 18:14:53 +01:00
Julian Lenz	e855c41772	Unified spfundamental.h with fundamental.h	2023-05-18 18:11:20 +01:00
chillenzer	d169c275b6	Merge pull request #22 from LupoA/sp2n/unify_twoindex Unify TwoIndex	2023-05-18 14:55:02 +00:00
Julian Lenz	a5125e23f4	Typo	2023-05-18 15:41:35 +01:00
Julian Lenz	7b83c80757	Merge branch 'sp2n/unify_twoindex' of github.com:LupoA/Grid into sp2n/unify_twoindex	2023-05-18 15:36:14 +01:00
Julian Lenz	e41821e206	Disable two index symmetric	2023-05-18 15:29:55 +01:00
Alessandro Lupo	5a75ab15a2	typo in 2S dim	2023-05-17 20:47:57 +01:00
Alessandro Lupo	932c783fbf	2AS for every Nc!	2023-05-17 20:22:05 +01:00
Julian Lenz	55f9cce577	Revert "Added automated HMC test for Nc=4" This reverts commit `eee27b8b30`.	2023-05-17 09:17:48 +01:00
Alessandro Lupo	b3533ca847	correct tests (failing)	2023-05-16 17:43:52 +01:00
Alessandro Lupo	fd2a637010	test 2index	2023-05-16 14:10:39 +01:00
Julian Lenz	eee27b8b30	Added automated HMC test for Nc=4	2023-05-15 18:37:33 +01:00
Julian Lenz	8522352aa3	ClangFormat	2023-05-15 18:36:05 +01:00
Alessandro Lupo	3beb8f4091	fixing typo, getting pre-changes physics	2023-05-15 16:00:15 +01:00
Alessandro Lupo	12a706e9b1	de-hardcode the number of generators	2023-05-15 15:48:21 +01:00
Alessandro Lupo	170aa7df01	fix (dimension to be improved)	2023-05-15 15:20:18 +01:00
Julian Lenz	e8ad1fef53	Unify TwoIndex	2023-05-12 14:35:50 +01:00
Alessandro Lupo	aa9df63a05	rename group projections based on determinants	2023-05-10 14:50:52 +01:00
chillenzer	3953312a93	Merge pull request #20 from LupoA/sp2n/unify_gaugeimpltypes Sp2n/unify gaugeimpltypes	2023-05-03 15:17:10 +00:00
Julian Lenz	6e62f4f616	ClangFormat	2023-05-03 16:15:12 +01:00
Julian Lenz	6a7bdca53b	Take over additional algebra tests from Alessandro	2023-05-03 16:02:02 +01:00
Julian Lenz	c7fba9aace	Take over additional group tests from Alessandro	2023-05-03 16:01:48 +01:00
Julian Lenz	ac6c7cb8d6	Merge in Alessandro's changes [test fails]	2023-05-03 02:53:03 +01:00
Julian Lenz	c5924833a1	ClangFormat	2023-05-03 02:39:36 +01:00
Julian Lenz	ac0a74be0d	Taken care of algebra tests	2023-05-03 02:32:42 +01:00
Julian Lenz	42b0e1125d	Naming and argument types	2023-05-03 01:51:46 +01:00
Julian Lenz	339c4fda79	Extracted is_element_of Sp2n	2023-05-02 15:44:34 +01:00
Alessandro Lupo	9b85bf9402	better projection test	2023-05-02 15:42:20 +01:00
Alessandro Lupo	86b02c3cd8	cleaning up requested by Julian	2023-05-02 13:31:17 +01:00
Alessandro Lupo	7b3b7093fa	cleaning up requested by Ed	2023-05-02 12:50:57 +01:00
Alessandro Lupo	881b08a465	Correct implementation of SpTa	2023-04-27 18:17:06 +01:00
Julian Lenz	3ee5444c69	Remove commented out stuff	2023-04-21 08:08:18 +01:00
Julian Lenz	5e28fe56d2	Remove code duplication: Iterating through vectors	2023-04-21 08:08:06 +01:00
Alessandro Lupo	5aabe074fe	Rename Sympl* to Sp*	2023-04-18 11:50:20 +01:00
Alessandro Lupo	dace904c10	fix typo	2023-04-14 18:06:18 +01:00
Alessandro Lupo	be98d26610	small change I missed in previous commit	2023-04-13 17:48:43 +01:00
Alessandro Lupo	178376f24b	minor stylistic changes	2023-04-06 12:08:17 +01:00
chillenzer	6a0eb466ee	Merge pull request #19 from LupoA/refactoring_sp2n refactoring sp2n	2023-04-05 10:50:58 +00:00
Alessandro Lupo	4ea29b8f0f	Template group into GaugeImplTypes. Closing #2	2023-04-04 17:49:28 +01:00
Alessandro Lupo	778291230a	expand ProjecOnGaugeGroup, change ProjectOnSp2nAlgebra into SpTa, fixing some of its issues	2023-04-04 17:48:13 +01:00
Alessandro Lupo	026e736dfa	Projection on algebra can now be templated. Fix #12	2023-04-03 16:31:19 +01:00
Alessandro Lupo	4275b3f431	Fix typo and remove unnecessary lines	2023-04-03 12:01:52 +01:00
Alessandro Lupo	1b8176e2c0	fix code duplication	2023-03-17 14:58:00 +00:00
Alessandro Lupo	cbc053c3db	Revert "projection on Sp2n algebra, to be used instead of Ta" This reverts commit `ba7f9d7b70`.	2023-03-17 11:36:58 +00:00
Alessandro Lupo	cdf3f6ef6e	Merge branch 'refactoring_sp2n' of https://github.com/LupoA/Grid into refactoring_sp2n	2023-03-15 15:59:50 +00:00
Alessandro Lupo	ba7f9d7b70	projection on Sp2n algebra, to be used instead of Ta	2023-03-15 15:55:12 +00:00
Alessandro Lupo	371fd123fb	consequence of iSUnMatrix being no longer a member of the SU class	2023-03-14 10:47:07 +00:00
Alessandro Lupo	d6ff644aab	Towards the day all tests compile	2023-03-14 10:43:25 +00:00
Julian Lenz	29586f6b5e	Deactivate some tests for Nc!=3	2023-03-13 08:17:14 +00:00
Alessandro Lupo	fd057c838f	add ProjectOnGaugeGroup and ProjectGn to allow future templating in GaugeImplTypes	2023-03-10 12:10:46 +00:00
Alessandro Lupo	f51222086c	Move functions from GaugeGroup to group specific implementations	2023-03-09 16:22:20 +00:00
Alessandro Lupo	f73691ec47	Merge pull request #18 from nickforce989/sp2n/newbranch Sp2n/newbranch	2023-02-13 10:22:27 +01:00
Niccolo Forzano	7ebda3e9ec	Merge commit 'b10e1b7bc8bec809f874e9e48a3ccc7b2619c9d1' into sp2n/newbranch	2023-01-19 12:10:18 +00:00
Niccolo Forzano	b10e1b7bc8	Fixed files giving zero force computation on GPU, issue #8	2023-01-18 18:04:47 +00:00
Alessandro Lupo	d7dea44ce7	Merge pull request #17 from chillenzer/unify_gauge_groups Fix compilation error in nvcc (closes #15)	2022-12-19 16:24:03 +00:00
Julian Lenz	37b6b82869	Fix file extensions	2022-12-18 16:12:56 +00:00
Julian Lenz	92ad5b8f74	Compiler error fix: NVCC requires names for templ. par.	2022-12-18 15:50:19 +00:00
Alessandro Lupo	8c80f1c168	Merge pull request #14 from chillenzer/unify_gauge_groups Unify gauge groups (closes #5)	2022-12-01 17:35:46 +00:00
Julian Lenz	0af7d5a793	Rename Grid/qcd/utils/<Group>_impl.h -> Grid/qcd/utils/<Group>.h	2022-11-30 17:12:00 +00:00
Julian Lenz	505fa49983	Renamed SUn.h -> GaugeGroup.h	2022-11-30 17:09:48 +00:00
Julian Lenz	7bcf33def9	Removed Sp2n.h	2022-11-30 16:59:46 +00:00
Julian Lenz	a13820656a	Removed iSUnMatrix, etc.	2022-11-30 15:09:03 +00:00
Julian Lenz	fa71b46a41	Hide nsp	2022-11-30 14:44:23 +00:00
Julian Lenz	b8b3ae6ac1	Make helper functions private	2022-11-30 13:29:14 +00:00
Julian Lenz	55c008da21	Removed forward declaration	2022-11-30 13:12:21 +00:00
Julian Lenz	2507606bd0	With function overloading (still dirty).	2022-11-30 12:54:36 +00:00
Julian Lenz	7c2ad4f8c8	Attempt with SFINAE (failed)	2022-11-30 11:57:39 +00:00
Julian Lenz	54c8025aad	Remove unnecessary pwd in scripts/filelist	2022-11-28 17:50:38 +00:00
Julian Lenz	921e23e83c	Separated out everything SU specific	2022-11-28 17:47:50 +00:00
Julian Lenz	6e750ecb0e	Remove apparently forgotten file	2022-11-28 16:33:46 +00:00
Julian Lenz	b8f1f5d2a3	Introduce GaugeGroup	2022-11-25 17:45:32 +00:00
Julian Lenz	9273f2937c	Autoformat google style	2022-11-25 17:44:08 +00:00
Julian Lenz	1aa28b47ae	Add existing test to check	2022-11-25 17:40:40 +00:00
Julian Lenz	629cb2987a	Fix typo in Makefile.am	2022-11-25 17:40:21 +00:00
Julian Lenz	03235d6368	Fixed type in configure.ac	2022-11-25 16:57:40 +00:00
Alessandro Lupo	22064c7e4c	Fixing #11	2022-11-25 13:10:29 +00:00
Alessandro Lupo	2de03e5172	Revert "Revert "Fixing issue #11 : consistent use of ncolour and nsp"" This reverts commit `3af4929dda`.	2022-11-23 19:40:28 +00:00
Alessandro Lupo	3af4929dda	Revert "Fixing issue #11 : consistent use of ncolour and nsp" This reverts commit `1ba429345b`.	2022-11-23 19:34:59 +00:00
Alessandro Lupo	1ba429345b	Fixing issue #11 : consistent use of ncolour and nsp	2022-11-23 18:45:01 +00:00
Alessandro Lupo	88bdd4344b	2indx antisymm representation of sp2n	2021-11-04 18:27:35 +00:00
Alessandro Lupo	4044536eea	add projection on sp2n algebra	2021-10-26 10:20:44 +01:00
Alessandro Lupo	4d8ae6221c	fix projection	2021-10-22 10:44:54 +01:00
Alessandro Lupo	4e31e4e094	Better tests	2021-10-13 15:07:23 +01:00
Alessandro Lupo	0d6674e489	hot start for sp2n	2021-10-12 18:53:54 +01:00
Alessandro Lupo	b145fd4f5b	necessary to merge	2021-10-12 17:08:46 +01:00
Alessandro Lupo	8a5b794f25	necessary change to merge with upstrm	2021-10-12 16:04:03 +01:00
Alessandro Lupo	291e80f88a	sp2n as config option	2021-10-12 16:00:32 +01:00
Alessandro Lupo	1ace5850ae	first hmc	2021-10-12 16:00:32 +01:00
Alessandro Lupo	283f14b7c1	fix sp2n projection	2021-10-12 16:00:32 +01:00
Alessandro Lupo	1d6e708083	tests!	2021-10-12 16:00:32 +01:00
Alessandro Lupo	89457e25e3	sp fermion instantiation	2021-10-12 16:00:32 +01:00
Alessandro Lupo	7e3b298d3d	project on sp2n	2021-10-12 16:00:32 +01:00
Alessandro Lupo	7ff3e5eed4	gauge and fermion implementation for sp2n	2021-10-12 16:00:32 +01:00
Alessandro Lupo	19eb51cf41	sp2n generators	2021-10-12 15:53:33 +01:00
Alessandro Lupo	470d4dcc6d	sp2n as config option	2021-10-12 15:47:56 +01:00
Alessandro Lupo	ed03bfd555	first hmc	2021-10-12 12:16:47 +01:00
Alessandro Lupo	8c0fbcccae	fix sp2n projection	2021-10-12 12:12:16 +01:00
Alessandro Lupo	d4866157fe	tests!	2021-10-12 09:06:15 +01:00
Alessandro Lupo	b6496b6cb5	sp fermion instantiation	2021-10-11 16:32:10 +01:00
Alessandro Lupo	4f5fe57920	project on sp2n	2021-10-11 16:28:15 +01:00
Alessandro Lupo	11fb943b1e	gauge and fermion implementation for sp2n	2021-10-11 16:21:25 +01:00
Alessandro Lupo	046a23121e	sp2n generators	2021-10-05 15:51:22 +01:00
				`@@ -0,0 +1 @@`
				`../WilsonCloverFermionInstantiation.cc.master`
				`@@ -0,0 +1 @@`
				`#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplD`