Merge pull request #451 from paboyle/feature/eigen-3.4.0-update

updating Eigen to 3.4.0
Merge pull request #454 from edbennett/adjoint-broke
2025-11-14 10:39:31 +00:00 · 2024-02-28 18:03:37 -05:00 · 2024-02-28 14:05:27 -05:00 · 2024-02-28 14:04:43 -05:00 · 2024-02-28 13:59:04 -05:00 · 2024-02-28 19:56:23 +01:00
63 changed files with 3341 additions and 218 deletions
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-extern Vector<std::pair<int,int> > Cshift_table; 
+extern std::vector<std::pair<int,int> > Cshift_table; 
 extern commVector<std::pair<int,int> > Cshift_table_device; 
 // Returns the base pointer of the Cshift index table that the
 // gather / scatter / copy loops in this file iterate over.
 //  - With ACCELERATOR_CSHIFT defined: Cshift_table_device is resized to the
 //    size of Cshift_table, every entry is copied across with
 //    acceleratorCopyToDevice on each call, and the address of the first
 //    entry of Cshift_table_device is returned.
 //  - Otherwise: the address of the first entry of Cshift_table is returned.
 // Both branches take the address of element 0, so the table is expected to
 // be non-empty when this is called.
 inline std::pair<int,int> *MapCshiftTable(void)
 {
  // GPU version: refresh the device-side copy of the table, then hand it out
 #ifdef ACCELERATOR_CSHIFT    
  uint64_t sz=Cshift_table.size();
  // Only resize when the two tables disagree in length
  if (Cshift_table_device.size()!=sz )    {
    Cshift_table_device.resize(sz);
  }
  // Arguments are (source = Cshift_table, destination = Cshift_table_device, byte count)
  acceleratorCopyToDevice((void *)&Cshift_table[0],
 			  (void *)&Cshift_table_device[0],
 			  sizeof(Cshift_table[0])*sz);
  return &Cshift_table_device[0];
 #else 
  return &Cshift_table[0];
 #endif
  // CPU version uses the identity map: Cshift_table itself is returned
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
@@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  }
  {
    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
+#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
@@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  {
    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
@@ -297,30 +316,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 // Integer division rounded up: returns a / b when b divides a exactly and
 // a / b + 1 otherwise. For non-negative a and positive b this is ceil(a / b).
 template <typename T>
 T iDivUp(T a, T b)
 {
   const T quotient = a / b;
   if (a % b == 0) {
     return quotient;
   }
   return quotient + 1;
 }
 // Kernel that fills a table of (first, second) index pairs stored as
 // consecutive T values: entry k is written to vector[2*k] and vector[2*k+1].
 // One thread handles one entry; threads whose global index is >= e1*e2 exit.
 // Entry k maps to offset (k / e2)*stride + (k % e2), added to lo and ro.
 template <typename T>
 __global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
 {
    const int entry = blockIdx.x*blockDim.x + threadIdx.x;
    if (entry >= e1*e2) return;

    const int block  = entry / e2;   // which of the e1 blocks
    const int within = entry % e2;   // position inside the block
    const int offset = block*stride + within;

    vector[2*entry + 0] = lo + offset;
    vector[2*entry + 1] = ro + offset;
 }
 #endif
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -345,20 +340,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;
  if(cbmask == 0x3 ){
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
 #endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -372,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }
  {
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
@@ -409,19 +396,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;
  if ( cbmask == 0x3 ) {
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
 #endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
@@ -432,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }
  {
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
-
+  RealD t1,t0;
  t0=usecond();
  if ( !comm_dim ) {
    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  t1=usecond();
  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }
@@ -127,16 +130,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-
+  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  for(int x=0;x<rd;x++){       
    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    if (comm_proc==0) {
-
+      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
+      tcopy+=usecond();
    } else {
      int words = buffer_size;
@@ -144,26 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int bytes = words * sizeof(vobj);
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
      tgather+=usecond();
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-
+      
-      grid->Barrier();
+      tcomms-=usecond();
      //      grid->Barrier();
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      xbytes+=bytes;
      //      grid->Barrier();
      tcomms+=usecond();
-      grid->Barrier();
+      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  /*
  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -190,6 +210,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int permute_type=grid->PermuteType(dimension);
  ///////////////////////////////////////////////
@@ -227,7 +253,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      pointers[i] = &send_buf_extract[i][0];
    }
    int sx   = (x+sshift)%rd;
    tgather-=usecond();
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();
    for(int i=0;i<Nsimd;i++){
@@ -252,7 +280,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-	grid->Barrier();
+	tcomms-=usecond();
 	//	grid->Barrier();
 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
@@ -262,7 +291,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);
-	grid->Barrier();
+	xbytes+=bytes;
 	//	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -270,9 +301,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      }
    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
-
+  /*
  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 #else 
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -292,6 +331,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
@@ -315,7 +359,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    if (comm_proc==0) {
      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
      tcopy+=usecond();
    } else {
@@ -324,7 +370,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int bytes = words * sizeof(vobj);
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      tgather+=usecond();
      //      int rank           = grid->_processor;
      int recv_from_rank;
@@ -332,7 +380,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-      grid->Barrier();
+      tcomms-=usecond();
      //      grid->Barrier();
      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
@@ -340,13 +389,24 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      xbytes+=bytes;
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
-      grid->Barrier();
+      //      grid->Barrier();
      tcomms+=usecond();
      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  /*
  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -372,6 +432,11 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int permute_type=grid->PermuteType(dimension);
@@ -414,8 +479,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();
    for(int i=0;i<Nsimd;i++){
@@ -440,7 +507,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-	grid->Barrier();
+	tcomms-=usecond();
 	//	grid->Barrier();
 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
@@ -449,17 +517,28 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
 	xbytes+=bytes;
-	grid->Barrier();
+	//	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }
    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-  }
+    tscatter+=usecond();
  }
  /*
  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
  */
 }
 #endif
 NAMESPACE_END(Grid); 
--- a/Grid/cshift/Cshift_table.cc
+++ b/Grid/cshift/Cshift_table.cc
@@ -1,4 +1,5 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
-Vector<std::pair<int,int> > Cshift_table; 
+std::vector<std::pair<int,int> > Cshift_table; 
 commVector<std::pair<int,int> > Cshift_table_device; 
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -270,5 +270,42 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L
    return axpby_norm_fast(ret,a,b,x,y);
 }
 /// Trace product of two lattice fields.
 ///
 /// For each outer site ss the result holds traceProduct(rhs_1(ss), rhs_2(ss)).
 /// The returned lattice lives on rhs_1's grid and is tagged with rhs_1's
 /// checkerboard.
 template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
  -> Lattice<decltype(trace(obj()))>
 {
  typedef decltype(trace(obj())) robj;
  Lattice<robj> ret_i(rhs_1.Grid());
  // Tag the returned container itself (previously only the write view was
  // assigned), and do it before the view is opened so both agree.
  ret_i.Checkerboard() = rhs_1.Checkerboard();
  autoView( rhs1 , rhs_1, AcceleratorRead);
  autoView( rhs2 , rhs_2, AcceleratorRead);
  autoView( ret , ret_i, AcceleratorWrite);
  accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
  });
  return ret_i;
 }
 /// Trace product of a lattice field with a single (site-independent) object.
 ///
 /// For each outer site ss the result holds traceProduct(rhs_1(ss), rhs2).
 /// The returned lattice lives on rhs_1's grid and is tagged with rhs_1's
 /// checkerboard.
 template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
  -> Lattice<decltype(trace(obj1()))>
 {
  typedef decltype(trace(obj1())) robj;
  Lattice<robj> ret_i(rhs_1.Grid());
  // Tag the returned container itself (previously only the write view was
  // assigned), and do it before the view is opened so both agree.
  ret_i.Checkerboard() = rhs_1.Checkerboard();
  autoView( rhs1 , rhs_1, AcceleratorRead);
  autoView( ret , ret_i, AcceleratorWrite);
  accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
  });
  return ret_i;
 }
 /// Argument-swapped convenience overload: (object, lattice) is forwarded to
 /// the (lattice, object) form with the two operands exchanged.
 template<class obj1,class obj2>
 auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1) -> Lattice<decltype(trace(obj1()))>
 {
  return traceProduct(rhs_1,rhs_2);
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }
-#if ( (!defined(GRID_CUDA)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
 #include <Grid/lattice/Lattice_slicesum_core.h>
 NAMESPACE_BEGIN(Grid);
@@ -448,19 +449,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
-
+  int ostride=grid->_ostride[orthogdim];
-  // sum over reduced dimension planes, breaking out orthog dir
+  
-  // Parallel over orthog direction
+  //Reduce Data down to lvSum
-  autoView( Data_v, Data, CpuRead);
+  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
  thread_for( r,rd, {
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int ss= so+n*stride+b;
 	lvSum[r]=lvSum[r]+Data_v[ss];
      }
    }
  });
  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);
@@ -504,6 +496,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
  return result;
 }
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  hipGetDevice(&device);
+  auto r=hipGetDevice(&device);
 #endif
  Iterator warpSize            = gpu_props[device].warpSize;
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -152,6 +152,7 @@ public:
 #ifdef RNG_FAST_DISCARD
  static void Skip(RngEngine &eng,uint64_t site)
  {
 #if 0
    /////////////////////////////////////////////////////////////////////////////////////
    // Skip by 2^40 elements between successive lattice sites
    // This goes by 10^12.
@@ -162,9 +163,9 @@ public:
    // tens of seconds per trajectory so this is clean in all reasonable cases,
    // and margin of safety is orders of magnitude.
    // We could hack Sitmo to skip in the higher order words of state if necessary
-      //
+    //
-      // Replace with 2^30 ; avoid problem on large volumes
+    // Replace with 2^30 ; avoid problem on large volumes
-      //
+    //
    /////////////////////////////////////////////////////////////////////////////////////
    //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
    const int shift = 30;
@@ -179,6 +180,9 @@ public:
    assert((skip >> shift)==site); // check for overflow
    eng.discard(skip);
 #else
    eng.discardhi(site);
 #endif
    //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
  } 
 #endif
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -0,0 +1,213 @@
 #pragma once
 #include <type_traits>
 #if defined(GRID_CUDA)
 #include <cub/cub.cuh>
 #define gpucub cub
 #define gpuError_t cudaError_t
 #define gpuSuccess cudaSuccess
 #elif defined(GRID_HIP)
 #include <hipcub/hipcub.hpp>
 #define gpucub hipcub
 #define gpuError_t hipError_t
 #define gpuSuccess hipSuccess
 #endif
 NAMESPACE_BEGIN(Grid);
 #if defined(GRID_CUDA) || defined(GRID_HIP)
 // Per-slice sums on CUDA/HIP using cub / hipcub DeviceSegmentedReduce.
 //
 //  Data            : pointer to the vobj array to reduce (the caller in this
 //                    file passes &Data_v[0] of an AcceleratorRead view)
 //  lvSum           : receives rd results, lvSum[r] = sum over slice r
 //  rd              : number of slices (= number of reduction segments)
 //  e1, e2          : a slice consists of e1 blocks of e2 entries
 //  stride, ostride : entry (n,b) of slice r is read from Data[r*ostride + n*stride + b]
 //  Nsimd           : forwarded to accelerator_for2dNB
 //
 // The slices are first packed contiguously into a scratch buffer so that
 // slice r occupies [r*e1*e2, (r+1)*e1*e2); one segmented reduction then sums
 // every segment. On a cub error the message is logged and the process exits.
 template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  size_t subvol_size = e1*e2;
  commVector<vobj> reduction_buffer(rd*subvol_size);
  auto rb_p = &reduction_buffer[0];
  vobj zero_init;
  zeroit(zero_init);
  void *temp_storage_array = nullptr;
  size_t temp_storage_bytes = 0;
  vobj *d_out;
  int* d_offsets;
  //segment r spans [offsets[r], offsets[r+1]) in the packed buffer
  std::vector<int> offsets(rd+1,0);
  for (size_t i = 0; i < offsets.size(); i++) {
    offsets[i] = static_cast<int>(i*subvol_size);
  }
  //Allocate memory for output and offset arrays on device
  d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  //copy offsets to device
  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  //setup call: with a null temp storage pointer cub only reports the required size in temp_storage_bytes
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_core.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }
  //allocate memory for temp_storage_array  
  temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
  //prepare buffer for reduction
  //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
  //use 2d accelerator_for to avoid launch latencies found when serially looping over rd 
  accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{ 
    int n = s / e2;
    int b = s % e2;
    int so=r*ostride; // base offset for start of plane 
    int ss= so+n*stride+b;
    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
  });
  //issue segmented reductions in computeStream
  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_core.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }
  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  //sync after copy
  accelerator_barrier();
  //release the scratch allocations made above
  acceleratorFreeDevice(temp_storage_array);
  acceleratorFreeDevice(d_out);
  acceleratorFreeDevice(d_offsets);
 }
 template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  typedef typename vobj::vector_type vector;
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
  commVector<vector>buffer(osites);
  vector *dat = (vector *)Data;
  vector *buf = &buffer[0];
  Vector<vector> lvSum_small(rd);
  vector *lvSum_ptr = (vector *)&lvSum[0];
  for (int w = 0; w < words; w++) {
    accelerator_for(ss,osites,1,{
 	    buf[ss] = dat[ss*words+w];
    });
    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
    }
  }
 }
 // CUDA/HIP entry point for the slice reduction. Opens an AcceleratorRead view
 // of Data and dispatches at compile time on sizeof(vobj):
 // hipcub/cub cannot deal with large vobjs, so objects above 256 bytes take
 // the word-by-word "large" path and everything else the direct "small" path.
 template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
  autoView(Data_v, Data, AcceleratorRead);
  auto data_p = &Data_v[0];
  if constexpr (sizeof(vobj) > 256) {
    sliceSumReduction_cub_large(data_p, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  } else {
    sliceSumReduction_cub_small(data_p, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  }
 }
 #endif
 #if defined(GRID_SYCL)
 // SYCL slice reduction.
 //
 //  Data            : lattice to reduce (read through an AcceleratorRead view)
 //  lvSum           : receives rd results, lvSum[r] = sum over slice r
 //  rd              : number of slices
 //  e1, e2          : a slice consists of e1 blocks of e2 entries
 //  stride, ostride : entry (n,b) of slice r is read from Data_v[r*ostride + n*stride + b]
 //  Nsimd           : forwarded to accelerator_for2d
 //
 // The slices are first packed contiguously into reduction_buffer, then one
 // sycl::reduction is submitted (and waited on) per slice, accumulating into
 // a single malloc_shared vobj that is copied out to lvSum[r].
 template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
  size_t subvol_size = e1*e2;
  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
  vobj vobj_zero;
  zeroit(vobj_zero);
  commVector<vobj> reduction_buffer(rd*subvol_size);    
  auto rb_p = &reduction_buffer[0];
  autoView(Data_v, Data, AcceleratorRead);
  //prepare reduction buffer 
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
      int n = s / e2;
      int b = s % e2;
      int so=r*ostride; // base offset for start of plane 
      int ss= so+n*stride+b;
      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
  });
  for (int r = 0; r < rd; r++) {
      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
          Reduction,
          [=](cl::sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
      });
      //wait so mysum[0] is complete before it is read and then reset for the next slice
      theGridAccelerator->wait();
      lvSum[r] = mysum[0];
  }
  free(mysum,*theGridAccelerator);
 }
 #endif
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
  autoView( Data_v, Data, CpuRead);
  thread_for( r,rd, {
    int so=r*ostride; // base offset for start of plane 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        lvSum[r]=lvSum[r]+Data_v[ss];
      }
    }
  });
 }
 // Backend dispatcher for the slice reduction, chosen at preprocessing time:
 // no GPU macro defined -> CPU loop; GRID_CUDA or GRID_HIP -> cub/hipcub;
 // otherwise (GRID_SYCL only) -> SYCL. All arguments are forwarded unchanged.
 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 {
 #if !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_SYCL)
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #elif defined(GRID_CUDA) || defined(GRID_HIP)
  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #else
  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  vobj zz = Zero();
  accelerator_for(sc,coarse->oSites(),1,{
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
-      vobj cd = zz;
+      vobj cd = Zero();
      for(int sb=0;sb<blockVol;sb++){
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -45,6 +45,7 @@ public:
  };
  // Host only
  GridBase * getGrid(void) const { return _grid; };
  vobj* getHostPointer(void) const { return _odata; };
 };
 /////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@@ -34,7 +34,7 @@ class GridTracer {
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
-inline int  traceStart(const char *name) { roctxRangeStart(name); }
+inline int  traceStart(const char *name) { return roctxRangeStart(name); }
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -129,6 +129,22 @@ public:
  virtual ~Action(){}
 };
 // Placeholder Action that contributes nothing: S() is zero, while refresh()
 // and deriv() are not meant to be called and trip assert(0).
 // The integrator pushes one instance per level and uses it to hold that
 // level's force log. No access specifier is given, so the overrides are
 // private and are reached through the Action<GaugeField> interface.
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
  // refresh pseudofermions -- unsupported for the placeholder
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
  {
    assert(0);
  }

  // evaluate the action -- always zero
  virtual RealD S(const GaugeField& U)
  {
    return 0.0;
  }

  // evaluate the action derivative -- unsupported for the placeholder
  virtual void deriv(const GaugeField& U, GaugeField& dSdU)
  {
    assert(0);
  }

  ///////////////////////////////
  // Logging
  ///////////////////////////////
  virtual std::string action_name()
  {
    return std::string("Level Force Log");
  }

  virtual std::string LogParameters()
  {
    return std::string("No parameters");
  }
 };
 NAMESPACE_END(Grid);
 #endif // ACTION_BASE_H
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -63,7 +63,9 @@ public:
  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
-
+  virtual void M(const FermionField &in, FermionField &out) ;
  virtual void Mdag(const FermionField &in, FermionField &out) ;
 private:
  RealD mu; // TwistedMass parameter
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@@ -93,5 +93,25 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
  RealD b    = tm /sq;
  axpibg5x(out,in,a,b);
 }
 // Full operator: out = Dhop(in) + D(in), where D(in) is the site-diagonal
 // term built by axpibg5x with a = 4 + mass and b = +mu
 // (presumably (a + i b gamma5) in -- confirm against axpibg5x).
 template<class Impl>
 void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
  const RealD diag_coeff  = 4.0+this->mass;
  const RealD twist_coeff = this->mu;

  // Hopping term goes straight into out
  out.Checkerboard() = in.Checkerboard();
  this->Dhop(in, out, DaggerNo);

  // Diagonal term is built in a scratch field and then added on
  FermionField diag(out.Grid());
  axpibg5x(diag,in,diag_coeff,twist_coeff);
  axpy(out, 1.0, diag, out);
 }
 // Daggered operator: out = Dhop^dag(in) + D'(in), where D'(in) is the
 // site-diagonal term built by axpibg5x with a = 4 + mass and b = -mu,
 // i.e. the twisted-mass coefficient enters with the opposite sign to M().
 template<class Impl>
 void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
  const RealD diag_coeff  = 4.0+this->mass;
  const RealD twist_coeff = -this->mu;

  // Daggered hopping term goes straight into out
  out.Checkerboard() = in.Checkerboard();
  this->Dhop(in, out, DaggerYes);

  // Diagonal term is built in a scratch field and then added on
  FermionField diag(out.Grid());
  axpibg5x(diag,in,diag_coeff,twist_coeff);
  axpy(out, 1.0, diag, out);
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -87,6 +87,8 @@ public:
  const ActionSet<Field, RepresentationPolicy> as;
  ActionSet<Field,RepresentationPolicy> LevelForces;
  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
    static MomentumFilterNone<MomentaField> filter;
@@ -124,6 +126,9 @@ public:
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing
    assert(as.size()==LevelForces.size());
    Field level_force(U.Grid()); level_force =Zero();
    for (int a = 0; a < as[level].actions.size(); ++a) {
      double start_full = usecond();
@@ -144,7 +149,10 @@ public:
      MomFilter->applyFilter(force);
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
-      
+
      // track the total
      level_force = level_force+force;
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
@@ -167,6 +175,16 @@ public:
    }
    {
      // total force
      Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real force_max   = std::sqrt(maxLocalNorm2(level_force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
      LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
    }
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
@@ -216,6 +234,16 @@ public:
    //Default the momentum filter to "do-nothing"
    MomFilter = getDefaultMomFilter();
    // Build one force-logging level per action level. Each holds a single
    // EmptyAction and inherits the multiplier of the matching level in `as`.
    for (int level = 0; level < as.size(); ++level) {
      int multiplier = as.at(level).multiplier;
      ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
      Level->push_back(new EmptyAction<Field>); 
      LevelForces.push_back(*Level);
      // push_back copies the ActionLevel by value, BUT the action level contains
      // a reference member that is NOT updated by the copy.
      // NOTE(review): presumably the stored copy therefore keeps referring into
      // *Level, which would be why Level is heap-allocated and never deleted
      // here -- confirm against ActionLevel before "fixing" the apparent leak.
      // Unsafe code in Guido's area
    }
  };
  virtual ~Integrator() {}
@@ -233,10 +261,14 @@ public:
  // Reset the timers of every action on every level, and of the single
  // total-force logger attached to each level.
  void reset_timer(void)
  {
    // There must be exactly one force-logging level per action level.
    assert(as.size()==LevelForces.size());
    for (int lvl = 0; lvl < as.size(); ++lvl) {
      // Per-action timers of this level.
      auto &level_actions = as[lvl].actions;
      for (int idx = 0; idx < level_actions.size(); ++idx) {
        level_actions.at(idx)->reset_timer();
      }
      // The level's total-force logger: exactly one entry is expected.
      auto &force_loggers = LevelForces.at(lvl).actions;
      assert(force_loggers.size()==1);
      force_loggers.at(0)->reset_timer();
    }
  }
  void print_timer(void)
@@ -298,6 +330,16 @@ public:
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
      int actionID=0;
      std::cout << GridLogMessage 
 		  << LevelForces[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t "
 		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average()
 		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average()
 		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
 		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num
 		  << std::endl;
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
@@ -319,6 +361,13 @@ public:
 	std::cout << as[level].actions.at(actionID)->LogParameters();
      }
    }
    std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
    for (int level = 0; level < LevelForces.size(); ++level) {
      std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
      for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
@@ -400,6 +449,7 @@ public:
  RealD S(Field& U) 
  {  // here also U not used
    assert(as.size()==LevelForces.size());
    std::cout << GridLogIntegrator << "Integrator action\n";
    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -1,3 +1,4 @@
 /*!
  @file GaugeConfigurationMasked.h
  @brief Declares the GaugeConfiguration class
@@ -6,6 +7,15 @@
 NAMESPACE_BEGIN(Grid);
 // Debug helper: read the scalar object stored at `site` of `lat` and print it
 // to std::cout, tagged with the label `s`. `site` defaults to the origin.
 template<class T> void Dump(const Lattice<T> & lat,
 			    std::string s,
 			    Coordinate site = Coordinate({0,0,0,0}))
 {
  typedef typename T::scalar_object site_object_t;
  site_object_t site_value;
  peekSite(site_value, lat, site);
  std::cout << " Dump " << s << " " << site_value << std::endl;
 }
 /*!
  @brief Smeared configuration masked container
  Modified for a multi-subset smearing (aka Luscher Flowed HMC)
@@ -28,6 +38,101 @@ private:
  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
  typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;
  // Single-direction version of the staple ("base smear") derivative: only
  // terms guarded by mu==mmu or nu==mmu are evaluated and accumulated.
  //   SigmaTerm : in/out; every contribution is added with Gimpl::AddLink
  //   iLambda   : read per direction with peekLorentz
  //   U         : gauge field supplying the links and the upper staples
  //   mmu       : the one direction whose contributions are computed
  //   rho       : weight applied to every term (rho_munu == rho_numu == rho)
  void BaseSmearDerivative(GaugeField& SigmaTerm,
 			   const GaugeField& iLambda,
 			   const GaugeField& U,
 			   int mmu, RealD rho)
  {
    // Reference
    // Morningstar, Peardon, Phys.Rev.D69,054501(2004)
    // Equation 75
    // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
    // Output SigmaTerm
    GridBase *grid = U.Grid();
    WilsonLoops<Gimpl> WL;
    GaugeLinkField staple(grid), u_tmp(grid);
    GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
    GaugeLinkField U_mu(grid), U_nu(grid);
    GaugeLinkField sh_field(grid), temp_Sigma(grid);
    Real rho_munu, rho_numu;
    rho_munu = rho;
    rho_numu = rho;
    for(int mu = 0; mu < Nd; ++mu){
      U_mu       = peekLorentz(      U, mu);
      iLambda_mu = peekLorentz(iLambda, mu);
      for(int nu = 0; nu < Nd; ++nu){
 	if(nu==mu) continue;
 	U_nu       = peekLorentz(      U, nu);
 	// Nd(nd-1) = 12 staples normally.
 	// We must compute 6 of these
 	// in FTHMC case
 	// The staple is only refreshed for pairs that involve mmu; it is only
 	// read inside the mu==mmu / nu==mmu branches below.
 	if ( (mu==mmu)||(nu==mmu) )
 	  WL.StapleUpper(staple, U, mu, nu);
 	if(nu==mmu) {
 	  iLambda_nu = peekLorentz(iLambda, nu);
 	  temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok
 	  //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
 	  sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity?
 	  temp_Sigma = rho_numu*sh_field*staple; //ok
 	  //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
 	}
 	if ( mu == mmu ) { 
 	  sh_field = Cshift(iLambda_mu, nu, 1);
 	  temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok
 	  //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
 	}
 	//	staple = Zero();
 	// Remaining terms are assembled in temp_Sigma, then shifted by -1 in nu
 	// before being added.
 	// NOTE(review): when neither mu nor nu equals mmu, temp_Sigma stays
 	// Zero, so the Cshifts and AddLink below contribute nothing for that
 	// pair -- a possible place to skip work; confirm before changing.
 	sh_field = Cshift(U_nu, mu, 1);
 	temp_Sigma = Zero();
 	if ( mu == mmu )
 	  temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
 	if ( nu == mmu ) {
 	  temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
 	  u_tmp = adj(U_nu)*iLambda_nu;
 	  sh_field = Cshift(u_tmp, mu, 1);
 	  temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
 	}
 	sh_field = Cshift(temp_Sigma, nu, -1);
 	Gimpl::AddLink(SigmaTerm, sh_field, mu);
      }
    }
  }
  // Build Cup for a single direction mu: the sum over nu != mu of
  // adj(rho * staple(mu,nu)) computed from the gauge field U.
  void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) {
    WilsonLoops<Gimpl> loops;
    GaugeLinkField staple_munu(U.Grid());
    Cup = Zero();
    for(int nu=0; nu<Nd; ++nu){
      if (nu == mu) continue;  // the link's own direction contributes no staple
      // nb staple conventions of IroIro and Grid differ by a dagger
      loops.Staple(staple_munu, U, mu, nu);
      Cup += adj(staple_munu*rho);
    }
  }
  // Adjoint vector to GaugeField force
  void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
  {
@@ -47,27 +152,54 @@ private:
    GaugeLinkField UtaU(PlaqL.Grid());
    GaugeLinkField D(PlaqL.Grid());
    AdjMatrixField Dbc(PlaqL.Grid());
    AdjMatrixField Dbc_opt(PlaqL.Grid());
    LatticeComplex tmp(PlaqL.Grid());
    const int Ngen = SU3Adjoint::Dimension;
    Complex ci(0,1);
    ColourMatrix   ta,tb,tc;
-    
+    RealD t=0;
    RealD tp=0;
    RealD tta=0;
    RealD tpk=0;
    t-=usecond();
    for(int a=0;a<Ngen;a++) {
      tta-=usecond();
      SU3::generator(a, ta);
      ta = 2.0 * ci * ta;
      // Qlat Tb = 2i Tb^Grid
-      UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
+      UtaU= adj(PlaqL)*ta*PlaqR; // 6ms
      tta+=usecond();
      ////////////////////////////////////////////
      // Could add this entire C-loop to a projection routine
      // for performance. Could also pick checkerboard on UtaU
      // and set checkerboard on result for 2x perf
      ////////////////////////////////////////////
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
-	D = Ta( (2.0)*ci*tc *UtaU);
+	tc = 2.0*ci*tc;
 	tp-=usecond(); 
 	D = Ta( tc *UtaU); // 2ms
 #if 1
 	SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms
 #else
 	for(int b=0;b<Ngen;b++){
 	  SU3::generator(b, tb);
 	  tmp =-trace(ci*tb*D); 
 	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep
 	}
 #endif
 	tp+=usecond();
      }
-      tmp = trace(MpInvJx * Dbc);
+      //      Dump(Dbc_opt,"Dbc_opt");
      //      Dump(Dbc,"Dbc");
      tpk-=usecond();
      tmp = trace(MpInvJx * Dbc_opt);
      PokeIndex<ColourIndex>(Fdet2,tmp,a);
      tpk+=usecond();
    }
    t+=usecond();
    std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms  proj "<<tp/1e3<< " ms"
 	      << " ta "<<tta/1e3<<" ms" << " poke "<<tpk/1e3<< " ms"<<std::endl;
  }
  void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
@@ -79,12 +211,17 @@ private:
    ColourMatrix   tc;
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, tb);
-      Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
+      tb = 2.0 * ci * tb;
      Nx = Ta( adj(PlaqL)*tb * PlaqR );
 #if 1
      SU3::LieAlgebraProject(NxAd,Nx,b);
 #else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
 	auto tmp =closure( -trace(ci*tc*Nx)); 
 	PokeIndex<ColourIndex>(NxAd,tmp,c,b); 
      }
 #endif
    }
  }
  void ApplyMask(GaugeField &U,int smr)
@@ -164,8 +301,7 @@ public:
    // Computes ALL the staples -- could compute one only and do it here
    RealD time;
    time=-usecond();
-    this->StoutSmearing->BaseSmear(C, U);
+    BaseSmear(Cmu, U,mu,rho);
    Cmu = peekLorentz(C, mu);
    //////////////////////////////////////////////////////////////////
    // Assemble Luscher exp diff map J matrix 
@@ -209,6 +345,36 @@ public:
    // dJ(x)/dxe
    //////////////////////////////////////
    time=-usecond();
 #if 1
    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
    std::vector<AdjMatrix> TRb_s; TRb_s.resize(8);
    AdjMatrixField tbXn(grid);
    AdjMatrixField sumXtbX(grid);
    AdjMatrixField t2(grid);
    AdjMatrixField dt2(grid);
    AdjMatrixField t3(grid);
    AdjMatrixField dt3(grid);
    AdjMatrixField aunit(grid);
    for(int b=0;b<8;b++){
      SU3Adjoint::generator(b, TRb_s[b]);
      dJdX[b] = TRb_s[b];
    }
    aunit = ComplexD(1.0);
    // Could put into an accelerator_for
    X  = (-1.0)*ZxAd; 
    t2 = X;
    for (int j = 12; j > 1; --j) {
      t3  = t2*(1.0 / (j + 1))  + aunit;
      t2  = X * t3;
      for(int b=0;b<8;b++){
 	dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1));
      }
    }
    for(int b=0;b<8;b++){
      dJdX[b] = -dJdX[b];
    }
 #else
    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
    AdjMatrixField tbXn(grid);
    AdjMatrixField sumXtbX(grid);
@@ -224,14 +390,15 @@ public:
      X  = (-1.0)*ZxAd; 
      t2 = X;
      dt2 = TRb;
-      for (int j = 20; j > 1; --j) {
+      for (int j = 12; j > 1; --j) {
-	t3 = t2*(1.0 / (j + 1))  + aunit;
+	t3  = t2*(1.0 / (j + 1))  + aunit;
 	dt3 = dt2*(1.0 / (j + 1));
 	t2 = X * t3;
 	dt2 = TRb * t3 + X * dt3;
      }
      dJdX[b] = -dt2; 
    }
 #endif  
    time+=usecond();
    std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
    /////////////////////////////////////////////////////////////////
@@ -281,8 +448,8 @@ public:
    for(int e =0 ; e<8 ; e++){
      LatticeComplexD tr(grid);
-      ColourMatrix te;
+      //      ColourMatrix te;
-      SU3::generator(e, te);
+      //      SU3::generator(e, te);
      tr = trace(dJdX[e] * nMpInv);
      pokeColour(dJdXe_nMpInv,tr,e);
    }
@@ -493,20 +660,25 @@ public:
    //////////////////////////////////////////////////////////////////
    // Assemble the N matrix
    //////////////////////////////////////////////////////////////////
-    // Computes ALL the staples -- could compute one only here
+    double rho=this->StoutSmearing->SmearRho[1];
-    this->StoutSmearing->BaseSmear(C, U);
+    BaseSmear(Cmu, U,mu,rho);
-    Cmu = peekLorentz(C, mu);
+
    Umu = peekLorentz(U, mu);
    Complex ci(0,1);
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, Tb);
      // Qlat Tb = 2i Tb^Grid
      Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
      // FIXME -- replace this with LieAlgebraProject
 #if 0
      SU3::LieAlgebraProject(Ncb,tmp,b);
 #else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, Tc);
 	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
 	PokeIndex<ColourIndex>(Ncb,tmp,c,b); 
      }
 #endif
    }      
    //////////////////////////////////////////////////////////////////
@@ -693,15 +865,19 @@ private:
 					  const GaugeField& GaugeK,int level) 
  {
    GridBase* grid = GaugeK.Grid();
-    GaugeField C(grid), SigmaK(grid), iLambda(grid);
+    GaugeField SigmaK(grid), iLambda(grid);
    GaugeField SigmaKPrimeA(grid);
    GaugeField SigmaKPrimeB(grid);
    GaugeLinkField iLambda_mu(grid);
    GaugeLinkField iQ(grid), e_iQ(grid);
    GaugeLinkField SigmaKPrime_mu(grid);
    GaugeLinkField GaugeKmu(grid), Cmu(grid);
-    
+
-    this->StoutSmearing->BaseSmear(C, GaugeK);
+    int mmu= (level/2) %Nd;
    int cb= (level%2);
    double rho=this->StoutSmearing->SmearRho[1];
    // Can override this to do one direction only.
    SigmaK = Zero();
    iLambda = Zero();
@@ -712,18 +888,38 @@ private:
    // Could get away with computing only one polarisation here
    // int mu= (smr/2) %Nd;
    // SigmaKprime_A has only one component
-    for (int mu = 0; mu < Nd; mu++)
+#if 0
    BaseSmear(Cmu, GaugeK,mu,rho);
    GaugeKmu = peekLorentz(GaugeK, mu);
    SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
    iQ = Ta(Cmu * adj(GaugeKmu));
    this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
    pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
    pokeLorentz(iLambda, iLambda_mu, mu);
    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
 #else
    //    GaugeField C(grid);
    //    this->StoutSmearing->BaseSmear(C, GaugeK);
    //    for (int mu = 0; mu < Nd; mu++)
    int mu =mmu;
    BaseSmear(Cmu, GaugeK,mu,rho);
    {
-      Cmu = peekLorentz(C, mu);
+      // Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
      iQ = Ta(Cmu * adj(GaugeKmu));
      this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
      pokeLorentz(iLambda, iLambda_mu, mu);
      std::cout << " mu "<<mu<<" SigmaKPrime_mu"<<norm2(SigmaKPrime_mu)<< " iLambda_mu " <<norm2(iLambda_mu)<<std::endl;
    }
-    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
+    //    GaugeField SigmaKcopy(grid);
-
+    //    SigmaKcopy = SigmaK;
    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
    //    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
    //    SigmaKcopy = SigmaKcopy - SigmaK;
    //    std::cout << " BaseSmearDerivative fast path error" <<norm2(SigmaKcopy)<<std::endl;
 #endif
    ////////////////////////////////////////////////////////////////////////////////////
    // propagate the rest of the force as identity map, just add back
    ////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -69,7 +69,7 @@ public:
  /*! Construct stout smearing object from explicitly specified rho matrix */
  Smear_Stout(const std::vector<double>& rho_)
    : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
-    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl
+    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
    }
--- a/Grid/qcd/utils/GaugeGroup.h
+++ b/Grid/qcd/utils/GaugeGroup.h
@@ -100,6 +100,9 @@ class GaugeGroup {
  using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
  template <typename vtype>
  using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
  template <typename vtype>
  using iSUnAlgebraMatrix =
    iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >;
  static int su2subgroups(void) { return su2subgroups(group_name()); }
  //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -128,10 +131,19 @@ class GaugeGroup {
  typedef Lattice<vMatrix> LatticeMatrix;
  typedef Lattice<vMatrixF> LatticeMatrixF;
  typedef Lattice<vMatrixD> LatticeMatrixD;
-
+  
  typedef Lattice<vAlgebraVector> LatticeAlgebraVector;
  typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
  typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;
  typedef iSUnAlgebraMatrix<vComplex>  vAlgebraMatrix;
  typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF;
  typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD;
  typedef Lattice<vAlgebraMatrix>  LatticeAlgebraMatrix;
  typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF;
  typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD;
  typedef iSU2Matrix<Complex> SU2Matrix;
  typedef iSU2Matrix<ComplexF> SU2MatrixF;
@@ -160,7 +172,7 @@ class GaugeGroup {
    return generator(lieIndex, ta, group_name());
  }
-  static void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
+  static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
    return su2SubGroupIndex(i1, i2, su2_index, group_name());
  }
@@ -389,6 +401,52 @@ class GaugeGroup {
    }
  }
 // Ta are hermitian (?)
 // Anti herm is i Ta basis
 // Project `in` onto the algebra basis and store the coefficients in column
 // `b` of `out`, i.e. out(a,b) for every generator index a. Only column b is
 // written by this routine.
 //   out : adjoint-dimension matrix field receiving the coefficients
 //   in  : colour matrix field; treated as traceless ANTI-hermitian
 //   b   : column of `out` to fill
 // Using Luchang's projection convention
 //  2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a
 static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b)
 {
  conformable(in, out);
  GridBase *grid = out.Grid();
  autoView(out_v,out,AcceleratorWrite);
  autoView(in_v,in,AcceleratorRead);
  int N = ncolour;
  int NNm1 = N * (N - 1);
  int hNNm1= NNm1/2;
  // Off-diagonal generators: each su2 subgroup (i1,i2) supplies two
  // coefficients, stored at rows 2*su2Index and 2*su2Index+1.
  for(int su2Index=0;su2Index<hNNm1;su2Index++){
    int i1, i2;
    su2SubGroupIndex(i1, i2, su2Index);
    int ax = su2Index*2;
    int ay = su2Index*2+1;
    accelerator_for(ss,grid->oSites(),1,{
        // in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
        // trace( Ta x Ci in)
        // Bet I need to move to real part with mult by -i
        out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
        out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
      });
  }
  // Diagonal generators: rows NNm1 .. NNm1+N-2. Generator k combines the
  // first k diagonal entries minus k times entry (k,k), scaled by
  // 1/sqrt(2k(k+1)).
  for(int diagIndex=0;diagIndex<N-1;diagIndex++){
    int k = diagIndex + 1; // diagIndex starts from 0
    int a = NNm1+diagIndex;
    RealD scale = 1.0/sqrt(2.0*k*(k+1));
    accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
        auto tmp = in_v[ss]()()(0,0);
        for(int i=1;i<k;i++){
          tmp=tmp+in_v[ss]()()(i,i);
        }
        tmp = tmp - in_v[ss]()()(k,k)*k;
        out_v[ss]()()(a,b) =imag(tmp) * scale;
      });
  }
 }
 };
 template <int ncolour>
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -10,6 +10,7 @@
 // doesn't get found by the scripts/filelist during bootstrapping.
 private:
 template <ONLY_IF_SU>
 static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; }
 ////////////////////////////////////////////////////////////////////////
@@ -576,3 +577,4 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie
  LieRandomize(pRNG,g,1.0);
  GaugeTransform<Gimpl>(Umu,g);
 }
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
@@ -218,6 +218,10 @@ public:
    // -------------------------------------------------
    // misc
    // -------------------------------------------------
    // Add z to state word _s[3], then call encrypt_counter().
    // NOTE(review): _s[3] is presumably the most significant counter word
    // (hence "hi") -- confirm against the layout of _s.
    void discardhi(uint64_t z) {
      _s[3] += z;
      encrypt_counter();
    }
    // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
    // Advances e’s state ei to ei+z by any means equivalent to z
@@ -387,4 +391,4 @@ private:
 #undef MIXK
 #undef MIX2
-#endif
+#endif
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -706,7 +706,7 @@ public:
 	}
      }
    }
-    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -761,7 +761,8 @@ public:
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
-		   Parameters p=Parameters())
+		   Parameters p=Parameters(),
 		   bool preserve_shm=false)
  {
    face_table_computed=0;
    _grid    = grid;
@@ -855,7 +856,9 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();
-    _grid->ShmBufferFreeAll();
+    // Allow for multiple stencils to exist simultaneously
    if (!preserve_shm)
      _grid->ShmBufferFreeAll();
    int maxl=2;
    u_simd_send_buf.resize(maxl);
--- a/Grid/tensors/Tensor_trace.h
+++ b/Grid/tensors/Tensor_trace.h
@@ -69,6 +69,35 @@ accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(t
  }
  return ret;
 }
 ////////////////////////////
 // Fast path traceProduct
 ////////////////////////////
 // Innermost overload of traceProduct, selected when neither operand is a
 // Grid tensor: the result is simply the product arg1*arg2. The iMatrix and
 // iScalar overloads call traceProduct on their _internal members.
 template<class S1 , class S2, IfNotGridTensor<S1> = 0, IfNotGridTensor<S2> = 0>
 accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2)
  -> decltype(arg1*arg2)
 {
  return arg1*arg2;
 }
 // Matrix level: sum over i,j of traceProduct(arg1(i,j), arg2(j,i)), so the
 // trace of the product is accumulated without forming the product matrix.
 template<class vtype,class rtype,int N >
 accelerator_inline auto traceProduct(const iMatrix<vtype,N> &arg1,const iMatrix<rtype,N> &arg2) -> iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))>
 {
  typedef decltype( trace(arg1._internal[0][0]*arg2._internal[0][0] )) element_trace_t;
  iScalar<element_trace_t> sum;
  zeroit(sum._internal);
  for(int row=0;row<N;row++){
    for(int col=0;col<N;col++){
      // Element (row,col) of arg1 pairs with element (col,row) of arg2.
      sum._internal=sum._internal+traceProduct(arg1._internal[row][col],arg2._internal[col][row]);
    }
  }
  return sum;
 }
 // Scalar level: apply traceProduct to the wrapped objects and re-wrap the
 // result in an iScalar.
 template<class vtype,class rtype >
 accelerator_inline auto traceProduct(const iScalar<vtype> &arg1,const iScalar<rtype> &arg2) -> iScalar<decltype(trace(arg1._internal*arg2._internal))>
 {
  typedef decltype(trace(arg1._internal*arg2._internal)) inner_trace_t;
  iScalar<inner_trace_t> wrapped;
  wrapped._internal=traceProduct(arg1._internal,arg2._internal);
  return wrapped;
 }
 NAMESPACE_END(Grid);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -34,9 +34,12 @@ NAMESPACE_BEGIN(Grid);
  // These are the Grid tensors
  template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; };
-  template<class T>        struct isGridTensor<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T>        struct isGridTensor<iScalar<T> >   : public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iVector<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iMatrix<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
  template <typename T>  using IfGridTensor    = Invoke<std::enable_if<isGridTensor<T>::value, int> >;
  template <typename T>  using IfNotGridTensor = Invoke<std::enable_if<!isGridTensor<T>::value, int> >;
  // Traits to identify scalars
  template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; };
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -7,6 +7,8 @@ uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 #define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID"
 #define ENV_RANK_PALS          "PALS_RANKID"
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
@@ -147,7 +149,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
-    hipGetDeviceProperties(&gpu_props[i], i);
+    auto r=hipGetDeviceProperties(&gpu_props[i], i);
    hipDeviceProp_t prop; 
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
@@ -228,8 +230,17 @@ void acceleratorInit(void)
  {
    rank = atoi(localRankStr);		
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
  {
    rank = atoi(localRankStr);		
  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);}
  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
  auto devices = cl::sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){
@@ -241,9 +252,10 @@ void acceleratorInit(void)
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
    if ( world_rank == 0) {
-    GPU_PROP_STR(vendor);
+      GPU_PROP_STR(vendor);
-    GPU_PROP_STR(version);
+      GPU_PROP_STR(version);
    //    GPU_PROP_STR(device_type);
    /*
    GPU_PROP(max_compute_units);
@@ -259,7 +271,8 @@ void acceleratorInit(void)
    GPU_PROP(single_fp_config);
    */
    //    GPU_PROP(double_fp_config);
-    GPU_PROP(global_mem_size);
+      GPU_PROP(global_mem_size);
    }
  }
  if ( world_rank == 0 ) {
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
 inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
@@ -287,23 +289,24 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      unsigned long nt=acceleratorThreads();				\
+    unsigned long nt=acceleratorThreads();				\
-      unsigned long unum1 = num1;					\
+    if(nt < 8)nt=8;							\
-      unsigned long unum2 = num2;					\
+    unsigned long unum1 = num1;						\
-      if(nt < 8)nt=8;							\
+    unsigned long unum2 = num2;						\
-      cl::sycl::range<3> local {nt,1,nsimd};				\
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
-      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
+    cl::sycl::range<3> local {nt,1,nsimd};				\
-      cgh.parallel_for(					\
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
-      cl::sycl::nd_range<3>(global,local), \
+    cgh.parallel_for(							\
-      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
+		     cl::sycl::nd_range<3>(global,local),		\
-      [[intel::reqd_sub_group_size(16)]]	      \
+		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
-      {						      \
+		     [[intel::reqd_sub_group_size(16)]]			\
-      auto iter1    = item.get_global_id(0);	      \
+		     {							\
-      auto iter2    = item.get_global_id(1);	      \
+		       auto iter1    = item.get_global_id(0);		\
-      auto lane     = item.get_global_id(2);	      \
+		       auto iter2    = item.get_global_id(1);		\
-      { __VA_ARGS__ };				      \
+		       auto lane     = item.get_global_id(2);		\
-     });	   			              \
+		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
-    });
+		     });						\
  });
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
@@ -405,7 +408,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 #define accelerator_barrier(dummy)				\
  {								\
-    hipStreamSynchronize(computeStream);			\
+    auto r=hipStreamSynchronize(computeStream);			\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -438,19 +441,21 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };
-inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
+inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
+inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
 inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
+inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); };
 #endif
@@ -575,4 +580,11 @@ accelerator_inline void acceleratorFence(void)
  return;
 }
 // Device-to-device copy of `bytes` bytes from `from` to `to`: issues
 // acceleratorCopyDeviceToDeviceAsynch and then calls
 // acceleratorCopySynchronise() before returning.
 inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
 {
  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
  acceleratorCopySynchronise();
 }
 NAMESPACE_END(Grid);
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@@ -54,15 +54,16 @@ int main(int argc, char **argv)
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 12;
+  MD.MDsteps = 24;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 0;
+  HMCparams.StartTrajectory  = 104;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  HMCparams.StartingType     =std::string("HotStart");
+  //  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
@@ -87,6 +88,7 @@ int main(int argc, char **argv)
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
@@ -134,7 +136,6 @@ int main(int argc, char **argv)
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ActionLevel<HMCWrapper::Field> Level3(4);
  ////////////////////////////////////
  // Strange action
@@ -191,7 +192,7 @@ int main(int argc, char **argv)
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
-  if( ApplySmearing ) Level2.push_back(&Jacobian);
+  if( ApplySmearing ) Level1.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
@@ -200,7 +201,7 @@ int main(int argc, char **argv)
  /////////////////////////////////////////////////////////////
  //  GaugeAction.is_smeared = ApplySmearing;
  GaugeAction.is_smeared = true;
-  Level3.push_back(&GaugeAction);
+  Level2.push_back(&GaugeAction);
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
@@ -210,10 +211,11 @@ int main(int argc, char **argv)
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
-  TheHMC.TheAction.push_back(Level3);
+
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  TheHMC.Run(SmearingPolicy); // for smearing
--- a/HMC/FTHMC2p1f_3GeV.cc
+++ b/HMC/FTHMC2p1f_3GeV.cc
@@ -0,0 +1,226 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
 #include <Grid/qcd/smearing/JacobianAction.h>
 using namespace Grid;
 int main(int argc, char **argv)
 {
  std::cout << std::setprecision(12);
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 24;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.StartingType     =std::string("ColdStart");
  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.saveSmeared   = true;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  Real beta         = 2.37;
  Real light_mass   = 0.0047;
  Real strange_mass = 0.0186;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; // Scale factor one, Shamir
  RealD c   = 0.0;
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 1.0e-2;
  OFRp.hi       = 64;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 14;
  OFRp.precision= 40;
  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeField Uhot(GridPtr);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  bool ApplySmearing = true;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 CG,
 	 CG, CG,
 	 CG, CG, 
 	 OFRp, false);
  EOFA.is_smeared = ApplySmearing;
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Quotients[h]->is_smeared = ApplySmearing;
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // lnDetJacobianAction
  /////////////////////////////////////////////////////////////
  double rho = 0.1;  // smearing parameter
  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
  if( ApplySmearing ) Level1.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  GaugeAction.is_smeared = ApplySmearing;
  Level2.push_back(&GaugeAction);
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  TheHMC.Run(SmearingPolicy); // for smearing
  Grid_finalize();
 } // main
--- a/HMC/HMC2p1f_3GeV.cc
+++ b/HMC/HMC2p1f_3GeV.cc
@@ -0,0 +1,226 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
 #include <Grid/qcd/smearing/JacobianAction.h>
 using namespace Grid;
 int main(int argc, char **argv)
 {
  std::cout << std::setprecision(12);
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 24;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.StartingType     =std::string("ColdStart");
  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.saveSmeared   = true;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  Real beta         = 2.37;
  Real light_mass   = 0.0047;
  Real strange_mass = 0.0186;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; // Scale factor one, Shamir
  RealD c   = 0.0;
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 1.0e-2;
  OFRp.hi       = 64;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 14;
  OFRp.precision= 40;
  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeField Uhot(GridPtr);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  bool ApplySmearing = false;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 CG,
 	 CG, CG,
 	 CG, CG, 
 	 OFRp, false);
  EOFA.is_smeared = ApplySmearing;
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Quotients[h]->is_smeared = ApplySmearing;
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // lnDetJacobianAction
  /////////////////////////////////////////////////////////////
  double rho = 0.1;  // smearing parameter
  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
  if( ApplySmearing ) Level1.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  GaugeAction.is_smeared = ApplySmearing;
  Level2.push_back(&GaugeAction);
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  TheHMC.Run(SmearingPolicy); // for smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
@@ -0,0 +1,350 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1f_DD_EOFA_96I_double.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 // Driver for a 2+1 flavour Mobius domain-wall HMC with Dirichlet (domain
 // decomposed) Hasenbusch factors, all in double precision.
 //
 // Set-up performed here, in order:
 //   * Force Gradient integrator, 3 MD steps, trajectory length 0.5,
 //     one trajectory starting from checkpoint 1077
 //   * Dirichlet block sizes derived from the MPI / shared-memory processor
 //     layout, plus a DDHMCFilter momentum filter of width 4
 //   * Level 1: two rational "boundary" actions (light_mass over the first
 //     Hasenbusch mass)
 //   * Level 2: EOFA strange action and the two-flavour Hasenbusch quotients
 //   * Level 3: Iwasaki gauge action (beta = 2.13)
 // argc/argv are forwarded to Grid_init and TheHMC.ReadCommandLine.
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  CartesianCommunicator::BarrierWorld();
  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
  int threads = GridThread::GetThreads();
   // Typedefs to simplify notation
  typedef WilsonImplD FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef MobiusEOFAFermionD FermionEOFAAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  MD.name    = std::string("Force Gradient");
  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  // MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
  MD.MDsteps =  3;
  MD.trajL   = 0.5;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
  HMCparams.Trajectories     = 1;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_DDHMC_lat";
  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
  Real beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
  Real light_mass_dir = 0.01;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
  int SP_iters=9000;
  RationalActionParams OFRp; // Up/down
  OFRp.lo       = 6.0e-5;
  OFRp.hi       = 90.0;
  OFRp.inv_pow  = 2;
  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
  OFRp.action_tolerance= 1.0e-8;
  OFRp.action_degree   = 18;
  OFRp.md_tolerance= 1.0e-7;
  OFRp.md_degree   = 14;
  //  OFRp.degree   = 20; converges
  //  OFRp.degree   = 16;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
  // Per-pole tolerances: 18 entries matching action_degree, 14 matching md_degree
  std::vector<RealD> ActionTolByPole({
      //      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      3.0e-7,1.0e-7,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8
    });
  std::vector<RealD> MDTolByPole({
      //      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
      //      1.0e-6,3.0e-7,1.0e-7,1.0e-7,
      1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8
    });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  ////////////////////////////////////////////////////////////////
  // Domain decomposed
  ////////////////////////////////////////////////////////////////
  Coordinate latt4  = GridPtr->GlobalDimensions();
  Coordinate mpi    = GridPtr->ProcessorGrid();
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);
  // CommDim[d] is 1 when direction d spans more than one shared-memory group
  Coordinate CommDim(Nd);
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
  Coordinate NonDirichlet(Nd+1,0);
  Coordinate Dirichlet(Nd+1,0);
  // Entry 0 stays zero; entries 1..4 hold the block extent in each 4d direction
  // (local extent times shm extent), or 0 where CommDim is 0.
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  //Dirichlet[1] = 0;
  //Dirichlet[2] = 0;
  //Dirichlet[3] = 0;
  //
  Coordinate Block4(Nd);
  Block4[0] = Dirichlet[1];
  Block4[1] = Dirichlet[2];
  Block4[2] = Dirichlet[3];
  Block4[3] = Dirichlet[4];
  int Width=4;
  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplD::Field>(Block4,Width));
  //////////////////////////
  // Fermion Grids
  //////////////////////////
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD  U(GridPtr); U=Zero();
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  std::cout << "loaded NERSC gauge field"<<std::endl;
  // These lines are unnecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionAction::ImplParams ParamsDir(boundary);
  Params.dirichlet=NonDirichlet;
  ParamsDir.dirichlet=Dirichlet;
  ParamsDir.partialDirichlet=0;
  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
  double StoppingCondition = 1e-8;
  double MDStoppingCondition = 1e-8;
  // NOTE(review): the two tolerances below are not passed to any solver in this
  // function - confirm whether they are meant to be wired in.
  double MDStoppingConditionLoose = 1e-8;
  double MDStoppingConditionStrange = 1e-8;
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(3);
  ActionLevel<HMCWrapper::Field> Level3(15);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  // NOTE(review): StrangeOp / StrangePauliVillarsOp are constructed but not
  // referenced again in this function - confirm they can be removed.
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.1;
  SFRp.hi       = 25.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-8;
  SFRp.mdtolerance= 2.0e-6;
  SFRp.degree   = 12;
  SFRp.precision= 50;
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
    EOFA(Strange_Op_L, Strange_Op_R,
         ActionCG,
         ActionCG, ActionCG,
         DerivativeCG, DerivativeCG,
         SFRp, true);
  Level2.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Mass chain: quotient h is det D(den[h]) / det D(num[h]) as printed below.
  //   den = { light_mass, hasenbusch... }   num = { hasenbusch..., pv_mass }
  // The dirichlet_* flags mark which operators use the Dirichlet block set-up:
  // every Hasenbusch mass does, light_mass (den[0]) and the final pv_mass do not.
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  std::vector<int> dirichlet_den;
  std::vector<int> dirichlet_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
  }
  for(int h=0;h<n_hasenbusch;h++){
    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
  }
  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
  // The operators and actions below are heap allocated and never deleted here;
  // the action levels store the raw pointers that are pushed into them.
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  std::vector<LinearOperatorD *> LinOpD;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage
              << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
    std::cout << "/ det D("<<light_num[h]<<")";
    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
    std::cout << std::endl;
    FermionAction::ImplParams ParamsNum(boundary);
    FermionAction::ImplParams ParamsDen(boundary);
    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
    else                      ParamsNum.dirichlet = NonDirichlet;
    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
    else                      ParamsDen.dirichlet = NonDirichlet;
    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
    else                      ParamsNum.partialDirichlet = 0;
    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
    else                      ParamsDen.partialDirichlet = 0;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
    // NOTE(review): LinOpD is filled but not read again in this function.
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    // (A dead per-factor tolerance "conv" was computed here but never used; it
    //  has been removed.)
    if(h!=0) {
      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
    } else {
      // Two identical rational actions are registered for the h==0 factor
      // (OFRp.inv_pow is 2 above).
      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
    }
  }
  for(auto *bdy : Bdys){
    bdy->SetTolerances(ActionTolByPole,MDTolByPole);
  }
  int nquo=Quotients.size();
  Level1.push_back(Bdys[0]);
  Level1.push_back(Bdys[1]);
  // Register every quotient exactly once, in order.  (The former
  // first / middle / last split pushed Quotients[0] twice when nquo == 1.)
  for(int h=0;h<nquo;h++){
    Level2.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
@@ -343,7 +343,7 @@ int main(int argc, char **argv) {
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.1;
-  SFRp.hi       = 25.0;
+  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-5;
  SFRp.mdtolerance= 2.0e-4;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -128,7 +128,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
-#if 1
+#if 0
      RealD delta=1.e-4;
      std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
@@ -180,7 +180,7 @@ int main(int argc, char **argv) {
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  14;
+  MD.MDsteps =  12;
  MD.trajL   = 0.5;
  HMCparameters HMCparams;
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
@@ -218,15 +218,14 @@ int main(int argc, char **argv) {
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
-  Real beta         = 2.13;
+  RealD beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
  //  Real light_mass     = 7.8e-3;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
-  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
-  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
@@ -277,20 +276,20 @@ int main(int argc, char **argv) {
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-9;
+  double StoppingCondition = 1e-14;
-  double MDStoppingCondition = 1e-8;
+  double MDStoppingCondition = 1e-9;
-  double MDStoppingConditionLoose = 1e-8;
+  double MDStoppingConditionLoose = 1e-9;
-  double MDStoppingConditionStrange = 1e-8;
+  double MDStoppingConditionStrange = 1e-9;
-  double MaxCGIterations = 300000;
+  double MaxCGIterations = 50000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
-  //  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(1);
+  ActionLevel<HMCWrapper::Field> Level2(2);
-  ActionLevel<HMCWrapper::Field> Level3(15);
+  ActionLevel<HMCWrapper::Field> Level3(4);
  ////////////////////////////////////
  // Strange action
@@ -300,11 +299,11 @@ int main(int argc, char **argv) {
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.1;
+  SFRp.lo       = 0.8;
  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-8;
+  SFRp.tolerance= 1.0e-12;
-  SFRp.mdtolerance= 2.0e-6;
+  SFRp.mdtolerance= 1.0e-9;
  SFRp.degree   = 10;
  SFRp.precision= 50;
@@ -355,8 +354,10 @@ int main(int argc, char **argv) {
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
-	 ActionCGL, ActionCGR,
+	 //	 ActionCGL, ActionCGR,
-	 DerivativeCGL, DerivativeCGR,
+	 //	 DerivativeCGL, DerivativeCGR,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 SFRp, true);
  Level2.push_back(&EOFA);
@@ -443,13 +444,14 @@ int main(int argc, char **argv) {
  }
  int nquo=Quotients.size();
  for(int h=0;h<nquo;h++){
-    Level2.push_back(Quotients[h]);
+    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
@@ -0,0 +1,268 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 // HMC driver: 2+1 flavour Mobius domain-wall run.
 //  - strange quark : exact one flavour (EOFA) ratio action            -> Level2
 //  - light quarks  : chain of two flavour Hasenbusch ratio actions    -> Level1
 //  - gauge         : Iwasaki gauge action                             -> Level3
 // Every solver constructed here is a ConjugateGradient<FermionField>; the
 // single precision typedefs below are declared but not used in this file.
 int main(int argc, char **argv) {
  using namespace Grid;
  std::cout << " Grid Initialise "<<std::endl;
  Grid_init(&argc, &argv);
  CartesianCommunicator::BarrierWorld();
  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
  // NOTE(review): 'threads' is not referenced again in this function
  int threads = GridThread::GetThreads();
   // Typedefs to simplify notation
  typedef WilsonImplD FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef MobiusEOFAFermionD FermionEOFAAction;
  typedef typename FermionAction::FermionField FermionField;
  // The *F typedefs are not referenced again in this function
  typedef WilsonImplF FermionImplPolicyF;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator selection: exactly one HMCWrapper typedef / MD.name pair is active
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  MD.name    = std::string("Force Gradient");
  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  //  MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
  MD.MDsteps =  8;
  MD.trajL   = 0.5;
  // Trajectory bookkeeping and starting configuration
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
  HMCparams.Trajectories     = 20;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  // NOTE(review): ColdStart is selected together with StartTrajectory=1077;
  // confirm this is intended rather than CheckpointStart (commented out below)
  HMCparams.StartingType     =std::string("ColdStart");
  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpoint file name prefixes, save interval and on-disk format
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_HMC_lat";
  CPparams.rng_prefix    = "ckpoint_HMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  // Seeds for the serial and parallel random number generators
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Action parameters: fifth dimension extent, Mobius M5/b/c, gauge coupling
  // and the quark masses
  const int Ls      = 12;
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
  RealD beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
  //  Real light_mass     = 7.8e-3;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
  // Intermediate masses: the light determinant ratio is factored as
  // light_mass -> hasenbusch[0] -> ... -> hasenbusch[last] -> pv_mass (see the
  // light_den / light_num construction further down)
  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  ////////////////////////////////////////////////////////////////
  // Domain decomposed
  ////////////////////////////////////////////////////////////////
  // NOTE(review): latt4, mpi and shm are filled in here but not referenced
  // again in this function
  Coordinate latt4  = GridPtr->GlobalDimensions();
  Coordinate mpi    = GridPtr->ProcessorGrid();
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);
  //////////////////////////
  // Fermion Grids
  //////////////////////////
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD  U(GridPtr); U=Zero();
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  std::cout << "loaded NERSC gauge field"<<std::endl;
  // These lines are unnecessary if BC are all periodic
  // (the last direction gets a -1 boundary phase, the other three +1)
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
  // Solver tolerances: StoppingCondition feeds CG / ActionCG,
  // MDStoppingCondition feeds MDCG / DerivativeCG.
  // NOTE(review): MDStoppingConditionStrange is not referenced again, and
  // MDStoppingConditionLoose is only assigned to the unused local 'conv' below.
  double StoppingCondition = 1e-14;
  double MDStoppingCondition = 1e-9;
  double MDStoppingConditionLoose = 1e-9;
  double MDStoppingConditionStrange = 1e-9;
  // NOTE(review): an iteration limit held in a double
  double MaxCGIterations = 50000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  // Level1 <- light quark quotients, Level2 <- strange EOFA, Level3 <- gauge.
  // The constructor argument is presumably the integrator step multiplier of
  // each level -- TODO confirm against ActionLevel.
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ActionLevel<HMCWrapper::Field> Level3(4);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  // NOTE(review): StrangeOp and StrangePauliVillarsOp are constructed but not
  // referenced again in this function
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  // Probably dominates the force - back to EOFA.
  // Rational approximation parameters handed to the EOFA action
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.8;
  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-12;
  SFRp.mdtolerance= 1.0e-9;
  SFRp.degree   = 10;
  SFRp.precision= 50;
  // Left / right EOFA operators between strange_mass and pv_mass
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
  // NOTE(review): Strange_LinOp_L / Strange_LinOp_R are not referenced again
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  // The same ActionCG is passed for three solver slots and the same
  // DerivativeCG for two
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 SFRp, true);
  Level2.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass chain:
  //   light_den = { light_mass, hasenbusch[0], ..., hasenbusch[n-1] }
  //   light_num = { hasenbusch[0], ..., hasenbusch[n-1], pv_mass }
  // so quotient h pairs light_den[h] with light_num[h]
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass); 
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
  }
  for(int h=0;h<n_hasenbusch;h++){
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  // Objects created with new below are never deleted in this function
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  // NOTE(review): Bdys is declared but never filled or read in this function
  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
  // Repeats the LinearOperatorD typedef made earlier in this function
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  // NOTE(review): LinOpD is filled in the loop but not read afterwards
  std::vector<LinearOperatorD *> LinOpD;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
    std::cout << "/ det D("<<light_num[h]<<")";
    std::cout << std::endl;
    FermionAction::ImplParams ParamsNum(boundary);
    FermionAction::ImplParams ParamsDen(boundary);
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    // NOTE(review): 'conv' is computed but never passed to any solver; every
    // quotient below uses MDCG and CG. The test h<3 also covers three factors
    // (h=0,1,2), not the two the original comment mentioned.
    double conv  = MDStoppingCondition;
    if (h<3) conv= MDStoppingConditionLoose; // Relax on the first hasenbusch factors (currently has no effect)
    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG,CG));
  }
  // All light quark quotients go on Level1
  int nquo=Quotients.size();
  for(int h=0;h<nquo;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  // Register the levels in the order Level1, Level2, Level3
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/MPI_benchmark/bench2.pbs
+++ b/MPI_benchmark/bench2.pbs
@@ -0,0 +1,22 @@
 #!/bin/bash
 #PBS -q EarlyAppAccess
 #PBS -l select=2
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # Two-node run of the stand-alone halo_mpi benchmark: one rank per node,
 # split 2.1.1.1, launched through the gpu_tile_compact.sh binding wrapper.
 export TZ='/usr/share/zoneinfo/US/Central'
 export OMP_PROC_BIND=spread
 export OMP_NUM_THREADS=3
 unset OMP_PLACES
 # Stop here if the submission directory is unreachable; the relative paths
 # in CMD below only resolve from there.
 cd "$PBS_O_WORKDIR" || exit 1
 # NOTE(review): the rank-count variables below are informational only; the
 # mpiexec line hard-codes "-np 2 -ppn 1" and does not read them.
 NNODES=$(wc -l < "$PBS_NODEFILE")
 NRANKS=12         # Number of MPI ranks per node
 NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
 NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
 NTOTRANKS=$(( NNODES * NRANKS ))
 CMD="mpiexec -np 2 -ppn 1  -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
 # Deliberately unquoted: CMD is split into words to form the command line
 $CMD
--- a/MPI_benchmark/compile-command
+++ b/MPI_benchmark/compile-command
@@ -0,0 +1 @@
 mpicxx  -fsycl halo_mpi.cc -o halo_mpi
--- a/MPI_benchmark/gpu_tile_compact.sh
+++ b/MPI_benchmark/gpu_tile_compact.sh
@@ -0,0 +1,30 @@
 #!/bin/bash
 # Per-rank binding wrapper: picks a GPU tile and NUMA placement from the
 # node-local rank id, then runs the wrapped command ("$@") under numactl.
 #
 # Each table below has 12 entries and is indexed by $PALS_LOCAL_RANKID:
 #   NUMA_PMAP -> node given to "numactl -m" (memory binding)
 #   NUMA_MAP  -> node given to "numactl -N" (CPU node binding)
 #   GPU_MAP   -> GPU index   (first field of ZE_AFFINITY_MASK)
 #   TILE_MAP  -> tile index  (second field of ZE_AFFINITY_MASK)
 # NOTE(review): a local rank id outside 0..11 selects an empty entry; confirm
 # the launcher never uses more than 12 ranks per node.
 export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
 export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 # Look up this rank's entries
 export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 # Restrict the Level Zero runtime to the single <gpu>.<tile> chosen above
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 # Alternative runtime settings kept for reference (all disabled)
 #unset EnableWalkerPartition
 #export EnableImplicitScaling=0
 #export GRID_MPICH_NIC_BIND=$NIC
 #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 #export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 # Log the placement, then run the wrapped command with that binding
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 numactl -m $PNUMA -N $NUMA  "$@"
--- a/MPI_benchmark/halo_mpi.cc
+++ b/MPI_benchmark/halo_mpi.cc
@@ -0,0 +1,333 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
 #include <cctype>
 #include <complex>
 #include <ctime>
 #include <memory>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <sys/time.h>
 #include <mpi.h>
 /**************************************************************
 * GPU - GPU memory cartesian halo exchange benchmark
 * Config: what is the target
 *
 * Each ACC_* macro guards one implementation of acceleratorInit /
 * acceleratorAllocDevice / acceleratorFreeDevice further down, so
 * exactly one of them should be #define'd; the rest stay #undef'd.
 **************************************************************
 */
 #undef ACC_CUDA
 #undef  ACC_HIP
 #define  ACC_SYCL
 #undef  ACC_NONE
 /**************************************************************
 * Some MPI globals
 * All of these are filled in at the start of main()
 **************************************************************
 */
 MPI_Comm WorldComm;     /* set to MPI_COMM_WORLD */
 MPI_Comm WorldShmComm;  /* WorldComm split with MPI_COMM_TYPE_SHARED */
 int WorldSize;          /* number of ranks in WorldComm */
 int WorldRank;          /* this rank in WorldComm */
 int WorldShmSize;       /* number of ranks in WorldShmComm (reported as ranks-per-node) */
 int WorldShmRank;       /* this rank in WorldShmComm */
 /**************************************************************
 * Allocate buffers on the GPU, SYCL needs an init call and context
 **************************************************************
 */
 #ifdef ACC_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>   /* runtime API: cudaMalloc / cudaFree / cudaGetErrorString */
 /* CUDA backend: nothing to set up before allocating */
 void acceleratorInit(void){}
 /*
  * Allocate 'bytes' of device memory.
  * A failed allocation is fatal: report it and abort. The check is a real
  * branch rather than an assert so it also fires in NDEBUG builds.
  */
 void *acceleratorAllocDevice(size_t bytes)
 {
   void *ptr=NULL;
   auto err = cudaMalloc((void **)&ptr,bytes);
   if ( err != cudaSuccess ) {
     fprintf(stderr," cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
     abort();
   }
   return ptr;
 }
 /* Release memory obtained from acceleratorAllocDevice */
 void acceleratorFreeDevice(void *ptr){  cudaFree(ptr);}
 #endif
 #ifdef ACC_HIP
 #include <hip/hip_runtime.h>
 /* HIP backend: nothing to set up before allocating */
 void acceleratorInit(void){}
 /*
  * Allocate 'bytes' of device memory.
  * Returns NULL (after printing the HIP error string) when hipMalloc fails.
  */
 inline void *acceleratorAllocDevice(size_t bytes)
 {
   void *ptr=NULL;
   auto err = hipMalloc((void **)&ptr,bytes);
   if( err != hipSuccess ) {
     ptr = (void *) NULL;
     /* %zu is the conversion for size_t */
     printf(" hipMalloc failed for %zu %s \n",bytes,hipGetErrorString(err));
   }
   return ptr;
 }
 /* Release memory obtained from acceleratorAllocDevice; the status is deliberately ignored */
 inline void acceleratorFreeDevice(void *ptr){ (void)hipFree(ptr); }
 #endif
 #ifdef ACC_SYCL
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 cl::sycl::queue *theAccelerator;
 void acceleratorInit(void)
 {
  int nDevices = 1;
 #if 1
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theAccelerator = new sycl::queue (selectedDevice);
 #else
  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v  };
  theAccelerator = new sycl::queue (selectedDevice);
 #endif
  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
 }
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
 #endif
 #ifdef ACC_NONE
 /* Host-only fallback: there is no device runtime to bring up */
 void acceleratorInit(void)
 {
 }
 /* "Device" memory is ordinary heap memory in this configuration */
 inline void *acceleratorAllocDevice(size_t bytes)
 {
   void *host_block = malloc(bytes);
   return host_block;
 }
 /* Counterpart of acceleratorAllocDevice: hand the block back to the heap */
 inline void acceleratorFreeDevice(void *ptr)
 {
   free(ptr);
 }
 #endif
 /**************************************************************
 * Microsecond timer
 **************************************************************
 */
 /* Wall-clock time from gettimeofday(), expressed in microseconds */
 inline double usecond(void) {
   struct timeval now;
   gettimeofday(&now,NULL);
   const double seconds_part = 1.0e6*now.tv_sec;   /* whole seconds, scaled to microseconds */
   const double micros_part  = 1.0*now.tv_usec;    /* sub-second remainder, already microseconds */
   return seconds_part + micros_part;
 }
 /**************************************************************
 * Main benchmark routine
 **************************************************************
 */
 void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
 {
  int64_t words = 3*4*2;
  int64_t face,vol;
  int Nd=cart_geom.size();
  /**************************************************************
   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
   * Allocate memory for these
   **************************************************************
   */
  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
  vol=1;  for( int d=0;d<Nd;d++) vol = vol*L;
  std::vector<void *> send_bufs;
  std::vector<void *> recv_bufs;
  size_t vw = face*words;
  size_t bytes = face*words*sizeof(double);
  if ( use_device ) {
    for(int d=0;d<2*Nd;d++){
      send_bufs.push_back(acceleratorAllocDevice(bytes));
      recv_bufs.push_back(acceleratorAllocDevice(bytes));
    }
  } else {
    for(int d=0;d<2*Nd;d++){
      send_bufs.push_back(malloc(bytes));
      recv_bufs.push_back(malloc(bytes));
    }
  }
  /*********************************************************
   * Build cartesian communicator
   *********************************************************
   */
  int ierr;
  int rank;
  std::vector<int> coor(Nd);
  MPI_Comm communicator;
  std::vector<int> periodic(Nd,1);
  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
  MPI_Comm_rank(communicator,&rank);
  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
  static int reported;
  if ( ! reported ) { 
    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
 	 coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
    reported =1 ;
  }
  /*********************************************************
   * Perform halo exchanges
   *********************************************************
   */
  for(int d=0;d<Nd;d++){
    if ( cart_geom[d]>1 ) {
      double t0=usecond();
      int from,to;
      MPI_Barrier(communicator);
      for(int n=0;n<ncall;n++){
 	void *xmit = (void *)send_bufs[d];
 	void *recv = (void *)recv_bufs[d];
 	ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
 	assert(ierr==0);
 	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
 			  recv,bytes,MPI_CHAR,from, from,
 			  communicator,MPI_STATUS_IGNORE);
 	assert(ierr==0);
 	xmit = (void *)send_bufs[Nd+d];
 	recv = (void *)recv_bufs[Nd+d];
 	ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
 	assert(ierr==0);
 	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
 			  recv,bytes,MPI_CHAR,from, from,
 			  communicator,MPI_STATUS_IGNORE);
 	assert(ierr==0);
      }
      MPI_Barrier(communicator);
      double t1=usecond();
      double dbytes    = bytes*WorldShmSize;
      double xbytes    = dbytes*2.0*ncall;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;
      if ( ! WorldRank ) {
 	printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
      }
    }
  }
  /*********************************************************
   * Free memory
   *********************************************************
   */
  if ( use_device ) {
    for(int d=0;d<2*Nd;d++){
      acceleratorFreeDevice(send_bufs[d]);
      acceleratorFreeDevice(recv_bufs[d]);
    }
  } else {
    for(int d=0;d<2*Nd;d++){
      free(send_bufs[d]);
      free(recv_bufs[d]);
    }
  }
 }
 /**************************************
 * Command line junk
 **************************************/
 /*
  * Return the argument that follows the first occurrence of 'option' in
  * [begin,end), or an empty string when the option is absent or is the
  * last argument.
  */
 std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
 {
   for (char **arg = begin; arg != end; ++arg) {
     if (*arg != option) continue;
     // Only the first match counts, whether or not it has a payload
     char **value = arg + 1;
     if (value == end) break;
     return std::string(*value);
   }
   return std::string("");
 }
 /* True when some argument in [begin,end) is exactly equal to 'option' */
 bool CmdOptionExists(char** begin, char** end, const std::string& option)
 {
   for (char **arg = begin; arg != end; ++arg) {
     if (*arg == option) return true;
   }
   return false;
 }
 /*
  * Parse a list of integers such as "2.1.1.1" into 'vec'.
  * 'vec' is emptied first. After each integer, one following punctuation
  * character (if any) is skipped as a separator. Parsing stops at the first
  * position where no integer can be read, so "2.x.1" yields just {2}.
  * Needs <sstream> (std::istringstream) and <cctype> (std::ispunct).
  */
 void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
 {
   vec.clear();
   std::istringstream ss(str);
   int value;
   while (ss >> value){
     vec.push_back(value);
     // peek() reports end-of-input as eof(); test for that before classifying
     const int next = ss.peek();
     if (next != std::char_traits<char>::eof() && std::ispunct(next))
       ss.ignore();
   }
 }
 /**************************************
 * Main program: set up MPI, read --mpi, run the host and device sweeps
 **************************************/
 /*
  * Benchmark driver.
  *
  * Initialises the accelerator and MPI, reads the rank geometry from
  * "--mpi n1.n2.n3.n4", then sweeps L = 16..64 in steps of 4, first with
  * host buffers and then with device buffers.
  *
  * Returns EXIT_FAILURE (after MPI_Finalize) when --mpi is missing or does
  * not describe a usable geometry, and 0 otherwise.
  */
 int main(int argc, char **argv)
 {
   std::string arg;

   acceleratorInit();

   MPI_Init(&argc,&argv);

   WorldComm = MPI_COMM_WORLD;

   MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);

   MPI_Comm_rank(WorldComm     ,&WorldRank);
   MPI_Comm_size(WorldComm     ,&WorldSize);

   MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
   MPI_Comm_size(WorldShmComm     ,&WorldShmSize);

   if ( WorldSize/WorldShmSize > 2) {
     printf("This benchmark is meant to run on at most two nodes only\n");
   }

   auto mpi =std::vector<int>({1,1,1,1});

   if( !CmdOptionExists(argv,argv+argc,"--mpi") ){
     // Usage error: report once, shut MPI down cleanly, fail the job
     if( !WorldRank ) {
       printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
       fflush(stdout);
     }
     MPI_Finalize();
     return EXIT_FAILURE;
   }
   arg = CmdOptionPayload(argv,argv+argc,"--mpi");
   CmdOptionIntVector(arg,mpi);

   // The geometry must hold at least the four coordinates the rank report
   // prints, every extent must be positive, and the extents must multiply up
   // to the number of ranks or the Cartesian communicator cannot be built.
   bool geometry_ok = mpi.size() >= 4;
   long long nranks = 1;
   for(size_t d=0;d<mpi.size() && geometry_ok;d++){
     if ( mpi[d] < 1 ) {
       geometry_ok = false;
     } else {
       nranks *= mpi[d];
       if ( nranks > WorldSize ) geometry_ok = false;   // also keeps nranks from overflowing
     }
   }
   if ( nranks != WorldSize ) geometry_ok = false;
   if( !geometry_ok ) {
     if( !WorldRank ) {
       printf("Bad --mpi argument '%s': need <n1.n2.n3.n4> with positive entries whose product is %d ranks\n",
              arg.c_str(),WorldSize);
       fflush(stdout);
     }
     MPI_Finalize();
     return EXIT_FAILURE;
   }

   if( !WorldRank ) {
     printf("***********************************\n");
     printf("%d ranks\n",WorldSize); 
     printf("%d ranks-per-node\n",WorldShmSize);
     printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
     printf("Cartesian layout: ");
     for(size_t d=0;d<mpi.size();d++){
       printf("%d ",mpi[d]);
     }
     printf("\n");fflush(stdout);
     printf("***********************************\n");
   }

   if( !WorldRank ) {
     printf("=========================================================\n");
     printf("= Benchmarking HOST memory MPI performance               \n");
     printf("=========================================================\n");fflush(stdout);
     printf("= L\t pkt bytes\t MB/s           \n");
     printf("=========================================================\n");fflush(stdout);
   }

   // Host buffers
   for(int L=16;L<=64;L+=4){
     Benchmark(L,mpi,false,100);
   }  

   if( !WorldRank ) {
     printf("=========================================================\n");
     printf("= Benchmarking DEVICE memory MPI performance             \n");
     printf("=========================================================\n");fflush(stdout);
   }
   // Device buffers
   for(int L=16;L<=64;L+=4){
     Benchmark(L,mpi,true,100);
   }  

   if( !WorldRank ) {
     printf("=========================================================\n");
     printf("= DONE   \n");
     printf("=========================================================\n");
   }
   MPI_Finalize();
   return 0;
 }
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -90,11 +90,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  Dirichlet[0] = 0;
+  //  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  Benchmark(Ls,Dirichlet);
@@ -105,11 +105,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  Dirichlet[0] = 0;
+  //  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
+  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
+  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
+  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
+  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
  Benchmark(Ls,Dirichlet);
@@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  GaugeField Umu(UGrid);
  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
  //  SU<Nc>::ColdConfiguration(Umu);
  UmuCopy=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
@@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet)
    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();
      std::cout<<GridLogMessage << "RESULT" << std::endl;
      //      std::cout << result<<std::endl;
      std::cout << norm2(result)<<std::endl;
      std::cout<<GridLogMessage << "REF" << std::endl;
      std::cout << norm2(ref)<<std::endl;
      std::cout<<GridLogMessage << "ERR" << std::endl;
      std::cout << norm2(err)<<std::endl;
      FGrid->Barrier();
      exit(-1);
    }
    assert (n2e< 1.0e-4 );
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 set -e
-EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
+EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
-EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
+EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'
 echo "-- deploying Eigen source..."
-ARC=`basename ${EIGEN_URL}`
+ARC=$(basename ${EIGEN_URL})
 wget ${EIGEN_URL} --no-check-certificate
 if command -v sha256sum; then
   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
@@ -14,13 +14,8 @@ if command -v sha256sum; then
 else
   echo "WARNING: could not verify checksum, please install sha256sum" >&2
 fi
-./scripts/update_eigen.sh ${ARC}
+./scripts/update_eigen.sh "${ARC}"
-rm ${ARC}
+rm "${ARC}"
 # patch for non-portable includes in Eigen 3.3.5
 # apparently already fixed in Eigen HEAD so it should not be 
 # a problem in the future (A.P.)
 patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
--- a/scripts/eigen-3.3.5.Tensor.patch
+++ b/scripts/eigen-3.3.5.Tensor.patch
@@ -1,19 +0,0 @@
 --- ./Eigen/unsupported/Eigen/CXX11/Tensor	2018-07-23 10:33:42.000000000 +0100
 +++ Tensor	2018-08-28 16:15:56.000000000 +0100
@@ -25,7 +25,7 @@
 #include <utility>
 #endif
 -#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "../SpecialFunctions"
 #include "src/util/CXX11Meta.h"
@@ -147,6 +147,6 @@
 #include "src/Tensor/TensorIO.h"
 -#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 //#endif // EIGEN_CXX11_TENSOR_MODULE
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -0,0 +1,56 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=1024
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # 1024-node job: one comms benchmark followed by two Benchmark_dwf_fp32 runs,
 # all launched through the gpu_tile_compact.sh binding wrapper.
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 # Run from the submission directory; sourceme.sh is expected one level up
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 # Record the allocated nodes in the job output
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 # MPICH GPU pipelining settings
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 1024 nodes, 12288 ranks
 #
 # Comms benchmark, rank grid 8.6.16.16 (= 12288 ranks).
 # NOTE(review): --grid 64.48.64.284 has a last extent (284) that is not a
 # multiple of the 16 ranks in that direction -- confirm whether 384 was meant
 # or whether this benchmark ignores --grid.
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 # DWF benchmark on the 128.128.128.384 lattice (16^4 sites per rank); output is also saved to 1024node.dwf.small
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 1024node.dwf.small
 # DWF benchmark on the 256.256.256.384 lattice (16.32.32.32 sites per rank); output is also saved to 1024node.dwf
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 1024node.dwf
--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -0,0 +1,45 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=2
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # 2-node batch job (24 ranks, 12 per node): runs Benchmark_comms_host_device
 # and then Benchmark_dwf_fp32, both through the ./gpu_tile_compact.sh wrapper.
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 # Run from the submission directory and pick up the site environment.
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 export OMP_NUM_THREADS=3
 # MPICH GPU pipeline settings exported to every rank (mpiexec is run with -envall).
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # Comms benchmark; output goes to the job's stdout only.
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 # DWF benchmark with --comms-overlap; output goes to the job's stdout only.
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
--- a/systems/Aurora/benchmarks/bench2048.pbs
+++ b/systems/Aurora/benchmarks/bench2048.pbs
@@ -0,0 +1,56 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=2048
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # 2048-node batch job: runs Benchmark_comms_host_device once, then
 # Benchmark_dwf_fp32 twice (output tee'd to 2048node.dwf.small and 2048node.dwf).
 # Every run is started by mpiexec through the ./gpu_tile_compact.sh wrapper.
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 # Run from the submission directory and pick up the site environment.
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 # Record the allocated node list in the job output.
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 # MPICH GPU pipeline settings exported to every rank (mpiexec is run with -envall).
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 2048 nodes, 24576 ranks
 #
 # NOTE(review): the last --grid extent below (284) is not a multiple of the
 # last --mpi extent (16) -- confirm this lattice size is intended.
 CMD="mpiexec -np 24576 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 # Smaller of the two DWF volumes on the full 24576-rank partition.
 CMD="mpiexec -np 24576 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 2048node.dwf.small
 # Larger DWF volume, same rank decomposition.
 CMD="mpiexec -np 24576 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 2048node.dwf
--- a/systems/Aurora/benchmarks/bench256.pbs
+++ b/systems/Aurora/benchmarks/bench256.pbs
@@ -0,0 +1,48 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=256
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # 256-node batch job: runs Benchmark_comms_host_device, then Benchmark_dwf_fp32
 # (output tee'd to 256node.dwf.large), both through ./gpu_tile_compact.sh.
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 # Run from the submission directory and pick up the site environment.
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 # Record the allocated node list in the job output.
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 # MPICH GPU pipeline settings exported to every rank (mpiexec is run with -envall).
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 256 nodes, 3072 ranks
 #
 CMD="mpiexec -np 3072 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 # DWF benchmark with --comms-overlap on all 3072 ranks.
 CMD="mpiexec -np 3072 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 256node.dwf.large
--- a/systems/Aurora/benchmarks/bench512.pbs
+++ b/systems/Aurora/benchmarks/bench512.pbs
@@ -0,0 +1,48 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=512
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # 512-node batch job: runs Benchmark_comms_host_device, then Benchmark_dwf_fp32
 # (output tee'd to 512node.dwf.large), both through ./gpu_tile_compact.sh.
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 # Run from the submission directory and pick up the site environment.
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 # Record the allocated node list in the job output.
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 # MPICH GPU pipeline settings exported to every rank (mpiexec is run with -envall).
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 512 nodes, 6144 ranks
 #
 CMD="mpiexec -np 6144 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 # DWF benchmark with --comms-overlap on all 6144 ranks.
 CMD="mpiexec -np 6144 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 512node.dwf.large
--- a/systems/Aurora/benchmarks/bench_scaling.pbs
+++ b/systems/Aurora/benchmarks/bench_scaling.pbs
@@ -0,0 +1,80 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=32
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # Scaling study inside one 32-node allocation: after a 384-rank comms benchmark,
 # Benchmark_dwf_fp32 is run on 12, 24, 48, 96, 192 and 384 ranks (12 per node),
 # doubling one --mpi extent and the matching --grid extent at each step.
 # Each DWF run is tee'd to <N>node.dwf.
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 # Run from the submission directory and pick up the site environment.
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 # Record the allocated node list in the job output.
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 # MPICH GPU pipeline settings exported to every rank (mpiexec is run with -envall).
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 32 nodes, 384 ranks
 #
 CMD="mpiexec -np 384 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 # 1 node, 12 ranks
 CMD="mpiexec -np 12 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 1node.dwf
 # 2 nodes, 24 ranks
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 2node.dwf
 # 4 nodes, 48 ranks
 CMD="mpiexec -np 48 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 4node.dwf
 # 8 nodes, 96 ranks
 CMD="mpiexec -np 96 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 8node.dwf
 # 16 nodes, 192 ranks
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 16node.dwf
 # 32 nodes, 384 ranks
 CMD="mpiexec -np 384 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 32node.dwf
--- a/systems/Aurora/benchmarks/gpu_tile_compact.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@@ -0,0 +1,33 @@
 #!/bin/bash
 # Per-rank launch wrapper for 12 ranks per node. Each table below has one entry
 # per local rank and is indexed by $PALS_LOCAL_RANKID; the wrapper sets
 # ZE_AFFINITY_MASK to <gpu>.<tile> for this rank and then runs the wrapped
 # command ("$@") under numactl.
 # NUMA_MAP  -> node passed to numactl -m (memory)
 # NUMA_PMAP -> node passed to numactl -N (CPU)
 export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
 #export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
 export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 # Look up this rank's entries.
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 # NIC is exported but its only other use in this script is the disabled line below.
 #export GRID_MPICH_NIC_BIND=$NIC
 #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 #export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 #echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 # Memory from node $NUMA, CPUs from node $NUMAP.
 numactl -m $NUMA -N $NUMAP  "$@"
--- a/systems/Aurora/benchmarks/gpu_tile_compact4.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact4.sh
@@ -0,0 +1,29 @@
 #!/bin/bash
 # Per-rank launch wrapper for 8 ranks per node. Each table below has one entry
 # per local rank and is indexed by $PALS_LOCAL_RANKID; the wrapper sets
 # ZE_AFFINITY_MASK to <gpu>.<tile> for this rank and then runs the wrapped
 # command ("$@") under numactl.
 # NUMA_MAP -> node passed to numactl -m (memory)
 # PROC_MAP -> node passed to numactl -N (CPU)
 export  NUMA_MAP=(2 2 3 3  2 2  3 3  )
 export  PROC_MAP=(0 0 1 1  0 0  1 1  )
 export  NIC_MAP=(0 0  4 4  1 1  5 5  )
 export  GPU_MAP=(0 1  3 4  0 1  3 4  )
 export TILE_MAP=(0 0  0 0  1 1  1 1  )
 # Look up this rank's entries.
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 #export GRID_MPICH_NIC_BIND=$NIC
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
 export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 # GRID_MPICH_NIC_BIND is not set by this script (its export above is disabled),
 # so the NIC field below shows whatever the caller's environment provides.
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
 # PROC_MAP must be indexed by the local rank: a bare $PROC_MAP expands to
 # element 0 only, which would bind every rank to CPU node 0.
 numactl -m $NUMA -N ${PROC_MAP[$PALS_LOCAL_RANKID]}  "$@"
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -0,0 +1,16 @@
 # Configure command for a SYCL build (CXX=icpx, --enable-accelerator=sycl) with
 # MPI comms, run from a build directory two levels below the source tree.
 # TOOLS is the prefix whose lib64/ and include/ are added to the link and
 # compile flags.
 # NOTE(review): CXXFLAGS also uses $INSTALL, which this script never sets --
 # confirm it is expected to come from the caller's environment.
 TOOLS=$HOME/tools
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
--- a/systems/Aurora/proxies.sh
+++ b/systems/Aurora/proxies.sh
@@ -0,0 +1,9 @@
 # Environment setup: routes HTTP/HTTPS traffic (both upper- and lower-case
 # variable spellings) and git through proxy.alcf.anl.gov:3128, enables
 # MPIR_CVAR_CH4_OFI_ENABLE_HMEM, and loads the compute-runtime module.
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 # --global: this writes to the user's global git configuration, not just this shell.
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -0,0 +1,12 @@
 # Environment setup sourced by the benchmark job scripts (source ../sourceme.sh):
 # loads the compute-runtime module and routes HTTP/HTTPS and git through
 # proxy.alcf.anl.gov:3128.
 #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 # --global: this writes to the user's global git configuration each time the file is sourced.
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@@ -0,0 +1,23 @@
 # Configure command for a HIP build (CXX=hipcc, --enable-accelerator=hip) with
 # MPI comms, run from a build directory two levels below the source tree.
 # CLIME is the c-lime install path taken from the `spack find --paths` output
 # (the line mentioning c-lime, from character 15 onwards).
 # ROCM_PATH, MPICH_DIR, CRAY_MPICH_ROOTDIR, OLCF_GMP_ROOT and FFTW_DIR are read
 # from the environment; this script does not set them.
 CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 ../../configure --enable-comms=mpi-auto \
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
 --enable-accelerator-cshift \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
 LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@@ -0,0 +1,13 @@
 # Environment setup: activates a spack installation, loads c-lime from it, loads
 # the compiler/ROCm/MPI/math modules, and prepends the mpfr library directory to
 # LD_LIBRARY_PATH.
 . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
 spack load c-lime
 #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
 module load emacs 
 module load PrgEnv-gnu
 module load rocm
 module load cray-mpich/8.1.23
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx90a
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
 #Hack for lib
 #export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
--- a/systems/Lumi/HMC/32cube/fthmc3gev.slurm
+++ b/systems/Lumi/HMC/32cube/fthmc3gev.slurm
@@ -0,0 +1,57 @@
 #!/bin/bash -l
 #SBATCH --job-name=fthmc3ge
 #SBATCH --partition=small-g
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=8
 ##SBATCH --cpus-per-task=8
 #SBATCH --gpus-per-node=8
 #SBATCH --time=2:00:00
 #SBATCH --account=project_465000546
 #SBATCH --gpu-bind=none
 #SBATCH --exclusive
 #SBATCH --mem=0
 #sbatch --dependency=afterany:$SLURM_JOBID fthmc3gev.slurm
 # Single-node, 8-task job: writes a per-rank ./select_gpu wrapper and runs
 # ../FTHMC2p1f_3GeV through it with a cold start (20 trajectories from trajectory 0).
 # NOTE(review): CPU_BIND and MEM_BIND are defined (CPU_BIND is echoed) but
 # neither is passed to srun below; the NUMA binding actually applied comes from
 # numactl inside select_gpu.
 CPU_BIND="map_ldom:3,3,1,1,0,0,2,2"
 MEM_BIND="map_mem:3,3,1,1,0,0,2,2"
 echo $CPU_BIND
 # Generate the wrapper. The heredoc delimiter is unquoted, so the \$ escapes
 # keep those variables unexpanded until select_gpu itself runs on each rank.
 cat << EOF > ./select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3 4 5 6 7)
 export NUMA_MAP=(3 3 1 1 0 0 2 2)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export NUM=\${NUMA_MAP[\$SLURM_LOCALID]}
 #export HIP_VISIBLE_DEVICES=\$GPU
 export ROCR_VISIBLE_DEVICES=\$GPU
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 echo NUMA \$SLURM_LOCALID using NUMA \${NUM}
 echo numactl -m \$NUM -N \$NUM \$*
 exec numactl -m \$NUM -N \$NUM \$*
 EOF
 # Show the generated wrapper in the job log, then make it executable.
 cat ./select_gpu
 chmod +x ./select_gpu
 root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
 source ${root}/sourceme.sh
 export OMP_NUM_THREADS=7
 export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export MPICH_GPU_SUPPORT_ENABLED=1
 # The disabled lines below derive the trajectory number from the newest
 # checkpoint file name; as written, the run starts from trajectory 0.
 #cfg=`ls -rt ckpoint_*lat* | tail -n 1  `
 #traj="${cfg#*.}"
 #cfg=`ls -rt ckpoint_*lat* | tail -n 1  `
 traj=0
 vol=32.32.32.64
 mpi=1.2.2.2
 PARAMS="--mpi $mpi --accelerator-threads 16 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol"
 #HMCPARAMS="--StartingType CheckpointStart --StartingTrajectory $traj --Trajectories 200"
 HMCPARAMS="--StartingType ColdStart --StartingTrajectory $traj --Trajectories 20"
 srun ./select_gpu ../FTHMC2p1f_3GeV $HMCPARAMS $PARAMS
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@@ -23,7 +23,7 @@ echo mpfr X$MPFR
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
-  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
+  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
  LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" 
--- a/systems/SDCC-A100/bench.slurm
+++ b/systems/SDCC-A100/bench.slurm
@@ -0,0 +1,42 @@
 #!/bin/bash
 #SBATCH --partition csi
 #SBATCH --time=00:10:00
 #SBATCH -A csigeneral
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
 #SBATCH --qos csi
 #SBATCH --gres=gpu:4
 # Single-node, 4-task job: writes a ./select_gpu wrapper that gives each local
 # rank its own CUDA_VISIBLE_DEVICES entry, dumps the node topology with lstopo,
 # and runs Benchmark_dwf_fp32 on 4 ranks.
 source sourceme.sh
 # Generate the wrapper. The heredoc delimiter is unquoted, so the \$ escapes
 # keep those variables unexpanded until select_gpu itself runs on each rank.
 cat << EOF > select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export CUDA_VISIBLE_DEVICES=\$GPU
 unset ROCR_VISIBLE_DEVICES
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 exec \$*
 EOF
 chmod +x ./select_gpu
 export OMP_NUM_THREADS=4
 export OMPI_MCA_btl=^uct,openib
 export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
 export UCX_RNDV_SCHEME=put_zcopy
 export UCX_RNDV_THRESH=16384
 export UCX_IB_GPU_DIRECT_RDMA=no
 export UCX_MEMTYPE_CACHE=n
 # NOTE(review): OMP_NUM_THREAD (no trailing S) is not the variable set above, so
 # OMP_NUM_THREADS stays at 4 -- confirm whether 4 or 8 threads were intended.
 export OMP_NUM_THREAD=8
 #srun -N1 -n1 nvidia-smi
 #srun -N1 -n1 numactl -H > numa.txt
 srun -N1 -n1 lstopo A100-topo.pdf
 # 4.35 TF/s
 #srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0  --accelerator-threads 16
 srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0  --accelerator-threads 16
--- a/systems/SDCC-A100/config-command
+++ b/systems/SDCC-A100/config-command
@@ -0,0 +1,17 @@
 # Configure command for a CUDA build (CXX=nvcc, --enable-accelerator=cuda) with
 # MPI comms, run from a build directory two levels below the source tree.
 # nvcc is given mpicxx as its host compiler (-ccbin) and generates code for
 # compute_80 / sm_80.
 ../../configure \
 --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=cuda \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-accelerator-cshift \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=nvcc \
 MPICXX=mpicxx \
 LDFLAGS="-cudart shared " \
 CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"
--- a/systems/SDCC-A100/sourceme.sh
+++ b/systems/SDCC-A100/sourceme.sh
@@ -0,0 +1,2 @@
 # Environment setup sourced by bench.slurm: loads the CUDA 12.2 and OpenMPI modules.
 module load cuda/12.2
 module load openmpi
--- a/systems/SDCC-ARM/config-command-mpi
+++ b/systems/SDCC-ARM/config-command-mpi
@@ -0,0 +1,6 @@
 # Configure command for a clang++ build with NEONv8 SIMD and HDF5 taken from $HDF.
 # NOTE(review): the file name says "mpi" but both commands pass
 # --enable-comms=none -- confirm which is intended.
 HDF=$HOME/paboyle/install
 LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF 
 # Disabled alternative: same options with --enable-simd=GEN.
 #LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF 
--- a/systems/SDCC-ICE/bench.slurm
+++ b/systems/SDCC-ICE/bench.slurm
@@ -0,0 +1,31 @@
 #!/bin/bash
 #SBATCH --partition lqcd
 #SBATCH --time=00:20:00
 #SBATCH -A lqcdtest
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH --ntasks=2
 #SBATCH --qos lqcd
 # Single-node CPU benchmark: for each lattice volume, runs Benchmark_dwf_fp32
 # on 2 ranks and on 1 rank through the ./select_socket wrapper, writing the
 # output to <vol>.2socket.out and <vol>.1socket.out.
 source sourceme.sh
 # OpenMP reads OMP_NUM_THREADS (with the trailing S).
 export OMP_NUM_THREADS=24
 #srun -N1 -n1 numactl -H > numa.txt
 #srun -N1 -n1 lstopo ice-topo.pdf
 # Generate the wrapper. The heredoc delimiter is unquoted, so the \$ escapes
 # keep those variables unexpanded until select_socket itself runs on each rank.
 # The table name must match the NUMA_MAP lookup on the following line.
 # NUMA is exported for the wrapped command; the wrapper itself does not use it.
 cat << EOF > select_socket
 #!/bin/bash
 export NUMA_MAP=(0 1)
 export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
 exec \$*
 EOF
 chmod +x ./select_socket
 #for vol in 8.8.8.16 8.8.8.32 8.8.8.64
 #for vol in 8.8.16.16 8.8.16.32 8.8.16.64
 for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32
 do
 srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out
 srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out
 done
--- a/systems/SDCC-ICE/config-command
+++ b/systems/SDCC-ICE/config-command
@@ -0,0 +1,19 @@
 # Configure command for a CPU-only debug build (--enable-accelerator=none,
 # AVX512 SIMD, clang++) with MPI comms and shmopen shared memory, run from a
 # build directory two levels below the source tree. Links against hwloc from
 # the spack install path given in LDFLAGS.
 ../../configure \
 --enable-debug \
 --enable-comms=mpi-auto \
 --enable-unified=yes \
 --enable-shm=shmopen \
 --enable-shm-fast-path=shmopen \
 --enable-accelerator=none \
 --enable-simd=AVX512 \
 --disable-accelerator-cshift \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=clang++ \
 MPICXX=mpicxx \
 LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \
 LIBS=-lhwloc \
 CXXFLAGS="-std=c++17"
--- a/systems/SDCC-ICE/sourceme.sh
+++ b/systems/SDCC-ICE/sourceme.sh
@@ -0,0 +1,2 @@
 # Environment setup sourced by bench.slurm: prepends the spack llvm-12.0.1 lib
 # directory to LD_LIBRARY_PATH and loads the OpenMPI module.
 export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH
 module load openmpi
--- a/tests/core/Test_sliceSum.cc
+++ b/tests/core/Test_sliceSum.cc
@@ -0,0 +1,321 @@
 #include <Grid/Grid.h>
 template<class vobj> inline void sliceSumCPU(const Grid::Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 {
   // Reference slice sum: reduces Data over every direction except orthogdim
   // and leaves one scalar object per global coordinate along orthogdim in
   // result (resized to the full extent of that direction).
   // The reduction itself is delegated to sliceSumReduction_cpu.
   //
   // FIXME (carried over): the summation is not precision promoted, which may
   // matter for correlation functions; using double precision fields avoids it.
   using namespace Grid;

   using sobj        = typename vobj::scalar_object;
   using scalar_type = typename sobj::scalar_type;

   GridBase *grid = Data.Grid();
   assert(grid!=NULL);

   const int ndim  = grid->_ndimension;
   const int lanes = grid->Nsimd();
   assert(orthogdim >= 0);
   assert(orthogdim < ndim);

   // Extents of the orthogonal direction as stored on the grid object.
   int fd = grid->_fdimensions[orthogdim];
   int ld = grid->_ldimensions[orthogdim];
   int rd = grid->_rdimensions[orthogdim];

   Vector<vobj> lvSum(rd);                   // per-slice vector sums, filled by the reduction
   Vector<sobj> lsSum(ld,Zero());            // per-slice scalar sums after splitting the SIMD lanes
   ExtractBuffer<sobj> extracted(lanes);     // scratch space for one extracted vector object

   result.resize(fd);                        // every node ends up with the same fd-long vector

   for(auto &slice : lvSum){
     slice = Zero();
   }

   // Geometry of a slice orthogonal to orthogdim.
   int e1      = grid->_slice_nblock[orthogdim];
   int e2      = grid->_slice_block [orthogdim];
   int stride  = grid->_slice_stride[orthogdim];
   int ostride = grid->_ostride[orthogdim];

   // Reduce Data down to lvSum.
   sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,lanes);

   // Split each vector sum into its SIMD lanes and add every lane to the local
   // slice selected by that lane's coordinate along orthogdim.
   Coordinate icoor(ndim);
   for(int rt=0;rt<rd;rt++){
     extract(lvSum[rt],extracted);
     for(int lane=0;lane<lanes;lane++){
       grid->iCoorFromIindex(icoor,lane);
       const int ldx = rt+icoor[orthogdim]*rd;
       lsSum[ldx]=lsSum[ldx]+extracted[lane];
     }
   }

   // Place the local sums in this node's range of result and zero the rest,
   // so that the global sum below assembles the complete vector.
   for(int t=0;t<fd;t++){
     if ( t/ld != grid->_processor_coor[orthogdim] ) {
       result[t]=Zero();
       continue;
     }
     result[t]=lsSum[t%ld];
   }

   // Sum over nodes, treating result as a flat array of scalar_type words.
   int words = fd*sizeof(sobj)/sizeof(scalar_type);
   scalar_type *ptr = (scalar_type *) &result[0];
   grid->GlobalSumVector(ptr, words);
 }
 // Test driver: for four field types (ComplexD, SpinVectorD, SpinColourVectorD,
 // SpinColourMatrixD) fills a 64x64x64x16 lattice with Gaussian random data and,
 // for each of the Nd directions, times sliceSumCPU (reference) against
 // sliceSum and asserts that every component agrees to within 1e-8.
 // The checks are plain assert()s, so they are compiled out if NDEBUG is defined.
 int main (int argc, char ** argv) {
    using namespace Grid;
    Grid_init(&argc,&argv);
    Coordinate latt_size({64,64,64,16});
    auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
    auto mpi_layout = GridDefaultMpi();
    GridCartesian Grid(latt_size, simd_layout, mpi_layout);
    // Fixed seeds; the same generator is reused for all four fields below.
    std::vector<int> seeds({1, 2, 3, 4});
    GridParallelRNG pRNG(&Grid);
    pRNG.SeedFixedIntegers(seeds);
    // ---- ComplexD ----
    LatticeComplexD test_data(&Grid);
    gaussian(pRNG,test_data);
    std::vector<TComplexD> reduction_reference;
    std::vector<TComplexD> reduction_result;
    //warmup
    for (int sweeps = 0; sweeps < 5; sweeps++) {
      reduction_result = sliceSum(test_data,0);
    }
    int trace_id = traceStart("sliceSum benchmark - ComplexD");
    std::cout << GridLogMessage << "Testing ComplexD" << std::endl;
    std::cout << GridLogMessage << "sizeof(ComplexD) = " << sizeof(ComplexD) << std::endl;
    std::cout << GridLogMessage << "sizeof(vComplexD) = " << sizeof(vComplexD) << std::endl;
    for (int i = 0; i < Nd; i++) {
      // Time the reference implementation.
      RealD t=-usecond();
      tracePush("sliceSum");
      sliceSumCPU(test_data,reduction_reference,i);
      tracePop("sliceSum");
      t+=usecond();
      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
      // Time the library implementation.
      RealD tgpu=-usecond();
      tracePush("sliceSumGpu");
      reduction_result = sliceSum(test_data,i);
      tracePop("sliceSumGpu");
      tgpu+=usecond();
      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
      // Compare slice by slice.
      for(int t=0;t<reduction_reference.size();t++) {
        auto diff = reduction_reference[t]-reduction_result[t];
        assert(abs(TensorRemove(diff)) < 1e-8 );
      }
    }
    traceStop(trace_id);
    // ---- SpinVectorD ----
    LatticeSpinVectorD test_data_cv(&Grid);
    gaussian(pRNG,test_data_cv);
    std::vector<SpinVectorD> reduction_reference_cv;
    std::vector<SpinVectorD> reduction_result_cv;
    //warmup
    for (int sweeps = 0; sweeps < 5; sweeps++) {
      reduction_result_cv = sliceSum(test_data_cv,0);
    }
    trace_id = traceStart("sliceSum benchmark - SpinVectorD");
    std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl;
    std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl;
    std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl;
    for (int i = 0; i < Nd; i++) {
      RealD t=-usecond();
      tracePush("sliceSum");
      sliceSumCPU(test_data_cv,reduction_reference_cv,i);
      tracePop("sliceSum");
      t+=usecond();
      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
      RealD tgpu=-usecond();
      tracePush("sliceSumGpu");
      reduction_result_cv = sliceSum(test_data_cv,i);
      tracePop("sliceSumGpu");
      tgpu+=usecond();
      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
      // Compare the four components (indices 0..3) of each slice.
      for(int t=0;t<reduction_reference_cv.size();t++) {
        auto diff = reduction_reference_cv[t]-reduction_result_cv[t];
        assert(abs(diff()(0)()) < 1e-8 );
        assert(abs(diff()(1)()) < 1e-8 );
        assert(abs(diff()(2)()) < 1e-8 );
        assert(abs(diff()(3)()) < 1e-8 );
      }
    }
    traceStop(trace_id);
    // ---- SpinColourVectorD ----
    LatticeSpinColourVectorD test_data_scv(&Grid);
    gaussian(pRNG,test_data_scv);
    std::vector<SpinColourVectorD> reduction_reference_scv;
    std::vector<SpinColourVectorD> reduction_result_scv;
    //warmup
    for (int sweeps = 0; sweeps < 5; sweeps++) {
      reduction_result_scv = sliceSum(test_data_scv,0);
    }
    trace_id = traceStart("sliceSum benchmark - SpinColourVectorD");
    std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl;
    std::cout << GridLogMessage << "sizeof(SpinColourVectorD) = " << sizeof(SpinColourVectorD) << std::endl;
    std::cout << GridLogMessage << "sizeof(vSpinColourVectorD) = " << sizeof(vSpinColourVectorD) << std::endl;
    for (int i = 0; i < Nd; i++) {
      RealD t=-usecond();
      tracePush("sliceSum");
      sliceSumCPU(test_data_scv,reduction_reference_scv,i);
      tracePop("sliceSum");
      t+=usecond();
      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
      RealD tgpu=-usecond();
      tracePush("sliceSumGpu");
      reduction_result_scv = sliceSum(test_data_scv,i);
      tracePop("sliceSumGpu");
      tgpu+=usecond();
      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
      // Compare all 4 x 3 components of each slice (first index 0..3, second 0..2).
      for(int t=0;t<reduction_reference_scv.size();t++) {
        auto diff = reduction_reference_scv[t]-reduction_result_scv[t];
        // std::cout << diff <<std::endl;
        assert(abs(diff()(0)(0)) < 1e-8 );
        assert(abs(diff()(0)(1)) < 1e-8 );
        assert(abs(diff()(0)(2)) < 1e-8 );
        assert(abs(diff()(1)(0)) < 1e-8 );
        assert(abs(diff()(1)(1)) < 1e-8 );
        assert(abs(diff()(1)(2)) < 1e-8 );    
        assert(abs(diff()(2)(0)) < 1e-8 );
        assert(abs(diff()(2)(1)) < 1e-8 );
        assert(abs(diff()(2)(2)) < 1e-8 );    
        assert(abs(diff()(3)(0)) < 1e-8 );
        assert(abs(diff()(3)(1)) < 1e-8 );
        assert(abs(diff()(3)(2)) < 1e-8 );
      }
    }
    traceStop(trace_id);
    // ---- SpinColourMatrixD ----
    LatticeSpinColourMatrixD test_data_scm(&Grid);
    gaussian(pRNG,test_data_scm);
    std::vector<SpinColourMatrixD> reduction_reference_scm;
    std::vector<SpinColourMatrixD> reduction_result_scm;
    //warmup
    for (int sweeps = 0; sweeps < 5; sweeps++) {
      reduction_result_scm = sliceSum(test_data_scm,0);
    }
    trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD");
    std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl;
    std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl;
    std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl;
    for (int i = 0; i < Nd; i++) {
      RealD t=-usecond();
      tracePush("sliceSum");
      sliceSumCPU(test_data_scm,reduction_reference_scm,i);
      tracePop("sliceSum");
      t+=usecond();
      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
      RealD tgpu=-usecond();
      tracePush("sliceSumGpu");
      reduction_result_scm = sliceSum(test_data_scm,i);
      tracePop("sliceSumGpu");
      tgpu+=usecond();
      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
      // Compare every (is,js)(ic,jc) matrix element of each slice.
      for(int t=0;t<reduction_reference_scm.size();t++) {
        auto diff = reduction_reference_scm[t]-reduction_result_scm[t];
        // std::cout << diff <<std::endl;
        for (int is = 0; is < Ns; is++) {
          for (int js = 0; js < Ns; js++) {
            for (int ic = 0; ic < Nc; ic++) {
              for (int jc = 0; jc < Nc; jc++) {
                assert(abs(diff()(is,js)(ic,jc)) < 1e-8);
              }
            }
          }
        }
      }
    }
    traceStop(trace_id);
    Grid_finalize();
    return 0;
 }
Author	SHA1	Message	Date
Peter Boyle	fce3852dff	Merge pull request #451 from paboyle/feature/eigen-3.4.0-update updating Eigen to 3.4.0	2024-02-28 18:03:37 -05:00
Peter Boyle	ee1b8bbdbd	Merge pull request #454 from edbennett/adjoint-broke fix HMC for non-fundamental representations	2024-02-28 14:05:27 -05:00
Peter Boyle	3f1636637d	Merge pull request #453 from dbollweg/feature/sliceSum_gpu Feature/slice sum gpu	2024-02-28 14:04:43 -05:00
Peter Boyle	2e570f5300	Merge pull request #457 from lehner/feature/gpt Import GPT-related updates	2024-02-28 13:59:04 -05:00
Christoph Lehner	9f89486df5	remove unnecessary code path	2024-02-28 19:56:23 +01:00
Christoph Lehner	22b43b86cb	Make GPT test suite work with SYCL	2024-02-28 12:57:17 +01:00
dbollweg	3c9012676a	CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case.	2024-02-27 12:41:45 -05:00
Dennis Bollweg	b507fe209c	Added SpinColourMatrix case to sliceSum Test	2024-02-27 11:28:32 -05:00
Dennis Bollweg	6cd2d8fcd5	Replace cuda/hip memcpy with Grid functions	2024-02-26 09:55:07 -05:00
dbollweg	0a816b5509	Merge branch 'feature/sliceSum_gpu' of https://github.com/dbollweg/Grid into feature/sliceSum_gpu	2024-02-22 21:43:06 -05:00
dbollweg	1c8b807c2e	free malloc'd memory	2024-02-22 21:42:44 -05:00
Christoph Lehner	66391f84f2	Merge branch 'feature/gpt' of ../Grid into develop	2024-02-21 19:05:00 +01:00
Ed Bennett	97f7a9ecb3	fix HMC for non-fundamental representations	2024-02-21 08:27:55 +00:00
Dennis Bollweg	15878f7613	sliceSumReduction_cub_large now also faster than CPU on Frontier	2024-02-16 13:55:21 -05:00
dbollweg	e0d5e3c6c7	Merge branch 'paboyle:develop' into feature/sliceSum_gpu	2024-02-16 13:16:37 -05:00
dbollweg	6f3455900e	Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs	2024-02-16 13:15:02 -05:00
Peter Boyle	73c0b29535	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-02-13 20:19:32 +00:00
Peter Boyle	303b83cdb8	Scaling benchmarks, verbosity and MPICH aware in acceleratorInit() For some reason Dirichlet benchmark fails on several nodes; need to debug this.	2024-02-13 19:48:03 +00:00
Peter Boyle	5ef4da3f29	Silence verbose	2024-02-13 19:47:36 +00:00
Peter Boyle	1502860004	Benchmark scripts	2024-02-13 19:47:02 +00:00
Peter Boyle	585efc6f3f	More benchmark scripts	2024-02-13 19:40:49 +00:00
Antonin Portelli	62055e04dd	missing semicolon generates error with some compilers	2024-02-13 18:18:27 +01:00
Antonin Portelli	e4a641b64e	removing old Eigen tensor patch	2024-02-13 10:37:14 +01:00
Antonin Portelli	8849f187f1	updating Eigen to 3.4.0	2024-02-13 10:30:22 +01:00
dbollweg	b5659d106e	more test cases	2024-02-09 13:37:14 -05:00
dbollweg	4b43307402	Undo include path changes for level zero api header	2024-02-09 13:07:56 -05:00
dbollweg	09af8c25a2	Merge branch 'paboyle:develop' into feature/sliceSum_gpu	2024-02-09 13:02:59 -05:00
dbollweg	9514035b87	refactor slicesum: slicesum uses GPU version by default now	2024-02-09 13:02:28 -05:00
Peter Boyle	7019916294	RNG seed change safer for large volumes; this is a long term solution	2024-02-07 00:56:39 +00:00
dbollweg	1514b4f137	slicesum_sycl passes test	2024-02-06 19:08:44 -05:00
Peter Boyle	91cf5ee312	Updated bench script	2024-02-06 23:45:10 +00:00
dbollweg	ab2de131bd	work towards sliceSum for sycl backend	2024-02-06 13:24:45 -05:00
Peter Boyle	5bfa88be85	Aurora MPI standalone benchmake and options that work well	2024-02-06 16:28:40 +00:00
Dennis Bollweg	5af8da76d7	Fix cuda compilation of Lattice_slicesum_gpu.h	2024-02-01 18:02:30 -05:00
Dennis Bollweg	b8b9dc952d	Async memcpy's and cleanup	2024-02-01 17:55:35 -05:00
Dennis Bollweg	79a6ed32d8	Use accelerator_for2d and DeviceSegmentedReduce to avoid kernel launch latencies	2024-02-01 16:41:03 -05:00
dbollweg	caa5f97723	Add sliceSum gpu using cub/hipcub	2024-01-31 16:50:06 -05:00
Peter Boyle	2a0d75bac2	Aurora files	2023-12-21 23:20:17 +00:00
Peter Boyle	f48298ad4e	Bug fix	2023-12-11 20:57:02 -05:00
root	645e47c1ba	Config for Ampere Altra ARM	2023-12-08 16:17:56 -05:00
Peter Boyle	d1d9827263	Integrator logging update	2023-12-08 12:14:00 -05:00
Peter Boyle	14643c0aab	SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512)	2023-12-04 15:45:57 -05:00
Peter Boyle	b77a9b8947	SDDC compiles starting	2023-11-30 14:31:51 -05:00
Peter Boyle	7d077fe493	Frontier compile	2023-11-09 13:58:44 -05:00
Christoph Lehner	f2648e94b9	getHostPointer added to Lattice	2023-10-23 13:47:41 +02:00
Peter Boyle	51051df62c	3GeV run setup	2023-10-16 20:49:52 +03:00
Peter Boyle	33097681b9	FTHMC compiled and merged to develop	2023-10-14 00:42:55 +03:00
Peter Boyle	07e4900218	FTHMC commit	2023-10-13 18:21:57 +03:00
Peter Boyle	36ab567d67	FTHMC 3 Gev	2023-10-13 18:21:57 +03:00
Peter Boyle	e19171523b	FTHMC Status at lattice conference commit	2023-10-13 18:21:56 +03:00
Peter Boyle	9626a2c7c0	Asynch handling	2023-10-13 18:21:56 +03:00
Peter Boyle	e936f5b80b	IfGridTensor shorthand	2023-10-13 18:21:56 +03:00
Peter Boyle	ffc0639cb9	Running in HMC tests	2023-10-13 18:21:56 +03:00
Peter Boyle	c5b43b322c	traceProduct eliminates non-contributing intermediate terms	2023-10-13 18:21:56 +03:00
Peter Boyle	c9c4576237	Improved frontier cshift	2023-10-13 18:21:56 +03:00
Christoph Lehner	e6ed516052	merged	2023-10-08 09:00:37 +02:00
Christoph Lehner	e2a3dae1f2	Option for multiple simultaneous CartesianStencils	2023-10-08 08:58:44 +02:00
Christoph Lehner	452bf2e907	Accelerator basisRotate also on HIP	2023-06-20 20:36:24 +03:00
Christoph Lehner	e8c29e2fe5	Merge pull request #31 from paboyle/develop Sync	2023-05-28 16:13:12 +02:00
Christoph Lehner	da9cbfc7cc	Suppress BuildSurfaceList verbosity in Stencil.h	2023-05-19 20:22:20 +02:00
Christoph Lehner	6b9f07c1ed	Merge pull request #30 from paboyle/develop Merge upstream	2023-05-19 20:20:58 +02:00
Christoph Lehner	5f75735dab	Add M and Mdag to WilsonTMFermion	2023-04-06 18:25:05 +02:00
		`@@ -0,0 +1,2 @@`
							`module load cuda/12.2`
							`module load openmpi`
		`@@ -0,0 +1,2 @@`
							`export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH`
							`module load openmpi`