diff --git a/Grid/Namespace.h b/Grid/Namespace.h
index 29b229fa..c42b46b3 100644
--- a/Grid/Namespace.h
+++ b/Grid/Namespace.h
@@ -30,9 +30,14 @@ directory
 
 #include <type_traits>
 #include <cassert>
+#include <exception>
 
 #define NAMESPACE_BEGIN(A) namespace A {
 #define NAMESPACE_END(A)   }
 #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 #define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
 #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  ); 
+
+#define EXCEPTION_CHECK_BEGIN(A) try {
+#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
+
diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index a7edb485..22353d49 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -89,9 +89,10 @@ public:
       gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::cpu_selector selector;
+      cl::sycl::gpu_selector selector;
       cl::sycl::device selectedDevice { selector };
-      gridblasHandle =new sycl::queue (selectedDevice);
+      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
       gridblasInit=1;
     }
@@ -266,8 +267,46 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+    std::cerr << " Calling SYCL batched ZGEMM "<<std::endl;
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (ComplexD *) &alpha_p[0],
+						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexD *) &beta_p[0],
+						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
+    std::cerr << " Called SYCL batched ZGEMM "<<std::endl;
+      std::vector<ComplexD> A(m*k);  // pointer list to matrices
+      std::vector<ComplexD> B(k*n);
+      std::vector<ComplexD> C(m*n);
+      int sda = lda*k;
+      int sdb = ldb*k;
+      int sdc = ldc*n;
+      for (int p = 0; p < 1; ++p) {
+	acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD));
+	for (int mm = 0; mm < m; ++mm) {
+	  for (int nn = 0; nn < n; ++nn) {
+	    ComplexD c_mn(0.0);
+	    for (int kk = 0; kk < k; ++kk)
+	      c_mn += A[mm + kk*lda ] * B[kk + nn*ldb];
+	    std::cout << " beta "<<beta<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
+	  }
+	}
+      }
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
     // Need a default/reference implementation
@@ -285,7 +324,6 @@ public:
       }
     }
 #endif
-    //    synchronise();
      RealD t1=usecond();
      RealD flops = 8.0*m*n*k*batchCount;
      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@@ -366,8 +404,25 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (ComplexF *) &alpha_p[0],
+						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexF *) &beta_p[0],
+						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
     int sda = lda*k;
@@ -467,8 +522,25 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (float *) &alpha_p[0],
+						  (const float **)&Amk[0], (const int64_t *)&lda64,
+						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
+						  (float *) &beta_p[0],
+						  (float **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
     int sda = lda*k;
@@ -568,24 +640,25 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    /*
       int64_t m64=m;
       int64_t n64=n;
       int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
       int64_t batchCount64=batchCount;
-      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-      onemkl::transpose::N,
-      onemkl::transpose::N,
-      &m64,&n64,&k64,
-      (double *) &alpha_p[0],
-      (double **)&Amk[0], lda,
-      (double **)&Bkn[0], ldb,
-      (double *) &beta_p[0],
-      (double **)&Cmn[0], ldc,
-      1,&batchCount64);
-     */
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (double *) &alpha_p[0],
+						  (const double **)&Amk[0], (const int64_t *)&lda64,
+						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
+						  (double *) &beta_p[0],
+						  (double **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
     int sda = lda*k;
@@ -673,6 +746,7 @@ public:
 						beta,
 						(ComplexD *)Cmn,ldc,sdc,
 						batchCount);
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
      // Need a default/reference implementation
diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h
index d97e4993..494aa77b 100644
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -499,6 +499,87 @@ namespace Grid {
       }
   };
 
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, left preconditioned by Mee^inv
+  // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta
+  //
+  // Solve:
+  // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta
+  //
+  // Old notation e<->o
+  //
+  // Left precon by Moo^-1
+  //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o
+  //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+      const bool _solnAsInitGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      Mtmp=src_o-Mtmp;                 
+      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);     
+      
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd );
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
   ///////////////////////////////////////////////////////////////////////////////////////////////////////
   // Site diagonal is identity, right preconditioned by Mee^inv
   // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 8a27f527..293ce2fb 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -54,6 +54,9 @@ public:
     size_type bytes = __n*sizeof(_Tp);
     profilerAllocate(bytes);
     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
@@ -100,6 +103,9 @@ public:
     size_type bytes = __n*sizeof(_Tp);
     profilerAllocate(bytes);
     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
@@ -145,6 +151,9 @@ public:
     size_type bytes = __n*sizeof(_Tp);
     profilerAllocate(bytes);
     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index a9e5c9b4..e56af238 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
+
+#if defined(__has_feature)
+#if __has_feature(leak_sanitizer)
+#define ASAN_LEAK_CHECK
+#endif
+#endif
+
+#ifdef ASAN_LEAK_CHECK
+#include <sanitizer/asan_interface.h>
+#include <sanitizer/common_interface_defs.h>
+#include <sanitizer/lsan_interface.h>
+#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
+#else
+#define LEAK_CHECK(A) { }
+#endif
+
+void MemoryManager::DisplayMallinfo(void)
+{
+#ifdef __linux__
+  struct mallinfo mi;
+  
+  mi = mallinfo();
+
+  std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl;
+  std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl;
+  std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl;
+  std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl;
+  std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl;
+  std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl;
+  std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
+  std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl;
+  std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl;
+  std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl;
+#endif
+  LEAK_CHECK();
+ 
+}
+
 void MemoryManager::PrintBytes(void)
 {
   std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
 #ifdef GRID_CUDA
   cuda_mem();
 #endif
-  
+  DisplayMallinfo();
 }
 
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 528ebbaa..0870313d 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -211,6 +211,7 @@ private:
 #endif
 
  public:
+  static void DisplayMallinfo(void);
   static void NotifyDeletion(void * CpuPtr);
   static void Print(void);
   static void PrintAll(void);
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 7b66c31d..6ce70232 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -373,7 +373,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #else
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Vector<inner_t> inner_tmp(sites);
+  deviceVector<inner_t> inner_tmp;
+  inner_tmp.resize(sites);
   auto inner_tmp_v = &inner_tmp[0];
 
   accelerator_for( ss, sites, nsimd,{
diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h
index 1429d848..b53755aa 100644
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -90,6 +90,7 @@ public:
         exit(1);
       }
       Parameters.StartingType = arg;
+      std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl;
     }
 
     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
@@ -97,6 +98,7 @@ public:
       std::vector<int> ivec(0);
       GridCmdOptionIntVector(arg, ivec);
       Parameters.StartTrajectory = ivec[0];
+      std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl;
     }
 
     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
@@ -104,6 +106,7 @@ public:
       std::vector<int> ivec(0);
       GridCmdOptionIntVector(arg, ivec);
       Parameters.Trajectories = ivec[0];
+      std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl;
     }
 
     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
@@ -111,6 +114,7 @@ public:
       std::vector<int> ivec(0);
       GridCmdOptionIntVector(arg, ivec);
       Parameters.NoMetropolisUntil = ivec[0];
+      std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl;
     }
     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h
index 0276b6fd..549920a0 100644
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -137,9 +137,11 @@ public:
 
       double start_force = usecond();
 
+      MemoryManager::Print();
       as[level].actions.at(a)->deriv_timer_start();
       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta
       as[level].actions.at(a)->deriv_timer_stop();
+      MemoryManager::Print();
 
       auto name = as[level].actions.at(a)->action_name();
 
@@ -246,7 +248,11 @@ public:
     }
   };
 
-  virtual ~Integrator() {}
+  virtual ~Integrator()
+  {
+    // Pain in the ass to clean up the Level pointers
+    // Guido's design is at fault as per comment above in constructor
+  }
 
   virtual std::string integrator_name() = 0;
   
@@ -460,6 +466,7 @@ public:
     for (int level = 0; level < as.size(); ++level) {
       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 
+	MemoryManager::Print();
         // get gauge field from the SmearingPolicy and
         // based on the boolean is_smeared in actionID
         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
@@ -468,6 +475,7 @@ public:
    	        as[level].actions.at(actionID)->S_timer_stop();
         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
         H += Hterm;
+	MemoryManager::Print();
 
       }
       as[level].apply(S_hireps, Representations, level, H);
diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc
index 4ab1f20f..8042d6e6 100644
--- a/HMC/Mobius2p1f.cc
+++ b/HMC/Mobius2p1f.cc
@@ -58,7 +58,7 @@ int main(int argc, char **argv) {
   HMCparameters HMCparams;
   HMCparams.StartTrajectory  = 0;
   HMCparams.Trajectories     = 200;
-  HMCparams.NoMetropolisUntil=  20;
+  HMCparams.NoMetropolisUntil=  0;
   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
   HMCparams.StartingType     =std::string("ColdStart");
   HMCparams.MD = MD;
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
-  CPparams.saveInterval  = 10;
+  CPparams.saveInterval  = 1;
   CPparams.format        = "IEEE64BIG";
   TheHMC.Resources.LoadNerscCheckpointer(CPparams);
 
@@ -186,6 +186,8 @@ int main(int argc, char **argv) {
 
   /////////////////////////////////////////////////////////////
   // HMC parameters are serialisable
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file                                                                     
+  TheHMC.initializeGaugeFieldAndRNGs(U);
 
   std::cout << GridLogMessage << " Running the HMC "<< std::endl;
   TheHMC.Run();  // no smearing
diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
index 58eb8a03..f538f319 100644
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,16 +1,18 @@
 
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
+	--enable-debug \
 	--disable-gparity \
 	--disable-fermion-reps \
+	--with-lime=$CLIME \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-accelerator-aware-mpi=yes\
 	--enable-unified=no \
 	MPICXX=mpicxx \
-	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
+	CXX=icpx 
 
diff --git a/systems/Aurora/config-command-sanitize b/systems/Aurora/config-command-sanitize
new file mode 100644
index 00000000..d400a103
--- /dev/null
+++ b/systems/Aurora/config-command-sanitize
@@ -0,0 +1,22 @@
+# -fsycl-targets=spir64_gen -Xs\" -device pvc \"
+# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc
+# -fsycl-targets=intel_gpu_pvc
+
+unset DEVICE
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi-auto \
+	--enable-debug \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--with-lime=$CLIME \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-accelerator-aware-mpi=yes\
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=icpx 
+
diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh
index b43b3b71..d577ae96 100644
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -1,28 +1,23 @@
+source ~/spack/share/spack/setup-env.sh 
+spack load c-lime
+
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
+
+export INTELGT_AUTO_ATTACH_DISABLE=1
+
 #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
 
-module load oneapi/release/2023.12.15.001
-
-#module use /soft/modulefiles
-#module load intel_compute_runtime/release/agama-devel-682.22
-
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-export FI_CXI_CQ_FILL_PERCENT=20
-
-export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
-#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
-
-#
 # -ftarget-register-alloc-mode=pvc:default 
 # -ftarget-register-alloc-mode=pvc:small
 # -ftarget-register-alloc-mode=pvc:large
 # -ftarget-register-alloc-mode=pvc:auto
-#
+#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
-#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
 
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc
index 74492fd9..068c260f 100644
--- a/tests/debug/Test_cayley_cg.cc
+++ b/tests/debug/Test_cayley_cg.cc
@@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf,
 		   GridParallelRNG *RNG5)
 {
   LatticeFermion src   (FGrid); random(*RNG5,src);
-  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion result1(FGrid); result1=Zero();
+  LatticeFermion result2(FGrid); result2=Zero();
+  LatticeFermion result3(FGrid); result3=Zero();
 
   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
-  SchurSolver(Ddwf,src,result);
+  SchurSolver(Ddwf,src,result1);
+
+  SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG);
+  SchurSolverSymm1(Ddwf,src,result2);
+
+  SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG);
+  SchurSolverSymm2(Ddwf,src,result3);
+
+  std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl;
+
+  std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl; 
+  result2=result2-result1;
+  std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl; 
+
+  std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl; 
+  result3=result3-result1;
+  std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl; 
 }