mirror of https://github.com/paboyle/Grid.git synced 2025-06-18 07:47:06 +01:00

Compare commits


3 Commits

113 changed files with 1833 additions and 3544 deletions

View File

@ -12,13 +12,15 @@
#include <iostream>
#include <sys/time.h>
#define GRID_SYCL
#undef GRID_HIP
#undef GRID_CUDA
#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#include <oneapi/mkl.hpp>
@ -43,90 +45,6 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
#endif
#ifdef GRID_HIP
hipStream_t copyStream;
hipStream_t computeStream;
void acceleratorInit(void)
{
int device = 0;
auto discard = hipSetDevice(device);
discard = hipStreamCreate(&copyStream);
discard = hipStreamCreate(&computeStream);
printf("AcceleratorHIPInit\n");
}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
#define accelerator_barrier(dummy) \
{ \
auto tmp=hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
#endif
#ifdef GRID_CUDA
cudaStream_t copyStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
int device = 0;
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&computeStream);
}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
#define accelerator_barrier(dummy) \
{ \
cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
cudaGetErrorString( err )); \
printf("File %s Line %d\n",__FILE__,__LINE__); \
fflush(stdout); \
if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
} \
}
#endif
template<class T> void acceleratorPut(T& dev,T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));
@ -137,6 +55,9 @@ template<class T> T acceleratorGet(T& dev)
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
return host;
}
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
#endif
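A minimal usage sketch of the acceleratorPut/acceleratorGet helpers above (illustrative only; the int payload and the one-element deviceVector are assumed, not part of this file):

int host_val = 42;
static deviceVector<int> dev_val(1);          // one device-resident slot
acceleratorPut(dev_val[0], host_val);         // copies sizeof(int) host -> device
int round_trip = acceleratorGet(dev_val[0]);  // copies device -> host; round_trip == 42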
/**************************************************************
* Allocator
@ -290,269 +211,6 @@ public:
#endif
}
/////////////////////////////////////////////////////////////
// Single matrix GEMM -- fp64 and fp32
/////////////////////////////////////////////////////////////
void gemm(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // Device pointer
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn)
{
RealD t2=usecond();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda,
(hipblasDoubleComplex *) Bkn, ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda,
(cuDoubleComplex *) Bkn, ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
iOpA,
iOpB,
m64,n64,k64,
(ComplexD *) &alpha_p[0],
(const ComplexD *)Amk, (int64_t )lda64,
(const ComplexD *)Bkn, (int64_t )ldb64,
(ComplexD *) &beta_p[0],
(ComplexD *)Cmn, (int64_t)ldc64);
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
}
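For orientation, a minimal call sketch of the single-matrix interface above (illustrative only; the sizes and device buffers are assumed, not taken from this file). With column-major storage A is m x k, B is k x n, C is m x n, and the call computes C = beta*C + alpha*A*B:

int m=16, n=16, k=16;
GridBLAS blas;
deviceVector<ComplexD> A(m*k), B(k*n), C(m*n);  // device-resident, column major
blas.gemm(GridBLAS_OP_N, GridBLAS_OP_N,
          m, n, k,
          ComplexD(1.0), &A[0], &B[0],
          ComplexD(0.0), &C[0]);
blas.synchronise();                             // wait for the device-side GEMM to complete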
void gemm(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
ComplexF* Amk, // Device pointer
ComplexF* Bkn,
ComplexF beta,
ComplexF* Cmn)
{
RealD t2=usecond();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex *) Amk, lda,
(hipblasComplex *) Bkn, ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex *) Cmn, ldc);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex *) Amk, lda,
(cuComplex *) Bkn, ldb,
(cuComplex *) &beta_p[0],
(cuComplex *) Cmn, ldc);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
iOpA,
iOpB,
m64,n64,k64,
(ComplexF *) &alpha_p[0],
(const ComplexF *)Amk, (int64_t )lda64,
(const ComplexF *)Bkn, (int64_t )ldb64,
(ComplexF *) &beta_p[0],
(ComplexF *)Cmn, (int64_t )ldc64);
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
}
/////////////////////////////////////////////////////////////
void gemmBatched(int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
@ -583,6 +241,36 @@ public:
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
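These two overloads simply default both operations to GridBLAS_OP_N, so a real-valued batched multiply can be written without naming the transpose flags. A sketch, assuming a GridBLAS instance blas and device pointer lists Aps/Bps/Cps prepared as in the single-precision sketch further down:

// Aps, Bps, Cps : deviceVector<RealD*> of length batchCount, each entry pointing
// at an m x k, k x n, m x n column-major device matrix respectively.
blas.gemmBatched(m, n, k, 1.0, Aps, Bps, 0.0, Cps);  // forwards to the OP_N/OP_N variant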
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
@ -936,6 +624,301 @@ public:
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
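A host-side setup sketch for the batched real GEMM above (illustrative; the sizes and the buffer carving are assumed, but the pointer-list pattern mirrors the acceleratorPut usage elsewhere in this diff):

int batchCount = 4, m = 8, n = 8, k = 8;
deviceVector<RealF>  Abuf(batchCount*m*k), Bbuf(batchCount*k*n), Cbuf(batchCount*m*n);
deviceVector<RealF*> Aps(batchCount), Bps(batchCount), Cps(batchCount);
for (int b = 0; b < batchCount; b++) {
  RealF *ap = &Abuf[b*m*k], *bp = &Bbuf[b*k*n], *cp = &Cbuf[b*m*n];
  acceleratorPut(Aps[b], ap);          // fill the device-resident pointer lists entry by entry
  acceleratorPut(Bps[b], bp);
  acceleratorPut(Cps[b], cp);
}
GridBLAS blas;
blas.gemmBatched(GridBLAS_OP_N, GridBLAS_OP_N, m, n, k,
                 RealF(1.0), Aps, Bps, RealF(0.0), Cps);   // Cmn[b] = Amk[b] * Bkn[b]
blas.synchronise();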
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealD> alpha_p(1);
static deviceVector<RealD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(const double **)&Amk[0], (const int64_t *)&lda64,
(const double **)&Bkn[0], (const int64_t *)&ldb64,
(double *) &beta_p[0],
(double **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
@ -984,47 +967,6 @@ public:
return flops; // Returns gigaflops
}
template<class CComplex>
double benchmark(int M, int N, int K)
{
int32_t N_A = M*K;
int32_t N_B = K*N;
int32_t N_C = M*N;
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
CComplex alpha(1.0);
CComplex beta (1.0);
RealD flops = 8.0*M*N*K;
int ncall=10;
gemm(GridBLAS_OP_C,GridBLAS_OP_N,
M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0]);
synchronise();
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemm(GridBLAS_OP_N,GridBLAS_OP_N,
M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0]);
synchronise();
}
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
flops = 8.0*M*N*K*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
@ -1093,21 +1035,6 @@ static void BLAS(void)
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
fprintf(FP,"\n\n\n");
std::cout << "----------------------------------------------------------"<<std::endl;
std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
std::cout << "----------------------------------------------------------"<<std::endl;
{
int M=12;
int N=12;
std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
for( int kk=0;kk<ks.size();kk++ ) {
int K = ks[kk];
double p=blas.benchmark<CComplex>(M,N,K);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
}
}
std::cout << "=================================================================================="<<std::endl;
};

View File

@ -1,2 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench

View File

@ -1,5 +0,0 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@ -1,2 +0,0 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@ -50,7 +50,6 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);

View File

@ -168,7 +168,6 @@ public:
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
@ -191,7 +190,6 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +213,6 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
std::cout << "Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +226,6 @@ public:
}
// Barrel shift and collect global pencil
std::cout << "Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -251,7 +247,6 @@ public:
}
}
std::cout << "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -274,7 +269,6 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
std::cout << "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +285,6 @@ public:
}
result = result*div;
std::cout << "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@ -103,38 +103,6 @@ public:
_Mat.MdagM(in,out);
}
};
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MMdag(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.MMdag(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother

View File

@ -45,11 +45,6 @@ public:
M(in,tmp);
Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@ -59,7 +59,7 @@ public:
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.02;
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@ -131,26 +131,6 @@ public:
Coeffs[j] = s * 2.0/order;
}
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){

View File

@ -1,376 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
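/*
 * Dimensions, for orientation: X and Y are packed as (vol*words) x nrhs
 * column-major blocks and C is nrhs x nrhs, so the single batched GEMM
 * below evaluates Y[vw x nrhs] = X[vw x nrhs] . C[nrhs x nrhs].
 */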
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
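/*
 * Layout, for orientation: the copy above packs BLAS_X/BLAS_Y as
 * [site][rhs][word], so each site ss owns a contiguous words x nrhs
 * column-major block that the per-site batched GEMM below treats as one matrix.
 */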
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);

View File

@ -447,10 +447,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
scalar(1.0),
ComplexD(1.0),
Vd,
Fd,
scalar(0.0), // wipe out C
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
@ -497,10 +497,10 @@ public:
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis,
scalar(1.0),
ComplexD(1.0),
Vd,
Cd,
scalar(0.0), // wipe out C
ComplexD(0.0), // wipe out C
Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;

View File

@ -182,10 +182,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
scalar(1.0),
ComplexD(1.0),
Ed,
Rd,
scalar(0.0), // wipe out C
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
@ -210,10 +210,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
scalar(1.0),
ComplexD(1.0),
Ed, // x . nev
Cd, // nev . nrhs
scalar(0.0),
ComplexD(0.0),
Gd);
BLAS.synchronise();

View File

@ -53,7 +53,6 @@ class TwoLevelCGmrhs
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
@ -80,301 +79,6 @@ class TwoLevelCGmrhs
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
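The same Cholesky-based thin QR can be sketched on plain Eigen matrices (illustrative only; the row count, nrhs and the random Z are assumed, and <Eigen/Dense> is taken to be included):

const int nrhs = 4;
Eigen::MatrixXcd Z  = Eigen::MatrixXcd::Random(1000, nrhs);  // tall, skinny block of vectors
Eigen::MatrixXcd zz = Z.adjoint()*Z;                          // Gram matrix, Hermitian positive definite
zz = 0.5*(zz + zz.adjoint());                                 // force exact Hermiticity
Eigen::MatrixXcd L  = zz.llt().matrixL();                     // zz = L L^dag (Cholesky)
Eigen::MatrixXcd C  = L.adjoint();                            // choose C = L^dag
Eigen::MatrixXcd Q  = Z * C.inverse();                        // then Q^dag Q = 1 up to rounding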
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
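/*
 * Mapping the pseudocode above onto the variables below, for orientation:
 * Z holds the residual R, MZ holds M1 R, (Q,MQ) hold the thin-QR'd pair,
 * D are the search directions, and the small nrhs x nrhs Eigen matrices
 * m_C, m_S and m_M carry the coefficient updates.
 */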
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
@ -657,23 +361,15 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop();
}
#endif
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop();

View File

@ -31,58 +31,6 @@ directory
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@ -139,19 +87,10 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog); sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related // Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -171,20 +110,11 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R) const std::vector<Field> & R)
{ {
InnerProductMatrix(m_rr,R,R); InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
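Both ThinQRfact hunks above construct the thin QR factors of the residual block from its Gram matrix: symmetrise it, Cholesky-factor it, and set C = L^dagger, Cinv = C^{-1}. A self-contained Eigen sketch of the same identity (toy sizes, illustration only, not Grid code); the point is that Q = R * Cinv then satisfies Q^dagger Q = 1, which is what the rQ variants rely on:

// Cholesky-based thin QR as in ThinQRfact, on a random tall block R.
#include <Eigen/Dense>
#include <iostream>

int main() {
  const int Nvol = 10, Nblock = 4;
  Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(Nvol, Nblock);  // stand-in for the residual block

  Eigen::MatrixXcd m_rr = R.adjoint() * R;        // Gram matrix, the role of sliceInnerProductMatrix
  m_rr = 0.5 * (m_rr + m_rr.adjoint());           // force manifestly Hermitian, as in the solver
  Eigen::MatrixXcd L    = m_rr.llt().matrixL();   // m_rr = L L^dagger
  Eigen::MatrixXcd C    = L.adjoint();            // upper-triangular factor
  Eigen::MatrixXcd Cinv = C.inverse();

  Eigen::MatrixXcd Q = R * Cinv;                  // thin QR: R = Q C with Q^dagger Q = 1
  std::cout << "|| Q^dag Q - 1 || = "
            << (Q.adjoint() * Q - Eigen::MatrixXcd::Identity(Nblock, Nblock)).norm() << std::endl;
  return 0;
}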
@ -256,7 +186,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog); sliceNorm(ssq,B,Orthog);
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog); sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@ -292,9 +221,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
tmp = B - AD; tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q; D=Q;
@ -310,8 +236,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
RealD max_resid=0;
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
@ -356,7 +280,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/ */
m_rr = m_C.adjoint() * m_C; m_rr = m_C.adjoint() * m_C;
max_resid=0; RealD max_resid=0;
RealD rrsum=0; RealD rrsum=0;
RealD rr; RealD rr;
@ -398,9 +322,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
} }
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
@ -544,6 +466,43 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k; IterationsToComplete = k;
} }
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation: // BlockCGrQvec implementation:
//-------------------------- //--------------------------
@ -590,7 +549,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@ -627,7 +585,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) { for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]); Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b]; tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
} }
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);

View File

@ -38,7 +38,6 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
@ -58,22 +57,10 @@ public:
ErrorOnNoConverge(err_on_no_conv) ErrorOnNoConverge(err_on_no_conv)
{}; {};
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient"); GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer; GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start(); PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
@ -83,19 +70,14 @@ public:
//RealD b_pred; //RealD b_pred;
// Was doing copies // Was doing copies
ConstructTimer.Start();
Field p(src.Grid()); Field p(src.Grid());
Field mmp(src.Grid()); Field mmp(src.Grid());
Field r(src.Grid()); Field r(src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up // Initial residual computation & set up
NormTimer.Start();
ssq = norm2(src); ssq = norm2(src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) { if ( guess == 0.0 ) {
r = src; r = src;
p = r; p = r;
@ -107,7 +89,6 @@ public:
a = norm2(p); a = norm2(p);
} }
cp = a; cp = a;
AssignTimer.Stop();
// Handle trivial case of zero src // Handle trivial case of zero src
if (ssq == 0.){ if (ssq == 0.){
@ -183,7 +164,6 @@ public:
} }
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
LogIteration(k,a,b);
IterationTimer.Stop(); IterationTimer.Stop();
if ( (k % 500) == 0 ) { if ( (k % 500) == 0 ) {
@ -240,9 +220,6 @@ public:
<<" residual "<< std::sqrt(cp / ssq)<< std::endl; <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop(); SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl; std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
@ -256,118 +233,5 @@ public:
} }
}; };
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
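The ConjugateGradientPolynomial class removed above records the CG coefficients a_k, b_k and accumulates the iterates as polynomials in the operator applied to the source. With a zero initial guess the recurrences it tracks are

x_{k+1} = x_k + a_k\, p_k, \qquad r_{k+1} = r_k - a_k\, A\, p_k, \qquad p_{k+1} = r_{k+1} + b_k\, p_k, \qquad x_0 = 0,\; r_0 = p_0 = b,

so that x_k = P_{k-1}(A)\, b for a degree k-1 polynomial whose coefficients are exactly the entries of the polynomial vector built up term by term in LogIteration, and CGsequenceHermOp replays the same sequence from the stored a_k, b_k.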
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -102,11 +102,11 @@ public:
assert(mass.size()==nshift); assert(mass.size()==nshift);
assert(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// remove dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift); RealD bs[nshift];
std::vector<RealD> rsq(nshift); RealD rsq[nshift];
std::vector<std::array<RealD,2> > z(nshift); RealD z[nshift][2];
std::vector<int> converged(nshift); int converged[nshift];
const int primary =0; const int primary =0;

View File

@ -123,11 +123,11 @@ public:
assert(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift); RealD bs[nshift];
std::vector<RealD> rsq(nshift); RealD rsq[nshift];
std::vector<RealD> rsqf(nshift); RealD rsqf[nshift];
std::vector<std::array<RealD,2> > z(nshift); RealD z[nshift][2];
std::vector<int> converged(nshift); int converged[nshift];
const int primary =0; const int primary =0;

View File

@ -156,11 +156,11 @@ public:
assert(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift); RealD bs[nshift];
std::vector<RealD> rsq(nshift); RealD rsq[nshift];
std::vector<RealD> rsqf(nshift); RealD rsqf[nshift];
std::vector<std::array<RealD,2> > z(nshift); RealD z[nshift][2];
std::vector<int> converged(nshift); int converged[nshift];
const int primary =0; const int primary =0;
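The three hunks above swap stack arrays sized by the runtime nshift (variable-length arrays, a compiler extension rather than standard C++) for std::vector, with the 2d case mapped onto std::array inside std::vector so the z[s][0], z[s][1] indexing is unchanged. A minimal sketch of the replacement pattern (illustration only):

#include <vector>
#include <array>
typedef double RealD;
int main() {
  int nshift = 4;                                   // runtime size: a plain C array here is a VLA
  std::vector<RealD> bs(nshift), rsq(nshift), rsqf(nshift);
  std::vector<std::array<RealD,2> > z(nshift);      // replaces RealD z[nshift][2]
  std::vector<int> converged(nshift, 0);
  z[0][0] = 1.0; z[0][1] = 1.0;                     // indexing is unchanged from the 2d array
  (void)bs; (void)rsq; (void)rsqf; (void)converged;
  return 0;
}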

View File

@ -279,16 +279,16 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
_sort.push(eval2,Nm); _sort.push(eval2,Nm);
// Glog << "#Ritz value before shift: "<< std::endl; Glog << "#Ritz value before shift: "<< std::endl;
for(int i=0; i<Nm; ++i){ for(int i=0; i<Nm; ++i){
// std::cout.precision(13); std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
} }
//---------------------------------------------------------------------- //----------------------------------------------------------------------
if ( Nm>Nk ) { if ( Nm>Nk ) {
// Glog <<" #Apply shifted QR transformations "<<std::endl; Glog <<" #Apply shifted QR transformations "<<std::endl;
//int k2 = Nk+Nu; //int k2 = Nk+Nu;
int k2 = Nk; int k2 = Nk;
@ -326,7 +326,7 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
_sort.push(eval2,Nk); _sort.push(eval2,Nk);
// Glog << "#Ritz value after shift: "<< std::endl; Glog << "#Ritz value after shift: "<< std::endl;
for(int i=0; i<Nk; ++i){ for(int i=0; i<Nk; ++i){
// std::cout.precision(13); // std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; // std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@ -644,7 +644,7 @@ private:
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; // for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
k_start +=mrhs; k_start +=mrhs;
} }
// Glog << "LinAlg "<< std::endl; Glog << "LinAlg "<< std::endl;
if (b>0) { if (b>0) {
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
@ -678,7 +678,7 @@ private:
} }
w_copy[u] = w[u]; w_copy[u] = w[u];
} }
// Glog << "LinAlg done"<< std::endl; Glog << "LinAlg done"<< std::endl;
// In block version, the steps 6 and 7 in Lanczos construction are // In block version, the steps 6 and 7 in Lanczos construction are
// replaced by the QR decomposition of new basis block. // replaced by the QR decomposition of new basis block.
@ -691,15 +691,15 @@ private:
} }
// re-orthogonalization for numerical stability // re-orthogonalization for numerical stability
// Glog << "Gram Schmidt"<< std::endl; Glog << "Gram Schmidt"<< std::endl;
orthogonalize(w,Nu,evec,R); orthogonalize(w,Nu,evec,R);
// QR part // QR part
for (int u=1; u<Nu; ++u) { for (int u=1; u<Nu; ++u) {
orthogonalize(w[u],w,u); orthogonalize(w[u],w,u);
} }
// Glog << "Gram Schmidt done "<< std::endl; Glog << "Gram Schmidt done "<< std::endl;
// Glog << "LinAlg "<< std::endl; Glog << "LinAlg "<< std::endl;
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
//for (int v=0; v<Nu; ++v) { //for (int v=0; v<Nu; ++v) {
for (int v=u; v<Nu; ++v) { for (int v=u; v<Nu; ++v) {
@ -716,7 +716,7 @@ private:
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; // Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
} }
} }
// Glog << "LinAlg done "<< std::endl; Glog << "LinAlg done "<< std::endl;
if (b < Nm/Nu-1) { if (b < Nm/Nu-1) {
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
@ -935,7 +935,7 @@ if (1){
int Nu, int Nb, int Nk, int Nm, int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
// Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); assert( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk); M = Eigen::MatrixXcd::Zero(Nk,Nk);
@ -953,7 +953,7 @@ if (1){
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
} }
} }
// Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
} }
@ -963,7 +963,7 @@ if (1){
int Nu, int Nb, int Nk, int Nm, int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
// Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); assert( Nk <= Nm );
@ -979,7 +979,7 @@ if (1){
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
} }
} }
// Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
} }
@ -988,7 +988,7 @@ if (1){
RealD Dsh, RealD Dsh,
Eigen::MatrixXcd& Qprod) Eigen::MatrixXcd& Qprod)
{ {
// Glog << "shiftedQRDecompEigen() begin" << '\n'; Glog << "shiftedQRDecompEigen() begin" << '\n';
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@ -1004,7 +1004,7 @@ if (1){
// lower triangular part used to represent series // lower triangular part used to represent series
// of Q sequence. // of Q sequence.
// Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
// equivalent operation of Qprod *= Q // equivalent operation of Qprod *= Q
//M = Eigen::MatrixXcd::Zero(Nm,Nm); //M = Eigen::MatrixXcd::Zero(Nm,Nm);
@ -1025,7 +1025,7 @@ if (1){
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
// Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=0; j<Nm-(Nu+1); ++j) { for (int j=0; j<Nm-(Nu+1); ++j) {
for (int k=0; k<Nu+1+j; ++k) { for (int k=0; k<Nu+1+j; ++k) {
@ -1033,7 +1033,7 @@ if (1){
} }
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=Nm-(Nu+1); j<Nm; ++j) { for (int j=Nm-(Nu+1); j<Nm; ++j) {
for (int k=0; k<Nm; ++k) { for (int k=0; k<Nm; ++k) {
@ -1041,7 +1041,7 @@ if (1){
} }
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
//static int ntimes = 2; //static int ntimes = 2;
//for (int j=0; j<Nm-(ntimes*Nu); ++j) { //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@ -1067,13 +1067,13 @@ if (1){
Mtmp(j,i) = conj(Mtmp(i,j)); Mtmp(j,i) = conj(Mtmp(i,j));
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
M = Mtmp; M = Mtmp;
//M = Q.adjoint()*(M*Q); //M = Q.adjoint()*(M*Q);
@ -1085,7 +1085,7 @@ if (1){
// } // }
//} //}
// Glog << "shiftedQRDecompEigen() end" <<std::endl; Glog << "shiftedQRDecompEigen() end" <<std::endl;
} }
void exampleQRDecompEigen(void) void exampleQRDecompEigen(void)

View File

@ -60,32 +60,6 @@ public:
} }
}; };
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> { template<class Field> class HPDSolver : public LinearFunction<Field> {
private: private:
LinearOperatorBase<Field> & _Matrix; LinearOperatorBase<Field> & _Matrix;

View File

@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
const int _MAX_ITER_EST_ = 200; const int _MAX_ITER_EST_ = 100;
for (int i=0;i<_MAX_ITER_EST_;i++) { for (int i=0;i<_MAX_ITER_EST_;i++) {
@ -30,17 +30,18 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) { if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
evalMaxApprox = na; evalMaxApprox = na;
src_n = tmp;
}
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox; return evalMaxApprox;
} }
evalMaxApprox = na;
src_n = tmp;
}
assert(0);
return 0;
}
}; };
} }
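The PowerMethod diff above changes only the bookkeeping around a standard power iteration: the operator is applied repeatedly and the estimate is the ratio na = vnum/vden. Assuming vnum is the inner product of the current vector with the operator applied to it (the usual Rayleigh-quotient form, not visible in the hunk), the estimate is

\lambda_{\max} \;\approx\; \frac{\langle v_i,\, A\, v_i\rangle}{\langle v_i,\, v_i\rangle}, \qquad v_{i+1} = A\, v_i,

and the two sides of the diff differ in the stopping rule: one runs a fixed 200 iterations and reports the final estimate, the other caps the loop at 100 and exits early once the estimate changes by less than one part in 10^3.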

View File

@ -1,76 +0,0 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}
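The PowerSpectrum class removed above decomposes the norm of a source over spectral bands of the Hermitian operator: for each band it builds a Jackson-smoothed Chebyshev approximation to the indicator ("notch") function of that band, applies it to the source, and reports the captured fraction of the norm. In formula form,

P_b \;=\; \frac{\lVert h_b(A)\,\mathrm{src}\rVert^2}{\lVert \mathrm{src}\rVert^2}, \qquad h_b(\lambda) \approx 1 \ \text{for}\ \lambda \in (\lambda_{b-1},\lambda_b), \quad h_b(\lambda) \approx 0 \ \text{otherwise},

so the per-band powers approximately sum to one when the bands cover the whole spectrum, which is the "total power" line printed at the end.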

View File

@ -99,7 +99,7 @@ public:
CoarseMatrix AselfInvEven; CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd; CoarseMatrix AselfInvOdd;
deviceVector<RealD> dag_factor; Vector<RealD> dag_factor;
/////////////////////// ///////////////////////
// Interface // Interface
@ -124,13 +124,9 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); Vector<Aview> AcceleratorViewContainer;
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) { for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@ -165,7 +161,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}; };
void Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
@ -194,14 +190,9 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@ -210,10 +201,10 @@ public:
int osites=Grid()->oSites(); int osites=Grid()->oSites();
deviceVector<int> points(geom.npoint); Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++) { for(int p=0; p<geom.npoint; p++)
acceleratorPut(points[p],geom.points_dagger[p]); points[p] = geom.points_dagger[p];
}
auto points_p = &points[0]; auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0]; RealD* dag_factor_p = &dag_factor[0];
@ -245,7 +236,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
} }
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
@ -260,14 +251,8 @@ public:
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
@ -300,7 +285,7 @@ public:
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
} }
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{ {
@ -484,20 +469,14 @@ public:
// determine in what order we need the points // determine in what order we need the points
int npoint = geom.npoint-1; int npoint = geom.npoint-1;
deviceVector<int> points(npoint); Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++) { for(int p=0; p<npoint; p++)
int val = (dag && !hermitian) ? geom.points_dagger[p] : p; points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0]; auto points_p = &points[0];
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); Vector<Aview> AcceleratorViewContainer;
hostVector<Aview> hAcceleratorViewContainer(geom.npoint); for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@ -560,7 +539,7 @@ public:
}); });
} }
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
} }
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@ -611,13 +590,11 @@ public:
} }
// GPU readable prefactor // GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, { thread_for(i, nbasis*nbasis, {
int j = i/nbasis; int j = i/nbasis;
int k = i%nbasis; int k = i%nbasis;
h_dag_factor[i] = dag_factor_eigen(j, k); dag_factor[i] = dag_factor_eigen(j, k);
}); });
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
} }
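The hunk above fills the dagger prefactor table on the host and then moves it into device-resident storage with one explicit acceleratorCopyToDevice call instead of relying on unified memory. A self-contained toy of that staging pattern (a memcpy stub stands in for the real host-to-device copy, and the toy values replace dag_factor_eigen; the stand-in names are assumptions, not Grid code):

#include <vector>
#include <cstring>
typedef double RealD;
// Stand-in for Grid's acceleratorCopyToDevice(from,to,bytes); here just a memcpy so the
// sketch runs without an accelerator backend.
static void copyToDeviceStub(void *from, void *to, size_t bytes) { std::memcpy(to, from, bytes); }
int main() {
  const int nbasis = 4;
  std::vector<RealD> h_dag_factor(nbasis * nbasis);  // host staging buffer, as in the hunk above
  std::vector<RealD> dag_factor(nbasis * nbasis);    // stand-in for deviceVector<RealD>
  for (int i = 0; i < nbasis * nbasis; i++) {
    int j = i / nbasis, k = i % nbasis;
    h_dag_factor[i] = (j == k) ? 1.0 : -1.0;         // toy values in place of dag_factor_eigen(j,k)
  }
  copyToDeviceStub(&h_dag_factor[0], &dag_factor[0], h_dag_factor.size() * sizeof(RealD));
  return 0;
}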
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

View File

@ -174,11 +174,21 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview #ifdef ACCELERATOR_CSHIFT
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // // Cshift on device
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page template<class T> using cshiftAllocator = devAllocator<T>;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector #else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
/*
template<class T> class vecView template<class T> class vecView
{ {
protected: protected:
@ -187,9 +197,8 @@ template<class T> class vecView
ViewMode mode; ViewMode mode;
void * cpu_ptr; void * cpu_ptr;
public: public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(Vector<T> &refer_to_me,ViewMode _mode) vecView(std::vector<T> &refer_to_me,ViewMode _mode)
{ {
cpu_ptr = &refer_to_me[0]; cpu_ptr = &refer_to_me[0];
size = refer_to_me.size(); size = refer_to_me.size();
@ -205,15 +214,26 @@ template<class T> class vecView
} }
}; };
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode) template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
{ {
vecView<T> ret(vec,_mode); // does the open vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed return ret; // must be closed
} }
// Little autoscope assister
template<class View>
class VectorViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
VectorViewCloser(View &_v) : v(_v) {};
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
};
#define autoVecView(v_v,v,mode) \ #define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \ auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v); ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size; uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn; off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size; uint64_t npages = (BYTES + page_size-1) / page_size;
std::vector<uint64_t> pagedata(npages); uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET); uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset); assert(ret == offset);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages); ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages); assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512; int nhugepages = npages / 512;
int n4ktotal, nnothuge; int n4ktotal, nnothuge;

View File

@ -82,7 +82,6 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask; Coordinate _checker_dim_mask;
int _checker_dim;
public: public:
@ -92,6 +91,7 @@ public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0; virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0; virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerDim(void){ return 0; };
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;

View File

@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
// Coordinate _checker_dim_mask; Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@ -106,7 +106,6 @@ public:
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);; _checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);

View File

@ -57,10 +57,10 @@ class GridRedBlackCartesian : public GridBase
{ {
public: public:
// Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
// int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
virtual int isCheckerBoarded(void) const { return 1; }; virtual int CheckerDim(void){ return _checker_dim; };
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;

View File

@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{ {

View File

@ -30,11 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table; extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device; extern commVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void) inline std::pair<int,int> *MapCshiftTable(void)
{ {
// GPU version // GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size(); uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) { if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz); Cshift_table_device.resize(sz);
@ -44,13 +45,16 @@ inline std::pair<int,int> *MapCshiftTable(void)
sizeof(Cshift_table[0])*sz); sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0]; return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version uses identity map // CPU version uses identity map
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -90,10 +94,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@ -118,6 +129,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){ if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@ -128,10 +140,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else { } else {
Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask; Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@ -152,13 +175,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -202,10 +245,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite); autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
} }
} }
@ -228,6 +278,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite); autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@ -236,6 +287,14 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block; int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
@ -301,11 +360,19 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite); autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@ -345,11 +412,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead); autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite); autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
} }
} }

View File

@ -94,7 +94,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl; //std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3); Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@ -104,6 +104,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@ -123,8 +125,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size); static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size); static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@ -159,7 +161,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
grid->Barrier(); // grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
@ -167,7 +169,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
recv_from_rank, recv_from_rank,
bytes); bytes);
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); // grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
tscatter-=usecond(); tscatter-=usecond();
@ -175,11 +177,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tscatter+=usecond(); tscatter+=usecond();
} }
} }
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
} }
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -220,8 +224,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd); static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi; scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi; scalar_object * send_buf_extract_mpi;
@ -277,7 +281,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
grid->Barrier(); // grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0];
@ -288,7 +292,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
bytes); bytes);
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); // grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
@ -301,12 +305,242 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
} }
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
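Both Cshift_comms variants above (the device-direct path and the ShmBuffer-staged path in this hunk) follow the same per-plane control flow: when the source plane for a given x is on this rank (comm_proc==0) it is handled by Copy_plane, otherwise the plane is gathered into a send buffer, exchanged with the neighbouring rank via SendToRecvFrom, and scattered into the result. A single-process toy of just that index logic (illustrative only; the direct cross-rank read stands in for the gather/exchange/scatter step):

#include <vector>
#include <iostream>
int main() {
  const int rd = 4, pd = 2, fd = rd * pd, shift = 3;      // local planes, "ranks", global planes
  std::vector<std::vector<int>> src(pd, std::vector<int>(rd));
  for (int p = 0; p < pd; p++)
    for (int x = 0; x < rd; x++) src[p][x] = p * rd + x;  // one value per plane for clarity
  std::vector<std::vector<int>> ret(pd, std::vector<int>(rd));
  for (int p = 0; p < pd; p++) {
    for (int x = 0; x < rd; x++) {
      int sx        = (x + shift) % rd;                   // source plane within its rank
      int comm_proc = ((x + shift) / rd) % pd;            // how many ranks away the source sits
      if (comm_proc == 0) ret[p][x] = src[p][sx];                    // Copy_plane: purely local
      else                ret[p][x] = src[(p + comm_proc) % pd][sx]; // gather / exchange / scatter
    }
  }
  for (int p = 0; p < pd; p++)
    for (int x = 0; x < rd; x++)
      std::cout << "rank " << p << " plane " << x
                << " expects global plane " << (p * rd + x + shift) % fd
                << " and got " << ret[p][x] << std::endl;
  return 0;
}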
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
*/
}
#endif
NAMESPACE_END(Grid);
#endif
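Both Cshift_comms variants above follow the same three-step halo pattern: gather the plane to be shifted into a contiguous send buffer, exchange that buffer with the neighbouring rank in the shift direction, then scatter the received plane into the destination field. For orientation only, a minimal standalone sketch of that pattern in plain MPI (hypothetical sizes and buffers; Grid itself routes the exchange through grid->SendToRecvFrom and shared-memory staging buffers):

// Gather / exchange / scatter sketch: each rank fills a boundary plane,
// swaps it with its neighbour, and would then scatter the received plane.
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, nranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  const int plane = 4;                                   // sites per boundary plane
  std::vector<double> send_buf(plane), recv_buf(plane);
  for (int i = 0; i < plane; i++) send_buf[i] = rank * plane + i;   // "gather"

  int xmit_to_rank   = (rank + 1) % nranks;              // shift by one processor
  int recv_from_rank = (rank - 1 + nranks) % nranks;
  MPI_Sendrecv(send_buf.data(), plane, MPI_DOUBLE, xmit_to_rank,   0,
               recv_buf.data(), plane, MPI_DOUBLE, recv_from_rank, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  // "scatter": copy recv_buf into the shifted plane of the result field
  printf("rank %d received plane starting with %g\n", rank, recv_buf[0]);
  MPI_Finalize();
  return 0;
}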

View File

@ -1,5 +1,5 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table; std::vector<std::pair<int,int> > Cshift_table;
deviceVector<std::pair<int,int> > Cshift_table_device; commVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -236,20 +236,17 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp; vobj vtmp;
vtmp = r; vtmp = r;
#if 0 #if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss]= r; me[ss]= r;
}); });
#else
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(vtmp);
coalescedWrite(me[ss],stmp);
});
#endif #endif
me.ViewClose(); me.ViewClose();
return *this; return *this;

View File

@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field; typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
hostVector<View> h_basis_v(basis.size()); Vector<View> basis_v; basis_v.reserve(basis.size());
deviceVector<View> d_basis_v(basis.size()); typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t; typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
h_basis_v[k] = basis[k].View(AcceleratorWrite); basis_v.push_back(basis[k].View(AcceleratorWrite));
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
View *basis_vp = &d_basis_v[0]; #if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0; int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda if (!nrot) // edge case not handled gracefully by Cuda
@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites(); uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
deviceVector <vobj> Bt(siteBlock * nrot); Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0]; auto Bp=&Bt[0];
// GPU readable copy of matrix // GPU readable copy of matrix
hostVector<Coeff_t> h_Qt_jv(Nm*Nm); Vector<Coeff_t> Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0]; Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{ thread_for(i,Nm*Nm,{
int j = i/Nm; int j = i/Nm;
int k = i%Nm; int k = i%Nm;
h_Qt_jv[i]=Qt(j,k); Qt_p[i]=Qt(j,k);
}); });
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down // Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){ for(uint64_t s=0;s<oSites;s+=siteBlock){
@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
}); });
} }
#endif
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
// Extract a single rotated vector // Extract a single rotated vector
@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
hostVector<View> h_basis_v(basis.size()); Vector<View> basis_v; basis_v.reserve(basis.size());
deviceVector<View> d_basis_v(basis.size());
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
h_basis_v[k]=basis[k].View(AcceleratorRead); basis_v.push_back(basis[k].View(AcceleratorRead));
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
vobj zz=Zero(); vobj zz=Zero();
deviceVector<double> Qt_jv(Nm); Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k)); for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& d_basis_v[0]; auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite); autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero(); vobj zzz=Zero();
@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
template<class Field> template<class Field>
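Per lattice site, both code paths of basisRotate implement the same dense rotation of the basis vectors, B_j = sum_k Q(j,k) * b_k, followed by writing B back over the original vectors; basisRotateJ extracts a single such combination. A toy scalar sketch of that per-site update, with plain std::vector in place of Grid views and SIMD types (illustration only):

// Toy per-site rotation B[j] = sum_k Q(j,k) * basis[k], written back in place.
#include <vector>
#include <cstdio>

int main() {
  const int Nm = 3;                      // number of basis vectors
  const int sites = 4;                   // one scalar per site here
  std::vector<std::vector<double>> basis(Nm, std::vector<double>(sites));
  std::vector<std::vector<double>> Q(Nm, std::vector<double>(Nm, 0.0));

  for (int k = 0; k < Nm; k++)
    for (int s = 0; s < sites; s++) basis[k][s] = k + 0.1 * s;
  for (int j = 0; j < Nm; j++) Q[j][(j + 1) % Nm] = 1.0;   // simple permutation

  for (int s = 0; s < sites; s++) {                        // site-local rotation
    std::vector<double> B(Nm, 0.0);
    for (int j = 0; j < Nm; j++)
      for (int k = 0; k < Nm; k++) B[j] += Q[j][k] * basis[k][s];
    for (int j = 0; j < Nm; j++) basis[j][s] = B[j];
  }
  printf("basis[0][0] after rotation = %g\n", basis[0][0]);   // picks up old basis[1][0]
  return 0;
}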

View File

@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
// assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx); pt[w] = getlane(vp[w],idx);
} }
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return; return;
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
// assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);

View File

@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@ -343,6 +343,18 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead); autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead); autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
deviceVector<inner_t> inner_tmp; deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites); inner_tmp.resize(sites);
@ -354,6 +366,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
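The axpby_norm_fast hunk above fuses two operations into a single pass over the data: it writes z = a*x + b*y and accumulates the norm of z from the same temporaries, so z is not re-read just to compute the reduction. A toy scalar version of that fused loop (real numbers, no SIMD or global sum):

// Fused axpby + norm: compute z = a*x + b*y and |z|^2 in one sweep.
#include <vector>
#include <cstdio>

int main() {
  const int N = 8;
  const double a = 2.0, b = -1.0;
  std::vector<double> x(N, 3.0), y(N, 1.0), z(N);

  double nrm = 0.0;
  for (int i = 0; i < N; i++) {
    double tmp = a * x[i] + b * y[i];    // tmp = 2*3 - 1 = 5
    z[i] = tmp;
    nrm += tmp * tmp;                    // local contribution to the norm
  }
  printf("nrm = %g (expect %g)\n", nrm, 25.0 * N);
  return 0;
}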
@ -364,7 +377,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
conformable(left,right); conformable(left,right);
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
std::vector<ComplexD> tmp(2); Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
@ -374,8 +387,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU // GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t;
deviceVector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
deviceVector<norm_t> norm_tmp(sites); Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{ {
@ -425,9 +438,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data, template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
{ {
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// FIXME precision promoted summation // FIXME precision promoted summation
@ -449,8 +460,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vobj> lvSum(rd); // will locally sum vectors first Vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
@ -508,20 +519,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result; return result;
} }
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj> template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{ {
@ -541,8 +539,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type> lvSum(rd); // will locally sum vectors first Vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@ -672,96 +670,203 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
} }
}; };
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{ {
int NN = BlockSolverGrid->_ndimension; int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd(); int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(NN-1); std::vector<int> latt_phys(0);
Coordinate simd_phys; std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(NN-1); std::vector<int> mpi_phys(0);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
for(int d=0;d<NN;d++){ for(int d=0;d<NN;d++){
if( d!=Orthog ) { if( d!=Orthog ) {
latt_phys[dd]=BlockSolverGrid->_fdimensions[d]; latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
mpi_phys[dd] =BlockSolverGrid->_processors[d]; simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d]; mpi_phys.push_back(BlockSolverGrid->_processors[d]);
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
} }
} }
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd); return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
} }
*/
template<class vobj> template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{ {
GridBase *FullGrid = X.Grid();
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
RR = R; // Copies checkerboard for insert
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
for(int i=0;i<Nslice;i++){ int Nblock = X.Grid()->GlobalDimensions()[Orthog];
ExtractSlice(Ys,Y,i,Orthog);
ExtractSlice(Rs,R,i,Orthog); GridBase *FullGrid = X.Grid();
Rs=Ys; // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
for(int j=0;j<Nslice;j++){
ExtractSlice(Xs,X,j,Orthog); // Lattice<vobj> Xslice(SliceGrid);
Rs = Rs + Xs*(scale*aa(j,i)); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
} }
InsertSlice(Rs,RR,i,Orthog);
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
} }
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
}; };
template<class vobj> template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{ {
R=Zero(); typedef typename vobj::scalar_object sobj;
sliceMaddMatrix(R,aa,X,R,Orthog,scale); typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
}; };
template<class vobj> template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{ {
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice); GridBase *FullGrid = lhs.Grid();
for(int s=0;s<Nslice;s++){ // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
ExtractSlice(ls,lhs,s,Orthog);
for(int ss=0;ss<Nslice;ss++){ int Nblock = FullGrid->GlobalDimensions()[Orthog];
ExtractSlice(rs,rhs,ss,Orthog);
mat(s,ss) = innerProduct(ls,rs); // Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
} }
} }
delete SliceGrid;
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
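Both versions of sliceInnerProductMatrix in this file compute the same object: an Nblock x Nblock matrix whose (i,j) entry is the inner product of slice i of lhs with slice j of rhs, the slices being planes orthogonal to the chosen direction; one path extracts whole slices while the other walks the plane with stride/ostride indexing. A minimal scalar sketch of that computation (toy layout, hypothetical sizes, no MPI global sum):

// Toy slice inner-product matrix for a field laid out as slice-major arrays.
#include <vector>
#include <cstdio>

int main() {
  const int Nblock = 3;                  // extent of the orthogonal direction
  const int plane  = 8;                  // sites per slice
  std::vector<double> lhs(Nblock * plane), rhs(Nblock * plane);
  for (int i = 0; i < Nblock * plane; i++) { lhs[i] = 1.0; rhs[i] = i % plane; }

  std::vector<double> mat(Nblock * Nblock, 0.0);
  for (int b = 0; b < plane; b++)                      // loop over the plane
    for (int i = 0; i < Nblock; i++)
      for (int j = 0; j < Nblock; j++)
        mat[i * Nblock + j] += lhs[i * plane + b] * rhs[j * plane + b];

  printf("mat(0,0) = %g\n", mat[0]);                   // 0+1+...+7 = 28
  return 0;
}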

View File

@ -214,12 +214,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
// Move out of UVM // Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream // Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise // as running this on the default stream fools the synchronise
deviceVector<sobj> buffer(numBlocks); #undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
sobj result; sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier(); accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result; return result;
} }
@ -234,7 +244,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
deviceVector<vector> buffer(osites); Vector<vector> buffer(osites);
vector *dat = (vector *)lat; vector *dat = (vector *)lat;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0]; iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
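The sumD_gpu_small hunk above is a two-stage reduction: a device kernel (reduceKernel) writes one partial sum per block into a small buffer, and that buffer of numBlocks values is then combined into the final result, either after a copy back to the host or directly through UVM depending on the branch taken. A plain C++ sketch of the same two-stage structure (loops standing in for the device kernel):

// Two-stage sum: per-block partial sums, then a reduction of the partials.
#include <algorithm>
#include <vector>
#include <cstdio>

int main() {
  const int N = 1000, blockSize = 128;
  const int numBlocks = (N + blockSize - 1) / blockSize;
  std::vector<double> lat(N, 1.0), partial(numBlocks, 0.0);

  for (int b = 0; b < numBlocks; b++)                  // stage 1: one sum per block
    for (int i = b * blockSize; i < std::min(N, (b + 1) * blockSize); i++)
      partial[b] += lat[i];

  double result = 0.0;                                 // stage 2: reduce the partials
  for (int b = 0; b < numBlocks; b++) result += partial[b];
  printf("sum = %g (expect %d)\n", result, N);
  return 0;
}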

View File

@ -4,20 +4,23 @@ NAMESPACE_BEGIN(Grid);
// Possibly promote to double and sum // Possibly promote to double and sum
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj> template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD; typedef typename vobj::scalar_objectD sobjD;
static Vector<sobj> mysum;
mysum.resize(1);
sobj *mysum_p = & mysum[0];
sobj identity; zeroit(identity); sobj identity; zeroit(identity);
sobj ret; zeroit(ret); mysum[0] = identity;
sobj ret ;
Integer nsimd= vobj::Nsimd(); Integer nsimd= vobj::Nsimd();
{
sycl::buffer<sobj, 1> abuff(&ret, {1}); const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>()); auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
cgh.parallel_for(cl::sycl::range<1>{osites}, cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction, Reduction,
[=] (cl::sycl::id<1> item, auto &sum) { [=] (cl::sycl::id<1> item, auto &sum) {
@ -25,7 +28,9 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
sum +=Reduce(lat[osite]); sum +=Reduce(lat[osite]);
}); });
}); });
} theGridAccelerator->wait();
ret = mysum[0];
// free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret); sobjD dret; convertType(dret,ret);
return dret; return dret;
} }
@ -71,22 +76,59 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
template<class Word> Word svm_xor(Word *vec,uint64_t L) template<class Word> Word svm_xor(Word *vec,uint64_t L)
{ {
Word xorResult; xorResult = 0;
static Vector<Word> d_sum;
d_sum.resize(1);
Word *d_sum_p=&d_sum[0];
Word identity; identity=0; Word identity; identity=0;
Word ret = 0; d_sum[0] = identity;
{ const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
cgh.parallel_for(cl::sycl::range<1>{L}, cgh.parallel_for(cl::sycl::range<1>{L},
Reduction, Reduction,
[=] (cl::sycl::id<1> index, auto &sum) { [=] (cl::sycl::id<1> index, auto &sum) {
sum^=vec[index]; sum^=vec[index];
}); });
}); });
}
theGridAccelerator->wait(); theGridAccelerator->wait();
Word ret = d_sum[0];
// free(d_sum,*theGridAccelerator);
return ret; return ret;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/
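Both sides of the sumD_gpu_tensor and svm_xor hunks drive the SYCL 2020 reduction interface, one through a sycl::buffer bound to the handler and one through a raw pointer with the initialize_to_identity property. A self-contained sketch of the buffer flavour, summing an array on whatever device the default queue selects (generic SYCL, not Grid's theGridAccelerator queue):

// Minimal SYCL buffer reduction: the result is written back when rbuf is destroyed.
#include <sycl/sycl.hpp>
#include <vector>
#include <cstdio>

int main() {
  const size_t N = 1024;
  std::vector<int> data(N, 1);
  int result = 0;
  sycl::queue q;
  {
    sycl::buffer<int, 1> dbuf(data.data(), sycl::range<1>(N));
    sycl::buffer<int, 1> rbuf(&result, sycl::range<1>(1));
    q.submit([&](sycl::handler &cgh) {
      auto in  = dbuf.get_access<sycl::access::mode::read>(cgh);
      auto red = sycl::reduction(rbuf, cgh, sycl::plus<int>());
      cgh.parallel_for(sycl::range<1>(N), red,
                       [=](sycl::id<1> idx, auto &sum) { sum += in[idx]; });
    });
  }   // buffer destruction synchronises and copies the sum back into 'result'
  printf("sum = %d (expect %zu)\n", result, N);
  return 0;
}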

View File

@ -21,18 +21,9 @@ NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP) #if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
deviceVector<vobj> reduction_buffer(rd*subvol_size); commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
vobj zero_init; vobj zero_init;
zeroit(zero_init); zeroit(zero_init);
@ -103,15 +94,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
#if defined(GRID_SYCL) #if defined(GRID_SYCL)
template<class vobj> template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
@ -122,7 +105,7 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
mysum[r] = vobj_zero; mysum[r] = vobj_zero;
} }
deviceVector<vobj> reduction_buffer(rd*subvol_size); commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
@ -161,23 +144,14 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
} }
#endif #endif
template<class vobj> template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector; typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2; const int osites = rd*e1*e2;
deviceVector<vector>buffer(osites); commVector<vector>buffer(osites);
vector *dat = (vector *)Data; vector *dat = (vector *)Data;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
std::vector<vector> lvSum_small(rd); Vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0]; vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) { for (int w = 0; w < words; w++) {
@ -194,18 +168,13 @@ inline void sliceSumReduction_large(const vobj *Data,
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r]; lvSum_ptr[w+words*r]=lvSum_small[r];
} }
}
} }
template<class vobj>
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, }
std::vector<vobj> &lvSum,
const int rd, template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{ {
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) { if constexpr (sizeof(vobj) <= 256) {
@ -223,15 +192,7 @@ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
} }
template<class vobj> template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
@ -247,19 +208,15 @@ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
}); });
} }
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else #else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif #endif
} }
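All of the sliceSumReduction_* helpers in this file produce the same thing: for each value of the coordinate along the orthogonal direction, the sum of the field over the plane of all other coordinates, leaving one (vector) value per slice. A toy scalar version of that slice sum (2-d layout assumed for illustration):

// Toy sliceSum: sum over x for every t, one result per t-slice.
#include <vector>
#include <cstdio>

int main() {
  const int Lx = 4, Lt = 3;
  std::vector<double> field(Lx * Lt);
  for (int t = 0; t < Lt; t++)
    for (int x = 0; x < Lx; x++) field[x + Lx * t] = t + 1;   // constant per slice

  std::vector<double> result(Lt, 0.0);
  for (int t = 0; t < Lt; t++)
    for (int x = 0; x < Lx; x++) result[t] += field[x + Lx * t];

  for (int t = 0; t < Lt; t++) printf("slice %d sum = %g\n", t, result[t]);  // 4 8 12
  return 0;
}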

View File

@ -43,49 +43,20 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{ {
half.Checkerboard() = cb; acceleratorPickCheckerboard(cb,half,full);
autoView( half_v, half, CpuWrite);
autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{
int cbos;
Coordinate coor;
full.Grid()->oCoorFromOindex(coor,ss);
cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) {
int ssh=half.Grid()->oIndex(coor);
half_v[ssh] = full_v[ss];
}
});
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{ {
int cb = half.Checkerboard(); acceleratorSetCheckerboard(full,half);
autoView( half_v , half, CpuRead);
autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
int cbos;
full.Grid()->oCoorFromOindex(coor,ss);
cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) {
int ssh=half.Grid()->oIndex(coor);
full_v[ss]=half_v[ssh];
}
});
} }
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0) template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
{ {
half.Checkerboard() = cb; half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite); autoView(half_v, half, AcceleratorWrite);
@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
unsigned long ndim_half = half.Grid()->_ndimension; unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride; Coordinate ostride_half = half.Grid()->_ostride;
int checker_dim_half = half.Grid()->CheckerDim();
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor; Coordinate coor;
@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
} }
}); });
} }
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0) template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
{ {
int cb = half.Checkerboard(); int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead); autoView(half_v , half, AcceleratorRead);
@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
unsigned long ndim_half = half.Grid()->_ndimension; unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride; Coordinate ostride_half = half.Grid()->_ostride;
int checker_dim_half = half.Grid()->CheckerDim();
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor; Coordinate coor;
@ -981,14 +954,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice; hcoor[orthog] = slice;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl]; hcoor[d]=lcoor[ddl++];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
} }
ddl++;
}
} }
peekLocalSite(s,lowDimv,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor); pokeLocalSite(s,higherDimv,hcoor);
@ -1009,7 +976,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
assert(orthog<nh); assert(orthog<nh);
assert(orthog>=0); assert(orthog>=0);
assert(hg->_processors[orthog]==1); assert(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0; int dl; dl = 0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
@ -1027,16 +993,11 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl); Coordinate lcoor(nl);
Coordinate hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
hcoor[orthog] = slice;
int ddl=0; int ddl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl]; hcoor[d]=lcoor[ddl++];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
} }
} }
peekLocalSite(s,higherDimv,hcoor); peekLocalSite(s,higherDimv,hcoor);
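The pickCheckerboard/setCheckerboard routines earlier in this file move sites between a full lattice and a red-black half lattice according to the parity of the site coordinates, and the accelerator variants do the same selection inside an accelerator_for while carrying the checkerboarded dimension explicitly. A toy scalar sketch of the underlying even/odd split (2-d layout assumed, illustration only):

// Even/odd split: sites with even coordinate sum go to one half, odd to the other.
#include <vector>
#include <cstdio>

int main() {
  const int Lx = 4, Ly = 4;
  std::vector<double> full(Lx * Ly), even, odd;
  for (int i = 0; i < Lx * Ly; i++) full[i] = i;

  for (int y = 0; y < Ly; y++)
    for (int x = 0; x < Lx; x++) {
      int cb = (x + y) & 1;                              // checkerboard parity
      (cb == 0 ? even : odd).push_back(full[x + Lx * y]);
    }
  printf("even sites: %zu, odd sites: %zu\n", even.size(), odd.size());
  return 0;
}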

View File

@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
* *
*/ */
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf, template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
Lattice<vobj> &lat, Lattice<vobj> &lat,
int x, int x,
int dim, int dim,
@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
}); });
} }
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf, template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
const Lattice<vobj> &lat, const Lattice<vobj> &lat,
int x, int x,
int dim, int dim,
@ -462,8 +462,8 @@ public:
int rNsimd = Nsimd / simd[dimension]; int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static deviceVector<vobj> send_buf; static cshiftVector<vobj> send_buf;
static deviceVector<vobj> recv_buf; static cshiftVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);

View File

@ -90,16 +90,16 @@ public:
void M5D(const FermionField &psi, void M5D(const FermionField &psi,
const FermionField &phi, const FermionField &phi,
FermionField &chi, FermionField &chi,
std::vector<Coeff_t> &lower, Vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, Vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper); Vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi, void M5Ddag(const FermionField &psi,
const FermionField &phi, const FermionField &phi,
FermionField &chi, FermionField &chi,
std::vector<Coeff_t> &lower, Vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, Vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper); Vector<Coeff_t> &upper);
virtual void Instantiatable(void)=0; virtual void Instantiatable(void)=0;
@ -119,35 +119,35 @@ public:
RealD mass_plus, mass_minus; RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal // Save arguments to SetCoefficientsInternal
std::vector<Coeff_t> _gamma; Vector<Coeff_t> _gamma;
RealD _zolo_hi; RealD _zolo_hi;
RealD _b; RealD _b;
RealD _c; RealD _c;
// Cayley form Moebius (tanh and zolotarev) // Cayley form Moebius (tanh and zolotarev)
std::vector<Coeff_t> omega; Vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs Vector<Coeff_t> bs; // S dependent coeffs
std::vector<Coeff_t> cs; Vector<Coeff_t> cs;
std::vector<Coeff_t> as; Vector<Coeff_t> as;
// For preconditioning Cayley form // For preconditioning Cayley form
std::vector<Coeff_t> bee; Vector<Coeff_t> bee;
std::vector<Coeff_t> cee; Vector<Coeff_t> cee;
std::vector<Coeff_t> aee; Vector<Coeff_t> aee;
std::vector<Coeff_t> beo; Vector<Coeff_t> beo;
std::vector<Coeff_t> ceo; Vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo; Vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix // LDU factorisation of the eeoo matrix
std::vector<Coeff_t> lee; Vector<Coeff_t> lee;
std::vector<Coeff_t> leem; Vector<Coeff_t> leem;
std::vector<Coeff_t> uee; Vector<Coeff_t> uee;
std::vector<Coeff_t> ueem; Vector<Coeff_t> ueem;
std::vector<Coeff_t> dee; Vector<Coeff_t> dee;
// Matrices of 5d ee inverse params // Matrices of 5d ee inverse params
// std::vector<iSinglet<Simd> > MatpInv; Vector<iSinglet<Simd> > MatpInv;
// std::vector<iSinglet<Simd> > MatmInv; Vector<iSinglet<Simd> > MatmInv;
// std::vector<iSinglet<Simd> > MatpInvDag; Vector<iSinglet<Simd> > MatpInvDag;
// std::vector<iSinglet<Simd> > MatmInvDag; Vector<iSinglet<Simd> > MatmInvDag;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities
@ -187,7 +187,7 @@ public:
protected: protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c); virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -90,12 +90,12 @@ protected:
RealD mass; RealD mass;
RealD R; RealD R;
RealD ZoloHiInv; RealD ZoloHiInv;
std::vector<double> Beta; Vector<double> Beta;
std::vector<double> cc;; Vector<double> cc;;
std::vector<double> cc_d;; Vector<double> cc_d;;
std::vector<double> sqrt_cc; Vector<double> sqrt_cc;
std::vector<double> See; Vector<double> See;
std::vector<double> Aee; Vector<double> Aee;
}; };

View File

@ -69,10 +69,10 @@ public:
// Instantiate different versions depending on Impl // Instantiate different versions depending on Impl
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
virtual void RefreshShiftCoefficients(RealD new_shift); virtual void RefreshShiftCoefficients(RealD new_shift);
@ -83,7 +83,7 @@ public:
RealD _M5, const ImplParams& p=ImplParams()); RealD _M5, const ImplParams& p=ImplParams());
protected: protected:
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c); void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -102,11 +102,11 @@ public:
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag); const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@ -164,6 +164,8 @@ public:
DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd; DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities

View File

@ -100,6 +100,7 @@ public:
int dag); int dag);
void DhopInternal(StencilImpl & st, void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@ -107,6 +108,7 @@ public:
int dag); int dag);
void DhopInternalOverlappedComms(StencilImpl & st, void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@ -114,6 +116,7 @@ public:
int dag); int dag);
void DhopInternalSerialComms(StencilImpl & st, void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@ -189,6 +192,8 @@ public:
DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd; DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer // Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -42,11 +42,11 @@ public:
public: public:
// Shift operator coefficients for red-black preconditioned Mobius EOFA // Shift operator coefficients for red-black preconditioned Mobius EOFA
std::vector<Coeff_t> Mooee_shift; Vector<Coeff_t> Mooee_shift;
std::vector<Coeff_t> MooeeInv_shift_lc; Vector<Coeff_t> MooeeInv_shift_lc;
std::vector<Coeff_t> MooeeInv_shift_norm; Vector<Coeff_t> MooeeInv_shift_norm;
std::vector<Coeff_t> MooeeInvDag_shift_lc; Vector<Coeff_t> MooeeInvDag_shift_lc;
std::vector<Coeff_t> MooeeInvDag_shift_norm; Vector<Coeff_t> MooeeInvDag_shift_norm;
virtual void Instantiatable(void) {}; virtual void Instantiatable(void) {};
@ -74,18 +74,18 @@ public:
// Instantiate different versions depending on Impl // Instantiate different versions depending on Impl
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs); Vector<Coeff_t>& shift_coeffs);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs); Vector<Coeff_t>& shift_coeffs);
virtual void RefreshShiftCoefficients(RealD new_shift); virtual void RefreshShiftCoefficients(RealD new_shift);

View File

@ -102,11 +102,11 @@ public:
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag); const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, DoubledGaugeField &U, void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@ -152,6 +152,9 @@ public:
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -94,8 +94,8 @@ protected:
RealD R; RealD R;
RealD amax; RealD amax;
RealD scale; RealD scale;
std::vector<double> p; Vector<double> p;
std::vector<double> q; Vector<double> q;
}; };

View File

@ -35,7 +35,7 @@ template<class Matrix, class Field>
class KappaSimilarityTransform { class KappaSimilarityTransform {
public: public:
INHERIT_IMPL_TYPES(Matrix); INHERIT_IMPL_TYPES(Matrix);
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag; Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
KappaSimilarityTransform (Matrix &zmob) { KappaSimilarityTransform (Matrix &zmob) {
for (int i=0;i<(int)zmob.bs.size();i++) { for (int i=0;i<(int)zmob.bs.size();i++) {

View File

@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
public: public:
void DhopImproved(StencilImpl &st, void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU, DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior); const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior); const FermionField &in, FermionField &out, int dag, int interior,int exterior);

View File

@ -47,7 +47,7 @@ public:
static int PartialCompressionFactor(GridBase *grid) { return 1;} static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif #endif
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table, static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs, const Lattice<vobj> &rhs,
cobj *buffer, cobj *buffer,
compressor &compress, compressor &compress,
@ -109,7 +109,7 @@ public:
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask, std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial) compressor &compress,int type,int partial)
{ {
@ -197,7 +197,7 @@ public:
#endif #endif
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table, static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs, const Lattice<vobj> &rhs,
cobj *buffer, cobj *buffer,
compressor &compress, compressor &compress,
@ -208,7 +208,7 @@ public:
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
} }
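Only the container holding the gather table changes in these compressor interfaces (deviceVector versus commVector of index pairs); the call shape is identical on both sides. A hedged usage sketch, using just the signature shown in this hunk and assuming the table has already been filled by the stencil setup:

  // Each table entry pairs a position in the comms buffer with a lattice offset for one face site;
  // the container alias only decides where that table lives (device memory vs comms-visible memory).
  deviceVector<std::pair<int,int> > table;   // commVector<std::pair<int,int> > on the other side
  Gather_plane_simple(table, rhs, buffer, compress, off, so, partial);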
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask, std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial) compressor &compress,int type,int partial)
{ {
@ -402,6 +402,7 @@ public:
typedef CartesianStencil<vobj,cobj,Parameters> Base; typedef CartesianStencil<vobj,cobj,Parameters> Base;
typedef typename Base::View_type View_type; typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector;
// Vector<int> surface_list; // Vector<int> surface_list;
WilsonStencil(GridBase *grid, WilsonStencil(GridBase *grid,

View File

@ -126,16 +126,13 @@ public:
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag); const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st, void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
// Constructor // Constructor
@ -171,6 +168,9 @@ public:
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
WilsonAnisotropyCoefficients anisotropyCoeff; WilsonAnisotropyCoefficients anisotropyCoeff;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -135,18 +135,21 @@ public:
int dag); int dag);
void DhopInternal(StencilImpl & st, void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
int dag); int dag);
void DhopInternalOverlappedComms(StencilImpl & st, void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
int dag); int dag);
void DhopInternalSerialComms(StencilImpl & st, void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
@ -200,6 +203,9 @@ public:
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer // Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -58,7 +58,7 @@ public:
{ {
// RealD eps = 1.0; // RealD eps = 1.0;
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl; std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
std::vector<Coeff_t> zgamma(this->Ls); Vector<Coeff_t> zgamma(this->Ls);
for(int s=0;s<this->Ls;s++){ for(int s=0;s<this->Ls;s++){
zgamma[s] = gamma[s]; zgamma[s] = gamma[s];
} }

View File

@ -156,18 +156,18 @@ template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag (Ls,1.0); Vector<Coeff_t> diag (Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus; Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus; Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
M5D(psi,chi,chi,lower,diag,upper); M5D(psi,chi,chi,lower,diag,upper);
} }
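Both sides of this hunk build the same length-Ls tridiagonal coefficient set before calling the M5D kernel; only the container type differs. As a worked illustration of what the coefficients encode (the chiral-projector assignment is the standard domain-wall convention, assumed here rather than shown in this compare): M5D applies, cyclically in s, chi_s = diag[s]*psi_s + upper[s]*P_- psi_{s+1} + lower[s]*P_+ psi_{s-1}, so the quark mass enters only through the two wrap-around entries.

  std::vector<Coeff_t> diag (Ls, 1.0);
  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = mass_minus;  // s = Ls-1 wraps to s = 0
  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = mass_plus;   // s = 0 wraps to s = Ls-1
  M5D(psi, chi, chi, lower, diag, upper);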
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din) void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag = bs; Vector<Coeff_t> diag = bs;
std::vector<Coeff_t> upper= cs; Vector<Coeff_t> upper= cs;
std::vector<Coeff_t> lower= cs; Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass_minus*upper[Ls-1]; upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0]; lower[0] =-mass_plus*lower[0];
M5D(psi,psi,Din,lower,diag,upper); M5D(psi,psi,Din,lower,diag,upper);
@ -176,9 +176,9 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi) template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag = beo; Vector<Coeff_t> diag = beo;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for(int i=0;i<Ls;i++) { for(int i=0;i<Ls;i++) {
upper[i]=-ceo[i]; upper[i]=-ceo[i];
lower[i]=-ceo[i]; lower[i]=-ceo[i];
@ -191,9 +191,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag = bee; Vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for(int i=0;i<Ls;i++) { for(int i=0;i<Ls;i++) {
upper[i]=-cee[i]; upper[i]=-cee[i];
lower[i]=-cee[i]; lower[i]=-cee[i];
@ -206,9 +206,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag = bee; Vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for (int s=0;s<Ls;s++){ for (int s=0;s<Ls;s++){
// Assemble the 5d matrix // Assemble the 5d matrix
@ -236,9 +236,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag(Ls,1.0); Vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); Vector<Coeff_t> upper(Ls,-1.0);
std::vector<Coeff_t> lower(Ls,-1.0); Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass_plus*upper[Ls-1]; upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0]; lower[0] =-mass_minus*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper); M5Ddag(psi,chi,chi,lower,diag,upper);
@ -248,9 +248,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din) void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
{ {
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag =bs; Vector<Coeff_t> diag =bs;
std::vector<Coeff_t> upper=cs; Vector<Coeff_t> upper=cs;
std::vector<Coeff_t> lower=cs; Vector<Coeff_t> lower=cs;
for (int s=0;s<Ls;s++){ for (int s=0;s<Ls;s++){
if ( s== 0 ) { if ( s== 0 ) {
@ -394,7 +394,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
{ {
std::vector<Coeff_t> gamma(this->Ls); Vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s]; for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(1.0,gamma,b,c); SetCoefficientsInternal(1.0,gamma,b,c);
} }
@ -402,13 +402,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
{ {
std::vector<Coeff_t> gamma(this->Ls); Vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s]; for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(zolo_hi,gamma,b,c); SetCoefficientsInternal(zolo_hi,gamma,b,c);
} }
//Zolo //Zolo
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c) void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
{ {
int Ls=this->Ls; int Ls=this->Ls;

View File

@ -43,9 +43,9 @@ void
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
const FermionField &phi_i, const FermionField &phi_i,
FermionField &chi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, Vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, Vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper) Vector<Coeff_t> &upper)
{ {
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
@ -55,16 +55,12 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
autoView(chi , chi_i,AcceleratorWrite); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
int Ls =this->Ls; int Ls =this->Ls;
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
// 10 = 3 complex mult + 2 complex add // 10 = 3 complex mult + 2 complex add
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
uint64_t nloop = grid->oSites(); uint64_t nloop = grid->oSites();
@ -86,9 +82,9 @@ void
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
const FermionField &phi_i, const FermionField &phi_i,
FermionField &chi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, Vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, Vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper) Vector<Coeff_t> &upper)
{ {
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
@ -97,16 +93,12 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
autoView(chi , chi_i,AcceleratorWrite); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
int Ls=this->Ls; int Ls=this->Ls;
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
uint64_t nloop = grid->oSites(); uint64_t nloop = grid->oSites();
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -134,17 +126,11 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
int Ls=this->Ls; int Ls=this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = & leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = & ueem[0];
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -196,17 +182,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
autoView(psi , psi_i,AcceleratorRead); autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite); autoView(chi , chi_i,AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = & leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = & ueem[0];
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
assert(psi.Checkerboard() == psi.Checkerboard()); assert(psi.Checkerboard() == psi.Checkerboard());
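These CayleyFermion5D hunks contrast the two ways the Ls-length coefficient arrays reach the accelerator kernels. A minimal sketch of both patterns with names taken from the hunks above (the static keeps the device buffer alive across calls; that the plain-pointer variant relies on unified-memory backing is an assumption, not something this compare shows):

  // (a) explicit staging: host std::vector copied into device storage before the kernel launch
  static deviceVector<Coeff_t> d_diag(Ls);
  acceleratorCopyToDevice(&diag[0], &d_diag[0], Ls*sizeof(Coeff_t));
  auto pdiag = &d_diag[0];        // raw device pointer captured by accelerator_for
  // (b) direct pointer: valid on the device only if diag uses unified-memory storage
  auto pdiag_direct = &diag[0];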

View File

@ -1,5 +1,3 @@
#if 0
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -820,5 +818,3 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif

View File

@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
// Pplus backwards.. // Pplus backwards..
template<class Impl> template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls; int Ls = this->Ls;
@ -50,15 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
autoView( psi , psi_i, AcceleratorRead); autoView( psi , psi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pupper = &upper[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto plower = &lower[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
auto nloop=grid->oSites()/Ls; auto nloop=grid->oSites()/Ls;
@ -79,7 +73,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
template<class Impl> template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
@ -89,14 +83,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
autoView( phi , phi_i, AcceleratorRead); autoView( phi , phi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pupper = &upper[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto plower = &lower[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -125,17 +114,12 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
autoView( chi, chi_i, AcceleratorWrite); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->lee[0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->dee[0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->uee[0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0]; auto pleem = & this->leem[0];
auto pdee = & d_dee [0]; auto pueem = & this->ueem[0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
uint64_t nloop=grid->oSites()/Ls; uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{

View File

@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
else{ shiftm = -shift*(mq3-mq2); } else{ shiftm = -shift*(mq3-mq2); }
} }
std::vector<Coeff_t> diag(Ls,1.0); Vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp; Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
#if(0) #if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
else{ shiftm = -shift*(mq3-mq2); } else{ shiftm = -shift*(mq3-mq2); }
} }
std::vector<Coeff_t> diag(Ls,1.0); Vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm; Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
this->M5Ddag(psi, chi, chi, lower, diag, upper); this->M5Ddag(psi, chi, chi, lower, diag, upper);
} }
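In the DomainWallEOFA operator the shift term reuses the same tridiagonal structure: shiftp and shiftm, fixed by pm and the mq coefficients just above, are simply added to the two wrap-around mass entries before the generic five-dimensional kernel is called. Condensed from the M5Ddag hunk above:

  std::vector<Coeff_t> diag (Ls, 1.0);
  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = mq1 + shiftp;
  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = mq1 + shiftm;
  this->M5Ddag(psi, chi, chi, lower, diag, upper);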
@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
{ {
int Ls = this->Ls; int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee; Vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){ for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s]; upper[s] = -this->cee[s];
@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
{ {
int Ls = this->Ls; int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee; Vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){ for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s]; upper[s] = -this->cee[s];
@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
//Zolo //Zolo
template<class Impl> template<class Impl>
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c) void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
{ {
int Ls = this->Ls; int Ls = this->Ls;
int pm = this->pm; int pm = this->pm;

View File

@ -61,6 +61,8 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
UUUmu(&FourDimGrid), UUUmu(&FourDimGrid),
UUUmuEven(&FourDimRedBlackGrid), UUUmuEven(&FourDimRedBlackGrid),
UUUmuOdd(&FourDimRedBlackGrid), UUUmuOdd(&FourDimRedBlackGrid),
Lebesgue(&FourDimGrid),
LebesgueEvenOdd(&FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid) _tmp(&FiveDimRedBlackGrid)
{ {
@ -275,18 +277,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
/*CHANGE */ /*CHANGE */
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
DhopInternalSerialComms(st,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
@ -311,7 +313,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
{ {
int interior=1; int interior=1;
int exterior=0; int exterior=0;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
st.CommsMerge(compressor); st.CommsMerge(compressor);
@ -321,12 +323,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
{ {
int interior=0; int interior=0;
int exterior=1; int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
} }
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
@ -339,7 +341,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
{ {
int interior=1; int interior=1;
int exterior=1; int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
} }
/*CHANGE END*/ /*CHANGE END*/
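Both staggered Dhop paths end in the same kernel call and differ only in how the interior/exterior flags bracket the communication: the bulk of the lattice is updated while halo data is still in flight, and only the surface sites wait for the received buffers. Sketch of the split with flag values copied from the hunks above (argument list as on the side that carries the LebesgueOrder; the comms calls between the two launches are abridged):

  Kernels::DhopImproved(st, lo, U, UUU, in, out, dag, /*interior*/1, /*exterior*/0);  // bulk, overlaps comms
  st.CommsMerge(compressor);              // complete the halo exchange and merge received faces
  Kernels::DhopImproved(st, lo, U, UUU, in, out, dag, /*interior*/0, /*exterior*/1);  // surface sites only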
@ -355,7 +357,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
assert(in.Checkerboard()==Even); assert(in.Checkerboard()==Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag); DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
} }
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@ -366,7 +368,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
assert(in.Checkerboard()==Odd); assert(in.Checkerboard()==Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag); DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
} }
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@ -376,7 +378,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil,Umu,UUUmu,in,out,dag); DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
} }
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////

View File

@ -48,6 +48,8 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass), mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd(&Hgrid), UmuOdd(&Hgrid),
@ -337,7 +339,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Umu, UUUmu, in, out, dag); DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -349,7 +351,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
assert(in.Checkerboard() == Even); assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag); DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -361,7 +363,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
assert(in.Checkerboard() == Odd); assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag); DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -392,19 +394,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
DhopInternalSerialComms(st,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@ -427,7 +429,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
{ {
int interior=1; int interior=1;
int exterior=0; int exterior=0;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
@ -438,13 +440,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
{ {
int interior=0; int interior=0;
int exterior=1; int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@ -458,7 +460,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
{ {
int interior=1; int interior=1;
int exterior=1; int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
}; };

View File

@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
template<class Impl> template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper) Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
@ -50,13 +50,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &lower[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -78,8 +74,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
template<class Impl> template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper, Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs) Vector<Coeff_t> &shift_coeffs)
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
@ -93,15 +89,10 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &lower[0];
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); auto pshift_coeffs = &shift_coeffs[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
auto pshift_coeffs = &d_shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -128,7 +119,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
template<class Impl> template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper) Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
@ -139,13 +130,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &lower[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -167,8 +154,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
template<class Impl> template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper, Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs) Vector<Coeff_t> &shift_coeffs)
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
@ -180,15 +167,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &lower[0];
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); auto pshift_coeffs = &shift_coeffs[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
auto pshift_coeffs = &d_shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
auto pm = this->pm; auto pm = this->pm;
@ -230,17 +212,11 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem= & this->leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem= & this->ueem[0];
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
@ -292,24 +268,14 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
// Move into object and constructor
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto pm = this->pm; auto pm = this->pm;
auto plee = & d_lee [0]; auto plee = & this->lee [0];
auto pdee = & d_dee [0]; auto pdee = & this->dee [0];
auto puee = & d_uee [0]; auto puee = & this->uee [0];
auto pleem = & d_leem[0]; auto pleem= & this->leem[0];
auto pueem = & d_ueem[0]; auto pueem= & this->ueem[0];
auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0];
auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0];
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -367,17 +333,11 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem= & this->leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem= & this->ueem[0];
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -426,28 +386,14 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto pm = this->pm; auto pm = this->pm;
auto plee = & d_lee [0]; auto plee = & this->lee [0];
auto pdee = & d_dee [0]; auto pdee = & this->dee [0];
auto puee = & d_uee [0]; auto puee = & this->uee [0];
auto pleem = & d_leem[0]; auto pleem= & this->leem[0];
auto pueem = & d_ueem[0]; auto pueem= & this->ueem[0];
auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls); auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls);
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0];
auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0];
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{

View File

@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{ {
int Ls = this->Ls; int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0); Vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1; Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1; Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term // no shift term
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{ {
int Ls = this->Ls; int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0); Vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1; Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1; Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term // no shift term
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
int Ls = this->Ls; int Ls = this->Ls;
// coefficients of Mooee // coefficients of Mooee
std::vector<Coeff_t> diag = this->bee; Vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){ for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s]; upper[s] = -this->cee[s];
lower[s] = -this->cee[s]; lower[s] = -this->cee[s];
@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
int Ls = this->Ls; int Ls = this->Ls;
// coefficients of MooeeDag // coefficients of MooeeDag
std::vector<Coeff_t> diag = this->bee; Vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls); Vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls); Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){ for(int s=0; s<Ls; s++){
if(s==0) { if(s==0) {
upper[s] = -this->cee[s+1]; upper[s] = -this->cee[s+1];
@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
// Tridiagonal solve for MooeeInvDag_shift_lc // Tridiagonal solve for MooeeInvDag_shift_lc
{ {
Coeff_t m(0.0); Coeff_t m(0.0);
std::vector<Coeff_t> d = Mooee_shift; Vector<Coeff_t> d = Mooee_shift;
std::vector<Coeff_t> u(Ls,0.0); Vector<Coeff_t> u(Ls,0.0);
std::vector<Coeff_t> y(Ls,0.0); Vector<Coeff_t> y(Ls,0.0);
std::vector<Coeff_t> q(Ls,0.0); Vector<Coeff_t> q(Ls,0.0);
if(pm == 1){ u[0] = 1.0; } if(pm == 1){ u[0] = 1.0; }
else{ u[Ls-1] = 1.0; } else{ u[Ls-1] = 1.0; }
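The block above appears to prepare the MooeeInvDag shift-correction coefficients by solving a tridiagonal system whose inputs include Mooee_shift (d) and a unit vector (u) placed at whichever end pm selects. That setup has the familiar shape of a rank-one-corrected solve; as a generic reminder of the identity such a setup would feed (an assumption about intent, not something this compare spells out):

  // Generic Sherman-Morrison combination, illustration only, not the Grid routine:
  // solve A y = b and A q = u once, then
  //   x = y - q * dot(v, y) / (1.0 + dot(v, q));   // x solves (A + u v^T) x = b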

View File

@ -48,6 +48,8 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass), mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd(&Hgrid), UmuOdd(&Hgrid),
@ -266,7 +268,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Umu, in, out, dag); DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -278,7 +280,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
assert(in.Checkerboard() == Even); assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven, UmuOdd, in, out, dag); DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -290,7 +292,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
assert(in.Checkerboard() == Odd); assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd, UmuEven, in, out, dag); DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -321,18 +323,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
template <class Impl> template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
DhopInternalSerialComms(st,U,in,out,dag); DhopInternalSerialComms(st,lo,U,in,out,dag);
} }
template <class Impl> template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
@ -354,7 +356,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
{ {
int interior=1; int interior=1;
int exterior=0; int exterior=0;
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
} }
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
@ -365,12 +367,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
{ {
int interior=0; int interior=0;
int exterior=1; int exterior=1;
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
} }
} }
template <class Impl> template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
@ -383,7 +385,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
{ {
int interior=1; int interior=1;
int exterior=1; int exterior=1;
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
} }
}; };

View File

@ -375,6 +375,23 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
} }
} }
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI #undef LOAD_CHI
#undef HAND_DECLARATIONS #undef HAND_DECLARATIONS

View File

@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
}); });
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU, DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior) const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{ {
@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");
} }
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior) const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{ {

View File

@ -58,9 +58,15 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Umu(_FourDimGrid), Umu(_FourDimGrid),
UmuEven(_FourDimRedBlackGrid), UmuEven(_FourDimRedBlackGrid),
UmuOdd (_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid),
Dirichlet(0) Dirichlet(0)
{ {
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// some assertions // some assertions
assert(FiveDimGrid._ndimension==5); assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4); assert(FourDimGrid._ndimension==4);
@ -299,19 +305,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
DhopInternalSerialComms(st,U,in,out,dag); DhopInternalSerialComms(st,lo,U,in,out,dag);
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
@ -325,12 +331,10 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// Start comms // Gather intranode and extra node differentiated?? // Start comms // Gather intranode and extra node differentiated??
///////////////////////////// /////////////////////////////
{ {
// std::cout << " WilsonFermion5D gather " <<std::endl;
GRID_TRACE("Gather"); GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
} }
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
auto id=traceStart("Communicate overlapped"); auto id=traceStart("Communicate overlapped");
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
@ -339,7 +343,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
{ {
// std::cout << " WilsonFermion5D Comms merge " <<std::endl;
GRID_TRACE("MergeSHM"); GRID_TRACE("MergeSHM");
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
} }
@ -347,7 +350,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
// std::cout << " WilsonFermion5D Interior " <<std::endl;
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior"); GRID_TRACE("DhopDagInterior");
@ -360,7 +362,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
traceStop(id); traceStop(id);
@ -368,13 +369,11 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// do the compute exterior // do the compute exterior
///////////////////////////// /////////////////////////////
{ {
// std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
GRID_TRACE("Merge"); GRID_TRACE("Merge");
st.CommsMerge(compressor); st.CommsMerge(compressor);
} }
// std::cout << " WilsonFermion5D Exterior " <<std::endl;
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior"); GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
@ -382,12 +381,11 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
GRID_TRACE("DhopExterior"); GRID_TRACE("DhopExterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} }
// std::cout << " WilsonFermion5D Done " <<std::endl;
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, const FermionField &in,
FermionField &out,int dag) FermionField &out,int dag)
@ -397,13 +395,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
// std::cout << " WilsonFermion5D Halo exch " <<std::endl;
{ {
GRID_TRACE("HaloExchange"); GRID_TRACE("HaloExchange");
st.HaloExchangeOpt(in,compressor); st.HaloExchangeOpt(in,compressor);
} }
// std::cout << " WilsonFermion5D Dhop " <<std::endl;
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDag"); GRID_TRACE("DhopDag");
@ -412,7 +408,6 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
GRID_TRACE("Dhop"); GRID_TRACE("Dhop");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
} }
// std::cout << " WilsonFermion5D Done " <<std::endl;
} }
@ -425,7 +420,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
assert(in.Checkerboard()==Even); assert(in.Checkerboard()==Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag); DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@ -436,7 +431,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
assert(in.Checkerboard()==Odd); assert(in.Checkerboard()==Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag); DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@ -446,7 +441,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil,Umu,in,out,dag); DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
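
The DhopInternalOverlappedComms path changed above follows a standard overlap pattern: gather the halo, start non-blocking communication, compute the interior sites while messages are in flight, then complete the communication, merge the received faces, and compute the exterior sites. A minimal stand-alone sketch of that overlap on a toy 1-D array, using std::async in place of Grid's stencil and MPI machinery (all names below are illustrative, not Grid APIs):

// Minimal sketch: overlap a "halo exchange" with interior compute on a 1-D array.
// halo_exchange() stands in for CommunicateBegin/Complete; the interior loop for
// the interior Dhop kernel, the final two sites for the exterior kernel.
#include <vector>
#include <future>
#include <functional>
#include <cstdio>

static void halo_exchange(std::vector<double> &v) {
  // Pretend the two end sites come from neighbouring ranks (periodic wrap).
  v.front() = v[v.size()-2];
  v.back()  = v[1];
}

int main() {
  const int N = 1 << 20;
  std::vector<double> in(N+2, 1.0), out(N+2, 0.0);

  // Start "comms" asynchronously (CommunicateBegin).
  auto comms = std::async(std::launch::async, halo_exchange, std::ref(in));

  // Interior compute: sites that do not touch halo data (DhopInterior).
  for (int i = 2; i < N; ++i) out[i] = 0.5*(in[i-1] + in[i+1]);

  // Complete comms, then do the sites that needed the halo (DhopExterior).
  comms.wait();
  out[1] = 0.5*(in[0] + in[2]);
  out[N] = 0.5*(in[N-1] + in[N+1]);

  std::printf("out[1]=%g out[N]=%g\n", out[1], out[N]);
  return 0;
}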

View File

@ -52,12 +52,17 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
mass(_mass), mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd(&Hgrid), UmuOdd(&Hgrid),
_tmp(&Hgrid), _tmp(&Hgrid),
anisotropyCoeff(anis) anisotropyCoeff(anis)
{ {
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){ if (anisotropyCoeff.isAnisotropic){
@ -309,7 +314,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Umu, in, out, dag); DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -321,7 +326,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
assert(in.Checkerboard() == Even); assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven, UmuOdd, in, out, dag); DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -333,7 +338,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
assert(in.Checkerboard() == Odd); assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd, UmuEven, in, out, dag); DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
} }
template <class Impl> template <class Impl>
@ -386,21 +391,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
}; };
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
#endif #endif
DhopInternalSerial(st,U,in,out,dag); DhopInternalSerial(st,lo,U,in,out,dag);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
@ -469,7 +474,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)

View File

@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/// Switch off the 5d vectorised code optimisations /// Switch off the 5d vectorised code optimisations
#undef DWFVEC5D #undef DWFVEC5D
static std::vector<vComplexF> signsF; static Vector<vComplexF> signsF;
template<typename vtype> template<typename vtype>
int setupSigns(std::vector<vtype>& signs ){ int setupSigns(Vector<vtype>& signs ){
std::vector<vtype> bother(2); Vector<vtype> bother(2);
signs = bother; signs = bother;
vrsign(signs[0]); vrsign(signs[0]);
visign(signs[1]); visign(signs[1]);
@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
#include <simd/Intel512double.h> #include <simd/Intel512double.h>
static std::vector<vComplexD> signsD; static Vector<vComplexD> signsD;
static int signInitD = setupSigns(signsD); static int signInitD = setupSigns(signsD);
#define MAYBEPERM(A,perm) if (perm) { A ; } #define MAYBEPERM(A,perm) if (perm) { A ; }

View File

@ -434,7 +434,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#define ASM_CALL(A) \ #define ASM_CALL(A) \
thread_for( sss, Nsite, { \ thread_for( sss, Nsite, { \
int ss = sss; /*st.lo->Reorder(sss);*/ \ int ss = st.lo->Reorder(sss); \
int sU = ss; \ int sU = ss; \
int sF = ss*Ls; \ int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
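
The ASM_CALL change above toggles between a direct site index and st.lo->Reorder(sss), i.e. visiting sites through a reorder table instead of in lexicographic order. A small stand-alone illustration of the same indirection (plain C++, hypothetical names):

// Iterate sites through a reorder table: the loop index sss is mapped to a
// storage index ss, so the traversal order can change without touching the kernel body.
#include <vector>
#include <numeric>
#include <algorithm>
#include <cstdio>

int main() {
  const int nsite = 8;
  std::vector<int> reorder(nsite);
  std::iota(reorder.begin(), reorder.end(), 0);
  std::reverse(reorder.begin(), reorder.end());   // any permutation, e.g. a space-filling curve

  std::vector<double> field(nsite);
  for (int i = 0; i < nsite; ++i) field[i] = i;

  for (int sss = 0; sss < nsite; ++sss) {
    int ss = reorder[sss];                        // identity map if reordering is disabled
    std::printf("visit site %d (value %g)\n", ss, field[ss]);
  }
  return 0;
}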

View File

@ -40,7 +40,7 @@ public:
U = Zero(); U = Zero();
LatticeColourMatrix tmp(Uin.Grid()); LatticeColourMatrix tmp(Uin.Grid());
std::vector<typename SU<ncolour>::Matrix> ta(Dimension); Vector<typename SU<ncolour>::Matrix> ta(Dimension);
// Debug lines // Debug lines
// LatticeMatrix uno(Uin.Grid()); // LatticeMatrix uno(Uin.Grid());

View File

@ -43,7 +43,7 @@ public:
U = Zero(); U = Zero();
LatticeColourMatrix tmp(Uin.Grid()); LatticeColourMatrix tmp(Uin.Grid());
std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension); Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
for (int a = 0; a < Dimension; a++) for (int a = 0; a < Dimension; a++)
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]); GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);

View File

@ -32,7 +32,9 @@ private:
// Smear_Stout<Gimpl> *StoutSmearing; // Smear_Stout<Gimpl> *StoutSmearing;
// std::vector<GaugeField> SmearedSet; // std::vector<GaugeField> SmearedSet;
GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
std::vector<LatticeLorentzComplex> masks; std::vector<LatticeLorentzComplex> masks;
std::vector<int> cbs;
typedef typename SU3Adjoint::AMatrix AdjMatrix; typedef typename SU3Adjoint::AMatrix AdjMatrix;
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField; typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
@ -147,6 +149,25 @@ private:
} }
pokeLorentz(Fdet, Fdet_pol, nu); pokeLorentz(Fdet, Fdet_pol, nu);
} }
void Compute_MpInvJx_dNxxdSy(int cb,
const GaugeLinkField &PlaqL,
const GaugeLinkField &PlaqR,
AdjMatrixField MpInvJx,
AdjVectorField &Fdet2 )
{
GaugeLinkField PlaqLeo(UrbGrid);
GaugeLinkField PlaqReo(UrbGrid);
AdjMatrixField MpInvJxeo(UrbGrid);
AdjVectorField Fdet2eo(UrbGrid);
pickCheckerboard(cb,PlaqLeo,PlaqL);
pickCheckerboard(cb,PlaqReo,PlaqR);
pickCheckerboard(cb,MpInvJxeo,MpInvJx);
Fdet2eo.Checkerboard()=cb;
Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
setCheckerboard(Fdet2,Fdet2eo);
}
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
{ {
GaugeLinkField UtaU(PlaqL.Grid()); GaugeLinkField UtaU(PlaqL.Grid());
@ -278,6 +299,7 @@ public:
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Mask the gauge field // Mask the gauge field
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int cb = cbs[smr];
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
Umsk = U; Umsk = U;
@ -442,7 +464,7 @@ public:
AdjMatrixField MpInvJx_nu(grid); AdjMatrixField MpInvJx_nu(grid);
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
Fdet2_mu=FdetV; Fdet2_mu=FdetV;
Fdet1_mu=Zero(); Fdet1_mu=Zero();
@ -499,7 +521,7 @@ public:
time=-usecond(); time=-usecond();
PlaqR=(-1.0)*PlaqR; PlaqR=(-1.0)*PlaqR;
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
Fdet2_nu = FdetV; Fdet2_nu = FdetV;
time+=usecond(); time+=usecond();
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
@ -520,7 +542,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,mu,-1); MpInvJx_nu = Cshift(MpInvJx,mu,-1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV; Fdet2_nu = Fdet2_nu+FdetV;
///////////////// -ve nu ///////////////// ///////////////// -ve nu /////////////////
@ -539,7 +561,7 @@ public:
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,nu,1); MpInvJx_nu = Cshift(MpInvJx,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV; Fdet2_nu = Fdet2_nu+FdetV;
// x== // x==
@ -560,7 +582,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,mu,-1); MpInvJx_nu = Cshift(MpInvJx,mu,-1);
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV; Fdet2_nu = Fdet2_nu+FdetV;
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@ -589,7 +611,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,nu,-1); MpInvJx_nu = Cshift(MpInvJx,nu,-1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_mu = Fdet2_mu+FdetV; Fdet2_mu = Fdet2_mu+FdetV;
// __ // __
@ -609,7 +631,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,nu,1); MpInvJx_nu = Cshift(MpInvJx,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_mu = Fdet2_mu+FdetV; Fdet2_mu = Fdet2_mu+FdetV;
} }
@ -931,6 +953,10 @@ private:
public: public:
/* Standard constructor */ /* Standard constructor */
virtual ~SmearedConfigurationMasked()
{
delete UrbGrid;
}
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
{ {
@ -939,7 +965,6 @@ public:
// was resized in base class // was resized in base class
assert(this->SmearedSet.size()==Nsmear); assert(this->SmearedSet.size()==Nsmear);
GridRedBlackCartesian * UrbGrid;
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
LatticeComplex tmp(_UGrid); LatticeComplex tmp(_UGrid);
@ -947,11 +972,12 @@ public:
for (unsigned int i = 0; i < this->smearingLevels; ++i) { for (unsigned int i = 0; i < this->smearingLevels; ++i) {
masks.push_back(*(new LatticeLorentzComplex(_UGrid))); masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
int mu= (i/2) %Nd; int mu= (i/2) %Nd;
int cb= (i%2); int cb= (i%2);
LatticeComplex tmpcb(UrbGrid); LatticeComplex tmpcb(UrbGrid);
cbs.push_back(cb);
masks[i]=Zero(); masks[i]=Zero();
//////////////////// ////////////////////
// Setup the mask // Setup the mask
@ -962,7 +988,6 @@ public:
PokeIndex<LorentzIndex>(masks[i],tmp, mu); PokeIndex<LorentzIndex>(masks[i],tmp, mu);
} }
delete UrbGrid;
} }
virtual void smeared_force(GaugeField &SigmaTilde) virtual void smeared_force(GaugeField &SigmaTilde)
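
The checkerboarded overload of Compute_MpInvJx_dNxxdSy added above restricts the work to the parity the mask actually touches: pick the even/odd sites into half-grid fields, compute there, then scatter the result back. A stand-alone sketch of the same pick/compute/set idea on a 1-D array (plain C++, not Grid's pickCheckerboard/setCheckerboard):

// Red-black (even/odd) decomposition: extract one parity, operate on the
// half-size vector, then scatter the results back into the full array.
#include <vector>
#include <cstdio>

int main() {
  const int N = 16;
  std::vector<double> full(N);
  for (int i = 0; i < N; ++i) full[i] = i;

  const int cb = 1;                         // which parity to work on (0=even, 1=odd)
  std::vector<double> half;                 // "pickCheckerboard"
  for (int i = 0; i < N; ++i)
    if ((i & 1) == cb) half.push_back(full[i]);

  for (auto &x : half) x *= 10.0;           // work only on the chosen parity

  int j = 0;                                // "setCheckerboard"
  for (int i = 0; i < N; ++i)
    if ((i & 1) == cb) full[i] = half[j++];

  for (int i = 0; i < N; ++i) std::printf("%g ", full[i]);
  std::printf("\n");
  return 0;
}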

View File

@ -158,12 +158,12 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
int MFrvol = rd*Lblock*Rblock*Nmom; int MFrvol = rd*Lblock*Rblock*Nmom;
int MFlvol = ld*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom;
std::vector<SpinMatrix_v > lvSum(MFrvol); Vector<SpinMatrix_v > lvSum(MFrvol);
thread_for( r, MFrvol,{ thread_for( r, MFrvol,{
lvSum[r] = Zero(); lvSum[r] = Zero();
}); });
std::vector<SpinMatrix_s > lsSum(MFlvol); Vector<SpinMatrix_s > lsSum(MFlvol);
thread_for(r,MFlvol,{ thread_for(r,MFlvol,{
lsSum[r]=scalar_type(0.0); lsSum[r]=scalar_type(0.0);
}); });
@ -346,12 +346,12 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
int MFrvol = rd*Lblock*Rblock; int MFrvol = rd*Lblock*Rblock;
int MFlvol = ld*Lblock*Rblock; int MFlvol = ld*Lblock*Rblock;
std::vector<vector_type > lvSum(MFrvol); Vector<vector_type > lvSum(MFrvol);
thread_for(r,MFrvol,{ thread_for(r,MFrvol,{
lvSum[r] = Zero(); lvSum[r] = Zero();
}); });
std::vector<scalar_type > lsSum(MFlvol); Vector<scalar_type > lsSum(MFlvol);
thread_for(r,MFlvol,{ thread_for(r,MFlvol,{
lsSum[r]=scalar_type(0.0); lsSum[r]=scalar_type(0.0);
}); });
@ -493,12 +493,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
int MFrvol = rd*Lblock*Rblock*Nmom; int MFrvol = rd*Lblock*Rblock*Nmom;
int MFlvol = ld*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom;
std::vector<vector_type > lvSum(MFrvol); Vector<vector_type > lvSum(MFrvol);
thread_for(r,MFrvol,{ thread_for(r,MFrvol,{
lvSum[r] = Zero(); lvSum[r] = Zero();
}); });
std::vector<scalar_type > lsSum(MFlvol); Vector<scalar_type > lsSum(MFlvol);
thread_for(r,MFlvol,{ thread_for(r,MFlvol,{
lsSum[r]=scalar_type(0.0); lsSum[r]=scalar_type(0.0);
}); });
@ -700,13 +700,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
int MFrvol = rd*Lblock*Rblock*Nem; int MFrvol = rd*Lblock*Rblock*Nem;
int MFlvol = ld*Lblock*Rblock*Nem; int MFlvol = ld*Lblock*Rblock*Nem;
std::vector<vector_type> lvSum(MFrvol); Vector<vector_type> lvSum(MFrvol);
thread_for(r,MFrvol, thread_for(r,MFrvol,
{ {
lvSum[r] = Zero(); lvSum[r] = Zero();
}); });
std::vector<scalar_type> lsSum(MFlvol); Vector<scalar_type> lsSum(MFlvol);
thread_for(r,MFlvol, thread_for(r,MFlvol,
{ {
lsSum[r] = scalar_type(0.0); lsSum[r] = scalar_type(0.0);
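
The std::vector / Vector swaps in A2Autils above change the allocator rather than the interface: as far as I can tell, Grid's Vector is a std::vector with an alignment- and device-aware allocator so the accumulation buffers are safe to touch from vectorised or offloaded loops. A minimal aligned-allocator sketch of the general idea (illustrative only, not Grid's alignedAllocator):

// A std::vector whose storage is 64-byte aligned, so SIMD-width objects placed
// in it start on a cache-line boundary.
#include <vector>
#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <new>

template<class T>
struct Aligned64 {
  using value_type = T;
  Aligned64() = default;
  template<class U> Aligned64(const Aligned64<U>&) {}
  T* allocate(std::size_t n) {
    std::size_t bytes = ((n*sizeof(T) + 63)/64)*64;      // aligned_alloc needs a multiple of 64
    void *p = std::aligned_alloc(64, bytes);
    if (!p) throw std::bad_alloc();
    return static_cast<T*>(p);
  }
  void deallocate(T* p, std::size_t) { std::free(p); }
};
template<class T, class U> bool operator==(const Aligned64<T>&, const Aligned64<U>&) { return true; }
template<class T, class U> bool operator!=(const Aligned64<T>&, const Aligned64<U>&) { return false; }

template<class T> using AlignedVector = std::vector<T, Aligned64<T>>;

int main() {
  AlignedVector<double> v(1024, 0.0);
  std::printf("data %% 64 = %d\n", (int)(reinterpret_cast<std::uintptr_t>(v.data()) % 64));
  return 0;
}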

View File

@ -971,9 +971,7 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
autoView( vq_ti , q_ti , AcceleratorRead); autoView( vq_ti , q_ti , AcceleratorRead);
autoView( vq_tf , q_tf , AcceleratorRead); autoView( vq_tf , q_tf , AcceleratorRead);
deviceVector<mobj> my_Dq_spec(2); Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
acceleratorPut(my_Dq_spec[0],Dq_spec1);
acceleratorPut(my_Dq_spec[1],Dq_spec2);
mobj * Dq_spec_p = &my_Dq_spec[0]; mobj * Dq_spec_p = &my_Dq_spec[0];
if (group == 1) { if (group == 1) {
@ -1302,8 +1300,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vd_tf , qd_tf , AcceleratorRead);
autoView( vs_ti , qs_ti , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead);
deviceVector<mobj> my_Dq_spec(1); Vector<mobj> my_Dq_spec{Du_spec};
acceleratorPut(my_Dq_spec[0],Du_spec);
mobj * Dq_spec_p = &my_Dq_spec[0]; mobj * Dq_spec_p = &my_Dq_spec[0];
if(op == "Q1"){ if(op == "Q1"){
@ -1356,8 +1353,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
autoView( vd_tf , qd_tf , AcceleratorRead ); autoView( vd_tf , qd_tf , AcceleratorRead );
autoView( vs_ti , qs_ti , AcceleratorRead ); autoView( vs_ti , qs_ti , AcceleratorRead );
deviceVector<mobj> my_Dq_spec(1); Vector<mobj> my_Dq_spec{Du_spec};
acceleratorPut(my_Dq_spec[0],Du_spec);
mobj * Dq_spec_p = &my_Dq_spec[0]; mobj * Dq_spec_p = &my_Dq_spec[0];
if(op == "Q1"){ if(op == "Q1"){
@ -1548,9 +1544,7 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vd_tf , qd_tf , AcceleratorRead);
autoView( vs_ti , qs_ti , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead);
deviceVector<mobj> my_Dq_spec(2); Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec};
acceleratorPut(my_Dq_spec[0],Dd_spec);
acceleratorPut(my_Dq_spec[0],Ds_spec);
mobj * Dq_spec_p = &my_Dq_spec[0]; mobj * Dq_spec_p = &my_Dq_spec[0];
if(op == "Q1"){ if(op == "Q1"){

View File

@ -418,33 +418,33 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in,
int hNNm1= NNm1/2; int hNNm1= NNm1/2;
RealD sqrt_2 = sqrt(2.0); RealD sqrt_2 = sqrt(2.0);
Complex ci(0.0,1.0); Complex ci(0.0,1.0);
const int nsimd= Matrix::Nsimd();
accelerator_for(ss,grid->oSites(),nsimd,{
for(int su2Index=0;su2Index<hNNm1;su2Index++){ for(int su2Index=0;su2Index<hNNm1;su2Index++){
int i1, i2; int i1, i2;
su2SubGroupIndex(i1, i2, su2Index); su2SubGroupIndex(i1, i2, su2Index);
int ax = su2Index*2; int ax = su2Index*2;
int ay = su2Index*2+1; int ay = su2Index*2+1;
accelerator_for(ss,grid->oSites(),1,{
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. // in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
// trace( Ta x Ci in) // trace( Ta x Ci in)
// Bet I need to move to real part with mult by -i // Bet I need to move to real part with mult by -i
out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2))));
out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1))));
});
} }
for(int diagIndex=0;diagIndex<N-1;diagIndex++){ for(int diagIndex=0;diagIndex<N-1;diagIndex++){
int k = diagIndex + 1; // diagIndex starts from 0 int k = diagIndex + 1; // diagIndex starts from 0
int a = NNm1+diagIndex; int a = NNm1+diagIndex;
RealD scale = 1.0/sqrt(2.0*k*(k+1)); RealD scale = 1.0/sqrt(2.0*k*(k+1));
accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ auto tmp = in_v(ss)()()(0,0);
auto tmp = in_v[ss]()()(0,0);
for(int i=1;i<k;i++){ for(int i=1;i<k;i++){
tmp=tmp+in_v[ss]()()(i,i); tmp=tmp+in_v(ss)()()(i,i);
}
tmp = tmp - in_v(ss)()()(k,k)*k;
coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale);
} }
tmp = tmp - in_v[ss]()()(k,k)*k;
out_v[ss]()()(a,b) =imag(tmp) * scale;
}); });
} }
}
}; };
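
One side of the LieAlgebraProject change above runs a single accelerator_for over lattice sites with the su(2)-subgroup and diagonal-generator loops inside it (writing through coalescedWrite), rather than launching one parallel loop per generator. The same restructuring in plain OpenMP C++, with toy data and illustrative names:

// Restructure "one parallel loop per generator" into "one parallel loop over
// sites with all generators handled inside", reducing launch/fork overhead.
#include <vector>
#include <cstdio>

int main() {
  const int sites = 1 << 16;
  const int gens  = 8;                       // generators per site
  std::vector<double> in(sites*gens, 1.0), out(sites*gens, 0.0);

  // Before: a parallel region per generator (gens separate launches).
  for (int a = 0; a < gens; ++a) {
    #pragma omp parallel for
    for (int ss = 0; ss < sites; ++ss) out[ss*gens + a] = 2.0*in[ss*gens + a];
  }

  // After: one parallel region; the generator loop runs inside each site.
  #pragma omp parallel for
  for (int ss = 0; ss < sites; ++ss)
    for (int a = 0; a < gens; ++a) out[ss*gens + a] = 2.0*in[ss*gens + a];

  std::printf("out[0]=%g\n", out[0]);
  return 0;
}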

View File

@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Map a su2 subgroup number to the pair of rows that are non zero // Map a su2 subgroup number to the pair of rows that are non zero
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
int spare = su2_index; int spare = su2_index;

View File

@ -62,7 +62,7 @@ public:
// returns i(T_Adj)^index necessary for the projectors // returns i(T_Adj)^index necessary for the projectors
// see definitions above // see definitions above
iAdjTa = Zero(); iAdjTa = Zero();
iSUnMatrix<cplx> ta[ncolour * ncolour - 1]; Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
iSUnMatrix<cplx> tmp; iSUnMatrix<cplx> tmp;
// FIXME not very efficient to get all the generators every time // FIXME not very efficient to get all the generators every time

View File

@ -72,7 +72,7 @@ public:
} }
// Resident in managed memory // Resident in managed memory
deviceVector<GeneralStencilEntry> _entries; Vector<GeneralStencilEntry> _entries;
GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts) GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
{ {
@ -141,7 +141,7 @@ public:
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Store in look up table // Store in look up table
//////////////////////////////////////////////// ////////////////////////////////////////////////
acceleratorPut(this->_entries[lex],SE); this->_entries[lex] = SE;
} }
}); });
} }

View File

@ -1,4 +1,3 @@
#if 0
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -242,4 +241,3 @@ void LebesgueOrder::ZGraph(void)
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif

View File

@ -72,7 +72,7 @@ public:
void ThreadInterleave(void); void ThreadInterleave(void);
private: private:
deviceVector<IndexInteger> _LebesgueReorder; Vector<IndexInteger> _LebesgueReorder;
}; };
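
LebesgueOrder holds the reorder table behind the --lebesgue option documented further down ("Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"): sites are visited in an order obtained by interleaving the bits of their coordinates, which keeps spatially neighbouring sites close together in the traversal. A 2-D Morton encoder as a stand-alone illustration (not the Grid implementation):

// 2-D Morton (Z-order) index: interleave the bits of x and y.
// Visiting sites in increasing Morton index walks a Z-shaped, cache-friendly curve.
#include <cstdint>
#include <cstdio>

static uint32_t spread_bits(uint32_t v) {   // put the low 16 bits of v into the even bit positions
  v &= 0x0000ffff;
  v = (v | (v << 8)) & 0x00ff00ff;
  v = (v | (v << 4)) & 0x0f0f0f0f;
  v = (v | (v << 2)) & 0x33333333;
  v = (v | (v << 1)) & 0x55555555;
  return v;
}
static uint32_t morton2d(uint32_t x, uint32_t y) {
  return spread_bits(x) | (spread_bits(y) << 1);
}

int main() {
  for (uint32_t y = 0; y < 4; ++y)
    for (uint32_t x = 0; x < 4; ++x)
      std::printf("(%u,%u) -> %u\n", x, y, morton2d(x, y));
  return 0;
}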

View File

@ -19,7 +19,7 @@ public:
static int PartialCompressionFactor(GridBase *grid) {return 1;}; static int PartialCompressionFactor(GridBase *grid) {return 1;};
// Decompress is after merge so ok // Decompress is after merge so ok
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table, static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs, const Lattice<vobj> &rhs,
cobj *buffer, cobj *buffer,
compressor &compress, compressor &compress,
@ -35,7 +35,7 @@ public:
rhs_v.ViewClose(); rhs_v.ViewClose();
} }
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask, std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial) compressor &compress,int type,int partial)
{ {
@ -83,6 +83,25 @@ public:
// Wilson compressor will add alternate policies for Dirichlet // Wilson compressor will add alternate policies for Dirichlet
// and possibly partial Dirichlet for DWF // and possibly partial Dirichlet for DWF
//////////////////////////////////// ////////////////////////////////////
/*
class FaceGatherDirichlet
{
// If it's dirichlet we don't assemble comms buffers
//
// Rely on zeroes in gauge field to drive the correct result
// NAN propagation: field will locally wrap, so fermion should NOT contain NAN and just permute
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){};
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type) {}
template<class decompressor,class Merger>
static void Merge(decompressor decompress,Merge &mm) { }
template<class decompressor,class Decompression>
static void Decompress(decompressor decompress,Decompression &dd) {}
};
*/
template<class vobj,class FaceGather> template<class vobj,class FaceGather>
class SimpleCompressorGather : public FaceGather { class SimpleCompressorGather : public FaceGather {

View File

@ -31,6 +31,7 @@
#define STENCIL_MAX (16) #define STENCIL_MAX (16)
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate #include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
#include <Grid/stencil/GeneralLocalStencil.h> #include <Grid/stencil/GeneralLocalStencil.h>
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
@ -255,6 +256,7 @@ protected:
GridBase * _grid; GridBase * _grid;
public: public:
GridBase *Grid(void) const { return _grid; } GridBase *Grid(void) const { return _grid; }
LebesgueOrder *lo;
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory // Needed to conveniently communicate gparity parameters into GPU memory
@ -271,11 +273,11 @@ public:
int face_table_computed; int face_table_computed;
int partialDirichlet; int partialDirichlet;
int fullDirichlet; int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ; std::vector<commVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list; Vector<int> surface_list;
std::vector<StencilEntry> _entries; // Resident in host memory stencilVector<StencilEntry> _entries; // Resident in managed memory
deviceVector<StencilEntry> _entries_device; // Resident in device memory commVector<StencilEntry> _entries_device; // Resident in device memory
std::vector<Packet> Packets; std::vector<Packet> Packets;
std::vector<Merge> Mergers; std::vector<Merge> Mergers;
std::vector<Merge> MergersSHM; std::vector<Merge> MergersSHM;
@ -365,9 +367,10 @@ public:
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
// All GPU kernel tasks must complete // All GPU kernel tasks must complete
accelerator_barrier(); // All kernels should ALREADY be complete // accelerator_barrier(); // All kernels should ALREADY be complete
_grid->StencilBarrier(); // Everyone is here, so no one running slow and still using receive buffer // _grid->StencilBarrier(); // Everyone is here, so no one running slow and still using receive buffer
// But the HaloGather had a barrier too. // But the HaloGather had a barrier too.
#ifdef ACCELERATOR_AWARE_MPI
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf, Packets[i].send_buf,
@ -376,6 +379,23 @@ public:
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i); Packets[i].xbytes,Packets[i].rbytes,i);
} }
#else
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
if ( Packets[i].do_send ) {
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
}
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].host_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].host_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#endif
// Get comms started then run checksums // Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!) // Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
@ -390,9 +410,18 @@ public:
if ( this->partialDirichlet ) DslashLogPartial(); if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet(); else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull(); else DslashLogFull();
acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
accelerator_barrier(); // accelerator_barrier();
_grid->StencilBarrier(); _grid->StencilBarrier();
#ifndef ACCELERATOR_AWARE_MPI
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) {
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
}
}
_grid->HostBufferFreeAll();
#endif
// run any checksums // run any checksums
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
@ -473,7 +502,7 @@ public:
template<class compressor> template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress) void HaloGather(const Lattice<vobj> &source,compressor &compress)
{ {
accelerator_barrier(); // accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid); assert(source.Grid()==_grid);
@ -487,7 +516,6 @@ public:
HaloGatherDir(source,compress,point,face_idx); HaloGatherDir(source,compress,point,face_idx);
} }
accelerator_barrier(); // All my local gathers are complete accelerator_barrier(); // All my local gathers are complete
_grid->StencilBarrier();// Synch shared memory on a single nodes
face_table_computed=1; face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
} }
@ -640,7 +668,7 @@ public:
for(int point=0;point<this->_npoints;point++){ for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point); this->same_node[point] = this->SameNode(point);
} }
int32_t surface_list_size=0;
for(int site = 0 ;site< vol4;site++){ for(int site = 0 ;site< vol4;site++){
int local = 1; int local = 1;
for(int point=0;point<this->_npoints;point++){ for(int point=0;point<this->_npoints;point++){
@ -650,30 +678,11 @@ public:
} }
if(local == 0) { if(local == 0) {
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
surface_list_size++; surface_list.push_back(site*Ls+s);
} }
} }
} }
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl; //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
for(int s=0;s<Ls;s++){
int idx=site*Ls+s;
surface_list_host[ss]= idx;
ss++;
}
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
} }
/// Introduce a block structure and switch off comms on boundaries /// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block) void DirichletBlock(const Coordinate &dirichlet_block)
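
The #ifndef ACCELERATOR_AWARE_MPI branches added to CommunicateBegin/CommunicateComplete above stage each message through host memory: copy the device send buffer into a host slab, post MPI on host pointers, and after completion copy the received host slab back onto the device. A compilable MPI sketch of the same staging pattern, with std::memcpy standing in for acceleratorCopyFromDevice/acceleratorCopyToDevice and purely illustrative buffer names:

// Host-staged halo exchange: device buffers are mirrored into host buffers
// around the MPI calls, so the MPI library never sees device pointers.
#include <mpi.h>
#include <vector>
#include <cstring>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int n = 1024;
  std::vector<double> dev_send(n, rank), dev_recv(n, -1.0);   // stand-ins for device buffers
  std::vector<double> host_send(n), host_recv(n);             // host slab buffers

  int to   = (rank + 1) % size;
  int from = (rank - 1 + size) % size;

  // "acceleratorCopyFromDevice(send_buf, host_send_buf, bytes)"
  std::memcpy(host_send.data(), dev_send.data(), n*sizeof(double));

  MPI_Request reqs[2];
  MPI_Irecv(host_recv.data(), n, MPI_DOUBLE, from, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(host_send.data(), n, MPI_DOUBLE, to,   0, MPI_COMM_WORLD, &reqs[1]);
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

  // "acceleratorCopyToDevice(host_recv_buf, recv_buf, bytes)"
  std::memcpy(dev_recv.data(), host_recv.data(), n*sizeof(double));

  std::printf("rank %d received from %d: %g\n", rank, from, dev_recv[0]);
  MPI_Finalize();
  return 0;
}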

View File

@ -207,10 +207,10 @@ cl::sycl::queue *theCopyAccelerator;
void acceleratorInit(void) void acceleratorInit(void)
{ {
int nDevices = 1; int nDevices = 1;
// cl::sycl::gpu_selector selector; cl::sycl::gpu_selector selector;
// cl::sycl::device selectedDevice { selector }; cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); theGridAccelerator = new sycl::queue (selectedDevice);
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); theCopyAccelerator = new sycl::queue (selectedDevice);
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway. // theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
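
The acceleratorInit change above switches between the SYCL 2020 selector tag sycl::gpu_selector_v and the older cl::sycl::gpu_selector class when constructing the compute and copy queues. For reference, a minimal SYCL 2020 queue construction plus a trivial kernel submission (assumes a SYCL compiler such as icpx -fsycl; not Grid code):

// Create a queue on a GPU device with the SYCL 2020 selector tag and run a kernel.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q{sycl::gpu_selector_v};            // or sycl::default_selector_v on CPU-only systems
  std::printf("device: %s\n",
              q.get_device().get_info<sycl::info::device::name>().c_str());

  const size_t N = 1024;
  double *x = sycl::malloc_shared<double>(N, q);  // USM shared allocation
  q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
    size_t idx = i[0];
    x[idx] = 2.0 * idx;
  }).wait();
  std::printf("x[10]=%g\n", x[10]);
  sycl::free(x, q);
  return 0;
}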

View File

@ -464,12 +464,16 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl; std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl; std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl; std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl; std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl; std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -497,8 +501,28 @@ void Grid_init(int *argc,char ***argv)
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute; WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute; StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
}
CartesianCommunicator::nCommThreads = 1; CartesianCommunicator::nCommThreads = 1;
#ifdef GRID_COMMS_THREADS
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
assert(CartesianCommunicator::nCommThreads > 0);
}
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
GridLogTimestamp(0); GridLogTimestamp(0);
} else { } else {
@ -549,31 +573,8 @@ void GridLogLayout() {
void * Grid_backtrace_buffer[_NBACKTRACE]; void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// x86 64bit
#ifdef __linux__
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
return;
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code); fprintf(stderr," code %d\n",si->si_code);
@ -584,7 +585,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
ucontext_t * uc= (ucontext_t *)ptr; ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A); #define REG(A) printf(" %s %lx\n",#A,sc-> A);
REG(rdi); REG(rdi);
REG(rsi); REG(rsi);
REG(rbp); REG(rbp);
@ -617,8 +618,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
void Grid_exit_handler(void) void Grid_exit_handler(void)
{ {
// BACKTRACEFP(stdout); BACKTRACEFP(stdout);
// fflush(stdout); fflush(stdout);
} }
void Grid_debug_handler_init(void) void Grid_debug_handler_init(void)
{ {
@ -626,10 +627,10 @@ void Grid_debug_handler_init(void)
sigemptyset (&sa.sa_mask); sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler; sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO; sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL); sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL); sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL); sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL); sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
@ -637,14 +638,7 @@ void Grid_debug_handler_init(void)
sigaction(SIGKILL,&sa,NULL); sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL); sigaction(SIGILL,&sa,NULL);
// Non terminating SIGUSR1/2 handler atexit(Grid_exit_handler);
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGHUP,&sa_ping,NULL);
// atexit(Grid_exit_handler);
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
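
The Init.cc hunks above toggle between the terminating fault handlers and a non-terminating SIGHUP "ping" handler, both installed with sigaction and reporting a backtrace. A self-contained sketch of such a non-terminating SA_SIGINFO handler (POSIX/glibc only; hypothetical handler name, mirrors the style of the handlers in the diff):

// Install a non-terminating SIGUSR1 handler that reports the signal, the address,
// and a backtrace, then returns so the program keeps running.
#include <csignal>
#include <cstdio>
#include <unistd.h>
#include <execinfo.h>

static void usr_handler(int sig, siginfo_t *si, void *) {
  fprintf(stderr, "Caught signal %d at address %p\n", sig, si->si_addr);
  void *frames[32];
  int n = backtrace(frames, 32);
  backtrace_symbols_fd(frames, n, STDERR_FILENO);   // writes symbol names straight to the fd
  fflush(stderr);
}

int main() {
  struct sigaction sa = {};
  sigemptyset(&sa.sa_mask);
  sa.sa_sigaction = usr_handler;
  sa.sa_flags = SA_SIGINFO;
  sigaction(SIGUSR1, &sa, nullptr);

  raise(SIGUSR1);            // e.g. triggered externally with: kill -USR1 <pid>
  std::printf("still running after the signal\n");
  return 0;
}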

View File

@ -644,6 +644,11 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
Benchmark::Decomposition(); Benchmark::Decomposition();
int do_su4=1; int do_su4=1;

View File

@ -70,7 +70,7 @@ int main (int argc, char ** argv)
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101})); pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
std::vector<double> stop(threads); std::vector<double> stop(threads);
std::vector<Vec> sum(threads); Vector<Vec> sum(threads);
std::vector<LatticeVec> x(threads,&Grid); std::vector<LatticeVec> x(threads,&Grid);
for(int t=0;t<threads;t++){ for(int t=0;t<threads;t++){

View File

@ -78,9 +78,9 @@ int main (int argc, char ** argv)
double t0,t1; double t0,t1;
typedef typename DomainWallFermionD::Coeff_t Coeff_t; typedef typename DomainWallFermionD::Coeff_t Coeff_t;
std::vector<Coeff_t> diag = Dw.bs; Vector<Coeff_t> diag = Dw.bs;
std::vector<Coeff_t> upper= Dw.cs; Vector<Coeff_t> upper= Dw.cs;
std::vector<Coeff_t> lower= Dw.cs; Vector<Coeff_t> lower= Dw.cs;
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1]; upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
lower[0] =-Dw.mass_plus*lower[0]; lower[0] =-Dw.mass_plus*lower[0];

View File

@ -861,7 +861,7 @@ int main (int argc, char ** argv)
} }
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
// LebesgueOrder::Block = std::vector<int>({2,2,2,2}); LebesgueOrder::Block = std::vector<int>({2,2,2,2});
Benchmark::Decomposition(); Benchmark::Decomposition();

View File

@ -225,6 +225,18 @@ case ${ac_SFW_FP16} in
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac esac
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
AC_ARG_ENABLE([accelerator-aware-mpi],
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
case ${ac_ACCELERATOR_AWARE_MPI} in
yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
*);;
esac
############### SYCL/CUDA/HIP/none ############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator], AC_ARG_ENABLE([accelerator],
@ -652,6 +664,16 @@ case ${ac_SHM_FAST_PATH} in
*) ;; *) ;;
esac esac
############### communication type selection
AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
case ${ac_COMMS_THREADS} in
yes)
AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
;;
*) ;;
esac
############### communication type selection ############### communication type selection
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

View File

@ -1,23 +0,0 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx

View File

@ -1,15 +0,0 @@
#module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35
source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
git config --global http.proxy http://proxy.alcf.anl.gov:3128
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@ -1,74 +0,0 @@
#!/bin/bash
#PBS -l select=512
#PBS -q EarlyAppAccess
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=6:00:00
#PBS -N reproBigJob
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
#module load oneapi/eng-compiler/2023.05.15.003
#module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=0
export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
cd $PBS_O_WORKDIR
cp $PBS_NODEFILE nodefile
DIR=reproBigJob.$PBS_JOBID
mkdir -p $DIR
cd $DIR
cp $PBS_NODEFILE nodefile
BINARY=../Test_dwf_mixedcg_prec
echo > pingjob <<EOF
while read node ;
do
echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
done < nodefile
EOF
CMD="mpiexec -np 6144 -ppn 12 -envall --hostfile nodefile \
../gpu_tile_compact.sh \
$BINARY --mpi 8.8.8.12 --grid 128.128.128.288 \
--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --debug-stdout --log Message --debug-signals --comms-overlap"
echo $CMD > command-line
env > environment
$CMD
grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

View File

@ -1,38 +1,67 @@
#!/bin/bash #!/bin/bash
#PBS -q EarlyAppAccess #PBS -q debug
#PBS -l select=1 #PBS -l select=1
#PBS -l walltime=00:20:00 #PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA #PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR cd $PBS_O_WORKDIR
source ../sourceme.sh source ../sourceme.sh
module load pti-gpu
cp $PBS_NODEFILE nodefile #cat $PBS_NODEFILE
export OMP_NUM_THREADS=4 export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 2 nodes, 24 ranks
#
CMD="mpiexec -np 12 -ppn 12 -envall \ CMD="mpiexec -np 12 -ppn 12 -envall \
./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \ ./gpu_tile_compact.sh \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals" ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
#$CMD | tee 1node.comms
#for f in 1 2 3 4 5 6 7 8
for f in 1 CMD="mpiexec -np 1 -ppn 1 -envall \
do ./gpu_tile_compact.sh \
echo $CMD ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
$CMD | tee 1node.32.32.64.48.dwf.hbm.$f --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
done #$CMD | tee 1tile.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1node.32.32.32.48.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.64.32.96.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.32.32.48.dwf

View File

@@ -1,6 +1,6 @@
 #!/bin/bash
-#PBS -q EarlyAppAccess
+#PBS -q workq
 #PBS -l select=2
 #PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA
@@ -11,16 +11,17 @@
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
-#module load pti-gpu
+module load pti-gpu
+#cat $PBS_NODEFILE
+cp $PBS_NODEFILE nodefile
 export OMP_NUM_THREADS=4
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
@@ -33,26 +34,22 @@ export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 2 nodes, 24 ranks
 #
 CMD="mpiexec -np 24 -ppn 12 -envall \
-./gpu_tile.sh \
+./gpu_tile_compact.sh \
 ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-#$CMD | tee 2node.comms.hbm
+$CMD | tee 2node.comms
 CMD="mpiexec -np 24 -ppn 12 -envall \
+./gpu_tile_compact.sh \
 ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
---shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
-#for f in 1 2 3 4 5 6 7 8
-for f in 1
-do
-echo $CMD
-$CMD | tee 2node.32.32.64.48.dwf.hbm.$f
-done
+--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.32.32.64.48.dwf
 CMD="mpiexec -np 24 -ppn 12 -envall \
-./gpu_tile.sh \
+./gpu_tile_compact.sh \
 ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-#$CMD | tee 2node.64.64.64.96.dwf.hbm
+$CMD | tee 2node.64.64.64.96.dwf
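As a quick sanity check on the runs above: --mpi 2.2.2.3 is 24 ranks (12 per node on 2 nodes), and dividing --grid 32.32.64.48 by that decomposition gives a 16.16.32.16 local volume per rank; the 64.64.64.96 case gives 32.32.32.32 per rank. A small sketch of the same check in shell:

# Sketch: verify the MPI decomposition divides the global grid and print the local volume.
grid=(32 32 64 48); mpi=(2 2 2 3)
for i in 0 1 2 3; do
  (( grid[i] % mpi[i] )) && echo "dimension $i does not divide evenly"
  vol[i]=$(( grid[i] / mpi[i] ))
done
echo "local volume per rank: ${vol[0]}.${vol[1]}.${vol[2]}.${vol[3]}"   # 16.16.32.16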

View File

@@ -1,6 +1,6 @@
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions "
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
 ../../configure \
 --enable-simd=GPU \
 --enable-gen-simd-width=64 \
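For context, the flag change above switches the SYCL build from generic SPIR-V (JIT at load time) to ahead-of-time code generation for PVC via -fsycl-targets=spir64_gen -Xs -device -Xs pvc. A minimal sketch of a stand-alone configure wrapper using the same pattern follows; the compiler choice, install prefix and the trailing configure options are assumptions, not taken from this diff.

#!/bin/bash
# Sketch only: AOT-for-PVC build flags in the style of the config command above.
INSTALL=$HOME/prefix     # hypothetical install prefix
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -fsycl-targets=spir64_gen -Xs -device -Xs pvc -I$INSTALL/include -qmkl=parallel -fno-exceptions"
../../configure \
  --enable-simd=GPU \
  --enable-gen-simd-width=64 \
  --enable-accelerator=sycl \
  CXX=icpx                 # remaining options of the real script are elided here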

View File

@@ -1,14 +1,40 @@
+module load oneapi/release/2023.12.15.001
+#module load mpich/icc-all-debug-pmix-gpu/52.2
+#module load mpich-config/mode/deterministic
+#module load intel_compute_runtime/release/821.35
 source ~/spack/share/spack/setup-env.sh
 spack load c-lime
+spack load openssl
 export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+#spack load libefence
+#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
+#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
+#spack load gperftools
+export TCMALLOC=/home/paboyle/gperftools/install
+export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
+export INTELGT_AUTO_ATTACH_DISABLE=1
+#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
+#module load oneapi/release/2023.12.15.001
+#module use /soft/modulefiles
+#module load intel_compute_runtime/release/agama-devel-682.22
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+#
+# -ftarget-register-alloc-mode=pvc:default
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
-#source ~/spack/share/spack/setup-env.sh
-#spack load gperftools
-#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
-#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
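The CLIME line kept above pulls the c-lime installation prefix out of spack find --paths. The same pattern works for any spack package; a small sketch (the package name is used purely as an example):

# Sketch: capture a spack package's install prefix in a shell variable.
spack load c-lime
CLIME=$(spack find --paths c-lime | grep '^c-lime' | awk '{print $2}')
echo "c-lime prefix: $CLIME"
# the prefix can then be fed to a build, e.g. -I$CLIME/include -L$CLIME/lib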

View File

@@ -2,8 +2,7 @@
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
-#PBS -l select=16
-#PBS -q EarlyAppAccess
+#PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=01:00:00
 #PBS -N dwf
@@ -14,14 +13,19 @@
 cd $PBS_O_WORKDIR
-source ../sourceme.sh
+#source ../sourceme.sh
 cat $PBS_NODEFILE
+#export MPICH_COLL_SYNC=1
+#export MPICH_ENV_DISPLAY=1
+export MPICH_
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
+#module load mpich/51.2/icc-all-deterministic-pmix-gpu
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

View File

@@ -1,7 +1,6 @@
 #!/bin/bash
-#PBS -l select=16
-#PBS -q EarlyAppAccess
+#PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N repro1gpu
@@ -10,9 +9,8 @@
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
-#module load oneapi/eng-compiler/2023.05.15.003
-#module load mpich/51.2/icc-all-deterministic-pmix-gpu
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
@@ -36,8 +34,6 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 cd $PBS_O_WORKDIR
-source ../sourceme.sh
 NN=`cat $PBS_NODEFILE | wc -l`
 echo $PBS_NODEFILE
 cat $PBS_NODEFILE
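NN above is the node count taken from $PBS_NODEFILE. A common follow-on, sketched here rather than shown in this diff, is to derive the total rank count from it before the mpiexec call; the 12 ranks per node figure is carried over from the benchmark scripts above as an assumption.

# Sketch: derive rank counts from the PBS nodefile (12 ranks per node assumed).
NN=$(wc -l < "$PBS_NODEFILE")   # nodes allocated to the job
PPN=12                          # assumed ranks per node
NP=$(( NN * PPN ))
echo "nodes=$NN total ranks=$NP"
# mpiexec -np $NP -ppn $PPN -envall ./gpu_tile_compact.sh ./Benchmark_dwf_fp32 ...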

Some files were not shown because too many files have changed in this diff.