Merge pull request #475 from lehner/feature-aurora

Sync with GPT on Aurora
verbosity
2025-06-15 06:17:05 +01:00 · 2025-03-13 08:55:55 -04:00 · 2025-03-13 12:49:36 +00:00 · 2025-03-13 08:48:23 +00:00 · 2025-03-13 07:29:55 +00:00 · 2025-03-11 15:02:32 +00:00
169 changed files with 8384 additions and 4114 deletions
--- a/BLAS_benchmark/BatchBlasBench.cc
+++ b/BLAS_benchmark/BatchBlasBench.cc
@ -12,15 +12,13 @@
 #include <iostream>
 #include <sys/time.h>
 #define GRID_SYCL
 #undef  GRID_HIP
 #undef  GRID_CUDA
 #ifdef GRID_HIP
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
 #include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
 #include <oneapi/mkl.hpp>
@ -45,6 +43,90 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
 #define accelerator_barrier(dummy) { theAccelerator->wait(); }
 #endif
 #ifdef GRID_HIP
 hipStream_t copyStream;
 hipStream_t computeStream;
 // HIP backend start-up: select device 0 and create the copy and compute streams.
 // The HIP status codes are captured but never inspected.
 void acceleratorInit(void)
 {
  const int deviceId = 0;
  auto status = hipSetDevice(deviceId);
  status = hipStreamCreate(&copyStream);
  status = hipStreamCreate(&computeStream);
  printf("AcceleratorHIPInit\n");
 }
 // Allocate `bytes` of device memory with hipMalloc.
 // Returns the device pointer, or NULL after reporting the HIP error on stderr.
 inline void *acceleratorAllocDevice(size_t bytes)
 {
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    // %zu is the conversion specifier for size_t (the previous %ld mismatched the argument type)
    fprintf(stderr," hipMalloc failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
 };
 // Thin HIP wrappers. Every HIP status code is stored in `discard` and never checked.
 // Release device memory; the `bytes` argument is accepted but not used.
 inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
 // Release device memory.
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 // Fill `bytes` bytes at `base` with `value` via hipMemset.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
 // Host -> device copy. Note the argument order: source (`from`) first, destination (`to`) second.
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 // Device -> host copy, same (from,to) argument order.
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 // Wait for computeStream to finish, then query hipGetLastError().
 // On error: print the HIP error string, the file and the line, and terminate
 // with a non-zero exit status so that callers/scripts can detect the failure.
 // The macro argument is ignored.
 #define accelerator_barrier(dummy)				\
  {								\
    auto tmp=hipStreamSynchronize(computeStream);		\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipStreamSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
      puts(__FILE__);							\
      printf("Line %d\n",__LINE__);				\
      exit(1);							\
    }								\
  }
 #endif
 #ifdef GRID_CUDA
 cudaStream_t copyStream;
 cudaStream_t computeStream;
 void acceleratorInit(void)
 {
  int device = 0;
  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
  cudaStreamCreate(&computeStream);
 }
 // Allocate `bytes` of device memory with cudaMalloc.
 // Returns the device pointer, or NULL after printing the CUDA error string.
 inline void *acceleratorAllocDevice(size_t bytes)
 {
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    // %zu is the conversion specifier for size_t; the previous %d mismatched the argument type
    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
 };
 // Thin CUDA wrappers; none of them inspects the cudaError_t returned by the runtime.
 // NOTE(review): the HIP section also defines acceleratorFreeDevice(void*,size_t);
 // no such overload is visible in this CUDA section -- confirm whether callers need it.
 // Both free functions forward to cudaFree.
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 // Host -> device copy. Note the argument order: source (`from`) first, destination (`to`) second.
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 // Device -> host copy, same (from,to) argument order.
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 // Fill `bytes` bytes at `base` with `value` via cudaMemset.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 // Wait for computeStream to finish, then query cudaGetLastError().
 // On error the CUDA error string and the file/line are printed and stdout is flushed;
 // the assert only fires when acceleratorAbortOnGpuError is true.
 // NOTE(review): acceleratorAbortOnGpuError is not defined in the visible part of this
 // file -- confirm it is in scope for the CUDA build. The macro argument is ignored.
 #define accelerator_barrier(dummy)					\
  {									\
    cudaStreamSynchronize(computeStream);				\
    cudaError err = cudaGetLastError();					\
    if ( cudaSuccess != err ) {						\
      printf("accelerator_barrier(): Cuda error %s \n",			\
 	     cudaGetErrorString( err ));				\
      printf("File %s Line %d\n",__FILE__,__LINE__);			\
      fflush(stdout);							\
      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);		\
    }									\
  }
 #endif
 template<class T> void acceleratorPut(T& dev,T&host)
 {
  acceleratorCopyToDevice(&host,&dev,sizeof(T));
@ -55,9 +137,6 @@ template<class T> T acceleratorGet(T& dev)
  acceleratorCopyFromDevice(&dev,&host,sizeof(T));
  return host;
 }
 #define accelerator_barrier(dummy) { theAccelerator->wait(); }
 #endif
 /**************************************************************
 * Allocator
@ -211,6 +290,269 @@ public:
 #endif
  }
  /////////////////////////////////////////////////////////////
  // Single matrix GEMM -- fp64 and fp32
  /////////////////////////////////////////////////////////////
  // Single (non-batched) complex double precision GEMM.
  // The Eigen reference path at the end of this function computes
  //   Cmn = beta*Cmn + alpha*op(Amk)*op(Bkn)
  // with op() either identity (GridBLAS_OP_N) or adjoint (GridBLAS_OP_C);
  // GridBLAS_OP_T is rejected by the asserts. Matrices are column major.
  // alpha and beta are taken by value and staged to the device before the BLAS call.
  void gemm(GridBLASOperation_t OpA,
 	    GridBLASOperation_t OpB,
 	    int m,int n, int k,
 	    ComplexD alpha,
 	    ComplexD* Amk,  // Device pointer
 	    ComplexD* Bkn,
 	    ComplexD beta,
 	    ComplexD* Cmn)
  {
    // t0, t1, t2, flops and bytes are computed in this function but not used further here
    RealD t2=usecond();
    assert(OpA!=GridBLAS_OP_T); // Complex case: plain transpose is not expected
    assert(OpB!=GridBLAS_OP_T);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    // A non-N operand is stored with the other shape (k x m, n x k), so its leading dimension changes
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;
    // One-element device vectors, created once, that hold alpha and beta for the BLAS call
    static deviceVector<ComplexD> alpha_p(1);
    static deviceVector<ComplexD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    RealD t0=usecond();
 #ifdef GRID_HIP
    // Translate the Grid operation enum to hipBLAS and call Zgemm
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasZgemm(gridblasHandle,
 			    hOpA,
 			    hOpB,
 			    m,n,k,
 			    (hipblasDoubleComplex *) &alpha_p[0],
 			    (hipblasDoubleComplex *) Amk, lda,
 			    (hipblasDoubleComplex *) Bkn, ldb,
 			    (hipblasDoubleComplex *) &beta_p[0],
 			    (hipblasDoubleComplex *) Cmn, ldc);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Translate the Grid operation enum to cuBLAS and call Zgemm
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasZgemm(gridblasHandle,
 			   hOpA,
 			   hOpB,
 			   m,n,k,
 			   (cuDoubleComplex *) &alpha_p[0],
 			   (cuDoubleComplex *) Amk, lda,
 			   (cuDoubleComplex *) Bkn, ldb,
 			   (cuDoubleComplex *) &beta_p[0],
 			   (cuDoubleComplex *) Cmn, ldc);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
      // oneMKL takes 64-bit sizes; this path calls synchronise() after the gemm
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
      int64_t lda64=lda;
      int64_t ldb64=ldb;
      int64_t ldc64=ldc;
      oneapi::mkl::transpose iOpA;
      oneapi::mkl::transpose iOpB;
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
      oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
 					    iOpA,
 					    iOpB,
 					    m64,n64,k64,
 					    (ComplexD *) &alpha_p[0],
 					    (const ComplexD *)Amk, (int64_t )lda64,
 					    (const ComplexD *)Bkn, (int64_t )ldb64,
 					    (ComplexD *) &beta_p[0],
 					    (ComplexD *)Cmn, (int64_t)ldc64);
      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; use Eigen
    // Each branch maps the raw pointers with the stored shape and applies adjoint() where requested
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 	Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
 	Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
 	Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk * eBkn ;
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
 	Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
 	Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
 	Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
 	Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
 	Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
 	Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
      } else { 
 	assert(0);
      }
 #endif
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k;
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
  }
  // Single (non-batched) complex single precision GEMM.
  // The Eigen reference path at the end of this function computes
  //   Cmn = beta*Cmn + alpha*op(Amk)*op(Bkn)
  // with op() either identity (GridBLAS_OP_N) or adjoint (GridBLAS_OP_C);
  // GridBLAS_OP_T is rejected by the asserts. Matrices are column major.
  // alpha and beta are taken by value and staged to the device before the BLAS call.
  void gemm(GridBLASOperation_t OpA,
 	    GridBLASOperation_t OpB,
 	    int m,int n, int k,
 	    ComplexF alpha,
 	    ComplexF* Amk,  // Device pointer
 	    ComplexF* Bkn,
 	    ComplexF beta,
 	    ComplexF* Cmn)
  {
    // t0, t1, t2, flops and bytes are computed in this function but not used further here
    RealD t2=usecond();
    assert(OpA!=GridBLAS_OP_T); // Complex case: plain transpose is not expected
    assert(OpB!=GridBLAS_OP_T);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    // A non-N operand is stored with the other shape (k x m, n x k), so its leading dimension changes
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;
    // One-element device vectors, created once, that hold alpha and beta for the BLAS call
    static deviceVector<ComplexF> alpha_p(1);
    static deviceVector<ComplexF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
    RealD t0=usecond();
 #ifdef GRID_HIP
    // Translate the Grid operation enum to hipBLAS and call Cgemm
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasCgemm(gridblasHandle,
 			    hOpA,
 			    hOpB,
 			    m,n,k,
 			    (hipblasComplex *) &alpha_p[0],
 			    (hipblasComplex *) Amk, lda,
 			    (hipblasComplex *) Bkn, ldb,
 			    (hipblasComplex *) &beta_p[0],
 			    (hipblasComplex *) Cmn, ldc);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Translate the Grid operation enum to cuBLAS and call Cgemm
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasCgemm(gridblasHandle,
 			   hOpA,
 			   hOpB,
 			   m,n,k,
 			   (cuComplex *) &alpha_p[0],
 			   (cuComplex *) Amk, lda,
 			   (cuComplex *) Bkn, ldb,
 			   (cuComplex *) &beta_p[0],
 			   (cuComplex *) Cmn, ldc);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
      // oneMKL takes 64-bit sizes; this path calls synchronise() after the gemm
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
      int64_t lda64=lda;
      int64_t ldb64=ldb;
      int64_t ldc64=ldc;
      oneapi::mkl::transpose iOpA;
      oneapi::mkl::transpose iOpB;
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
      oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
 					    iOpA,
 					    iOpB,
 					    m64,n64,k64,
 					    (ComplexF *) &alpha_p[0],
 					    (const ComplexF *)Amk, (int64_t )lda64,
 					    (const ComplexF *)Bkn, (int64_t )ldb64,
 					    (ComplexF *) &beta_p[0],
 					    (ComplexF *)Cmn, (int64_t )ldc64);
      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; use Eigen
    // Each branch maps the raw pointers with the stored shape and applies adjoint() where requested
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 	Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
 	Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
 	Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk * eBkn ;
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
 	Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
 	Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
 	Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
 	Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
 	Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
 	Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
 	eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
      } else { 
 	assert(0);
      }
 #endif
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k;
     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
  }
  /////////////////////////////////////////////////////////////
  void gemmBatched(int m,int n, int k,
 		   ComplexD alpha,
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
@ -241,36 +583,6 @@ public:
 		beta,
 		Cmn);
  }
  // Real double precision convenience overload: forwards to the general
  // gemmBatched with GridBLAS_OP_N for both operands.
  void gemmBatched(int m,int n, int k,
 		   RealD alpha,
 		   deviceVector<RealD*> &Amk,  // pointer list to matrices
 		   deviceVector<RealD*> &Bkn,
 		   RealD beta,
 		   deviceVector<RealD*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,m,n,k,alpha,Amk,Bkn,beta,Cmn);
  }
  // Real single precision convenience overload: forwards to the general
  // gemmBatched with GridBLAS_OP_N for both operands.
  void gemmBatched(int m,int n, int k,
 		   RealF alpha,
 		   deviceVector<RealF*> &Amk,  // pointer list to matrices
 		   deviceVector<RealF*> &Bkn,
 		   RealF beta,
 		   deviceVector<RealF*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,m,n,k,alpha,Amk,Bkn,beta,Cmn);
  }
  void gemmBatched(GridBLASOperation_t OpA,
 		   GridBLASOperation_t OpB,
@ -624,301 +936,6 @@ public:
     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Single precision real GEMM
  ///////////////////////////////////////////////////////////////////////////
  // Batched real single precision GEMM over Amk.size() matrices.
  // The Eigen reference path computes, for every batch entry p,
  //   Cmn[p] = beta*Cmn[p] + alpha*op(Amk[p])*op(Bkn[p])
  // with op() identity (GridBLAS_OP_N) or transpose (GridBLAS_OP_T);
  // GridBLAS_OP_C is rejected by the asserts. Matrices are column major.
  // Bkn and Cmn must hold the same number of pointers as Amk (asserted).
  void gemmBatched(GridBLASOperation_t OpA,
 		   GridBLASOperation_t OpB,
 		   int m,int n, int k,
 		   RealF alpha,
 		   deviceVector<RealF*> &Amk,  // pointer list to matrices
 		   deviceVector<RealF*> &Bkn,
 		   RealF beta,
 		   deviceVector<RealF*> &Cmn)
  {
    // t0, t1, t2, flops and bytes are computed in this function but not used further here
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
    assert(OpB!=GridBLAS_OP_C);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    // A non-N operand is stored with the other shape (k x m, n x k), so its leading dimension changes
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;
    // One-element device vectors, created once, that hold alpha and beta for the BLAS call
    static deviceVector<RealF> alpha_p(1);
    static deviceVector<RealF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
    RealD t0=usecond();
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
    // Translate the Grid operation enum to hipBLAS and call SgemmBatched
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasSgemmBatched(gridblasHandle,
 				   hOpA,
 				   hOpB,
 				   m,n,k,
 				   (float *) &alpha_p[0],
 				   (float **)&Amk[0], lda,
 				   (float **)&Bkn[0], ldb,
 				   (float *) &beta_p[0],
 				   (float **)&Cmn[0], ldc,
 				   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Translate the Grid operation enum to cuBLAS and call SgemmBatched
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasSgemmBatched(gridblasHandle,
 				  hOpA,
 				  hOpB,
 				  m,n,k,
 				  (float *) &alpha_p[0],
 				  (float **)&Amk[0], lda,
 				  (float **)&Bkn[0], ldb,
 				  (float *) &beta_p[0],
 				  (float **)&Cmn[0], ldc,
 				  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
      // oneMKL group API: a single group (count 1) holding batchCount64 matrices;
      // this path calls synchronise() after the gemm_batch
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
      int64_t lda64=lda;
      int64_t ldb64=ldb;
      int64_t ldc64=ldc;
      int64_t batchCount64=batchCount;
      oneapi::mkl::transpose iOpA;
      oneapi::mkl::transpose iOpB;
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 						  &iOpA,
 						  &iOpB,
 						  &m64,&n64,&k64,
 						  (float *) &alpha_p[0],
 						  (const float **)&Amk[0], (const int64_t *)&lda64,
 						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
 						  (float *) &beta_p[0],
 						  (float **)&Cmn[0], (const int64_t *)&ldc64,
 						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; use Eigen
    // Each branch loops over the batch with thread_for and applies transpose() where requested
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
      }
 #endif
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Double precision real GEMM
  ///////////////////////////////////////////////////////////////////////////
  // Batched real double precision GEMM over Amk.size() matrices.
  // The Eigen reference path computes, for every batch entry p,
  //   Cmn[p] = beta*Cmn[p] + alpha*op(Amk[p])*op(Bkn[p])
  // with op() identity (GridBLAS_OP_N) or transpose (GridBLAS_OP_T);
  // GridBLAS_OP_C is rejected by the asserts. Matrices are column major.
  // Bkn and Cmn must hold the same number of pointers as Amk (asserted).
  void gemmBatched(GridBLASOperation_t OpA,
 		   GridBLASOperation_t OpB,
 		   int m,int n, int k,
 		   RealD alpha,
 		   deviceVector<RealD*> &Amk,  // pointer list to matrices
 		   deviceVector<RealD*> &Bkn,
 		   RealD beta,
 		   deviceVector<RealD*> &Cmn)
  {
    // t0, t1, t2, flops and bytes are computed in this function but not used further here
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
    assert(OpB!=GridBLAS_OP_C);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    // A non-N operand is stored with the other shape (k x m, n x k), so its leading dimension changes
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;
    // One-element device vectors, created once, that hold alpha and beta for the BLAS call
    static deviceVector<RealD> alpha_p(1);
    static deviceVector<RealD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
    RealD t0=usecond();
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
    // Translate the Grid operation enum to hipBLAS and call DgemmBatched
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    // Pass the translated operations (previously hard-coded to HIPBLAS_OP_N, which
    // ignored a requested transpose even though lda/ldb were adjusted for it)
    auto err = hipblasDgemmBatched(gridblasHandle,
 				   hOpA,
 				   hOpB,
 				   m,n,k,
 				   (double *) &alpha_p[0],
 				   (double **)&Amk[0], lda,
 				   (double **)&Bkn[0], ldb,
 				   (double *) &beta_p[0],
 				   (double **)&Cmn[0], ldc,
 				   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Translate the Grid operation enum to cuBLAS and call DgemmBatched
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasDgemmBatched(gridblasHandle,
 				  hOpA,
 				  hOpB,
 				  m,n,k,
 				  (double *) &alpha_p[0],
 				  (double **)&Amk[0], lda,
 				  (double **)&Bkn[0], ldb,
 				  (double *) &beta_p[0],
 				  (double **)&Cmn[0], ldc,
 				  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
      // oneMKL group API: a single group (count 1) holding batchCount64 matrices;
      // this path calls synchronise() after the gemm_batch
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
      int64_t lda64=lda;
      int64_t ldb64=ldb;
      int64_t ldc64=ldc;
      int64_t batchCount64=batchCount;
      oneapi::mkl::transpose iOpA;
      oneapi::mkl::transpose iOpB;
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 						  &iOpA,
 						  &iOpB,
 						  &m64,&n64,&k64,
 						  (double *) &alpha_p[0],
 						  (const double **)&Amk[0], (const int64_t *)&lda64,
 						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
 						  (double *) &beta_p[0],
 						  (double **)&Cmn[0], (const int64_t *)&ldc64,
 						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; use Eigen
    // Each branch loops over the batch with thread_for and applies transpose() where requested
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
      }
 #endif
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
  }
  template<class CComplex>
  double benchmark(int M, int N, int K, int BATCH)
  {
@ -967,6 +984,47 @@ public:
    return flops; // Returns gigaflops
  }
  template<class CComplex>
  double benchmark(int M, int N, int K)
  {
    int32_t N_A = M*K;
    int32_t N_B = K*N;
    int32_t N_C = M*N;
    deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
    deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
    deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
    CComplex alpha(1.0);
    CComplex beta (1.0);
    RealD flops = 8.0*M*N*K;
    int ncall=10;
    gemm(GridBLAS_OP_C,GridBLAS_OP_N,
 	 M,N,K,
 	 alpha,
 	 &A[0], // m x k 
 	 &B[0], // k x n
 	 beta, 
 	 &C[0]);
    synchronise();
    RealD t0 = usecond();
    for(int i=0;i<ncall;i++){
      gemm(GridBLAS_OP_N,GridBLAS_OP_N,
 	   M,N,K,
 	   alpha,
 	   &A[0], // m x k 
 	   &B[0], // k x n
 	   beta, 
 	   &C[0]);
      synchronise();
    }
    RealD t1 = usecond();
    RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
    flops = 8.0*M*N*K*ncall;
    flops = flops/(t1-t0)/1.e3;
    return flops; // Returns gigaflops
  }
 };
@ -1035,6 +1093,21 @@ static void BLAS(void)
      std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
    }}
  fprintf(FP,"\n\n\n");
  std::cout << "----------------------------------------------------------"<<std::endl;
  std::cout << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;
  {
    int M=12;
    int N=12;
    std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
    for( int kk=0;kk<ks.size();kk++ ) {
      int K = ks[kk];
      double p=blas.benchmark<CComplex>(M,N,K);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
      std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
    }
  }
  std::cout << "=================================================================================="<<std::endl;
 };
--- a/BLAS_benchmark/compile-command
+++ b/BLAS_benchmark/compile-command
@ -1,2 +1,2 @@
-mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
+mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
--- a/BLAS_benchmark/compile-command-frontier
+++ b/BLAS_benchmark/compile-command-frontier
@ -0,0 +1,5 @@
 # Build BatchBlasBench with hipcc for the HIP backend (-DGRID_HIP).
 # Relies on ROCM_PATH, MPICH_DIR and CRAY_MPICH_ROOTDIR being set in the environment.
 CXX=hipcc
 MPICXX=mpicxx 
 # ${ROCM_PATH} (not {$ROCM_PATH}): the latter leaves literal braces in the include path
 CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
 LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
 hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
--- a/BLAS_benchmark/compile-command-sunspot
+++ b/BLAS_benchmark/compile-command-sunspot
@ -0,0 +1,2 @@
 mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@ -50,6 +50,7 @@ NAMESPACE_CHECK(approx);
 #include <Grid/algorithms/deflation/Deflation.h>
 #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
 #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
 #include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
 NAMESPACE_CHECK(deflation);
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@ -168,6 +168,7 @@ public:
  template<class vobj>
  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
    std::cerr << "FFTW is not compiled but is called"<<std::endl;
    assert(0);
 #else
    conformable(result.Grid(),vgrid);
@ -190,6 +191,7 @@ public:
    Lattice<sobj> pgbuf(&pencil_g);
    autoView(pgbuf_v , pgbuf, CpuWrite);
    //std::cout << "CPU view" << std::endl;
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
@ -213,6 +215,7 @@ public:
    else if ( sign == forward ) div = 1.0;
    else assert(0);
    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -226,6 +229,7 @@ public:
    }
    // Barrel shift and collect global pencil
    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Nd), gcoor(Nd);
    result = source;
    int pc = processor_coor[dim];
@ -247,6 +251,7 @@ public:
      }
    }
    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
    // Loop over orthog coords
    int NN=pencil_g.lSites();
    GridStopWatch timer;
@ -269,6 +274,7 @@ public:
    usec += timer.useconds();
    flops+= flops_call*NN;
    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
@ -285,6 +291,7 @@ public:
    }
    result = result*div;
    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
 #endif
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -103,6 +103,38 @@ public:
    _Mat.MdagM(in,out);
  }
 };
 // Linear operator adaptor around a Matrix reference.
 // Op/AdjOp forward to M/Mdag; the Hermitian operation forwards to Matrix::MMdag.
 template<class Matrix,class Field>
 class MMdagLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
 public:
  MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};

  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) { _Mat.Mdiag(in,out); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ _Mat.MdirAll(in,out); };

  // Plain and adjoint application
  void Op     (const Field &in, Field &out){ _Mat.M(in,out); }
  void AdjOp     (const Field &in, Field &out){ _Mat.Mdag(in,out); }

  // Applies MMdag, then sets n1 = real part of innerProduct(in,out) and n2 = norm2(out)
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    _Mat.MMdag(in,out);
    ComplexD inner = innerProduct(in,out);
    n1=real(inner);
    n2=norm2(out);
  }

  // Applies MMdag only
  void HermOp(const Field &in, Field &out){ _Mat.MMdag(in,out); }
 };
 ////////////////////////////////////////////////////////////////////
 // Construct herm op and shift it for mgrid smoother
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@ -45,6 +45,11 @@ public:
    M(in,tmp);
    Mdag(tmp,out);
  }
  // Default implementation of out = M Mdag in, built from the two single applications.
  // Virtual, so a derived matrix may override it.
  virtual void  MMdag(const Field &in, Field &out) {
    Field tmp (in.Grid()); // intermediate Mdag*in, allocated on the same grid as the input
    Mdag(in,tmp);
    M(tmp,out);
  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@ -59,7 +59,7 @@ public:
    RealD diff = hi-lo;
    RealD delta = diff*1.0e-9;
    for (RealD x=lo; x<hi; x+=delta) {
-      delta*=1.1;
+      delta*=1.02;
      RealD f = approx(x);
      out<< x<<" "<<f<<std::endl;
    }
@ -131,6 +131,26 @@ public:
      Coeffs[j] = s * 2.0/order;
    }
  };
  // Build the Chebyshev coefficients of func on the interval [_lo,_hi].
  //
  //   _lo, _hi : interval end points, stored in lo / hi
  //   _order   : number of coefficients; the process exits with -1 if it is < 2
  //   func     : callable evaluated as func(x) with x in [lo,hi]
  //
  // Coeffs[j] = (2/order) * sum_k func(x_k) * cos( j*pi*(k+0.5)/order ),
  // with x_k the interval-mapped point cos(pi*(k+0.5)/order).
  //
  // func is evaluated exactly once per node (order calls in total): the node
  // position and function value do not depend on j, so the k loop is outermost
  // and each sample is accumulated into every coefficient. The terms of each
  // coefficient are still added in increasing k.
  template<class functor>
  void Init(RealD _lo,RealD _hi,int _order, functor & func)
  {
    lo=_lo;
    hi=_hi;
    order=_order;
    if(order < 2) exit(-1);
    Coeffs.resize(order);
    for(int j=0;j<order;j++){
      Coeffs[j] = 0;
    }
    for(int k=0;k<order;k++){
      RealD y=std::cos(M_PI*(k+0.5)/order); // node on [-1,1]
      RealD x=0.5*(y*(hi-lo)+(hi+lo));       // mapped to [lo,hi]
      RealD f=func(x);
      for(int j=0;j<order;j++){
	Coeffs[j] = Coeffs[j] + f*std::cos( j*M_PI*(k+0.5)/order );
      }
    }
    for(int j=0;j<order;j++){
      Coeffs[j] = Coeffs[j] * 2.0/order;
    }
  };
  void JacksonSmooth(void){
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
@ -89,9 +89,9 @@ public:
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::gpu_selector selector;
+      sycl::gpu_selector selector;
-      cl::sycl::device selectedDevice { selector };
+      sycl::device selectedDevice { selector };
-      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      sycl::property_list q_prop{sycl::property::queue::in_order()};
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
      gridblasInit=1;
@ -208,8 +208,8 @@ public:
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpB!=GridBLAS_OP_T);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -367,28 +367,67 @@ public:
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
        });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.adjoint() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	// C = alpha * A * B^T + beta * C.
 	// As in every other branch, C is not read when beta is zero, so that
 	// NaN/Inf already present in the output buffer cannot leak into the result.
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@ -414,8 +453,8 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpB!=GridBLAS_OP_T);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@ -514,28 +553,70 @@ public:
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.adjoint() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  else
 	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
 	  } );
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@ -661,28 +742,40 @@ public:
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
@ -809,28 +902,40 @@ public:
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 	  else
 	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 	  if (std::abs(beta) != 0.0)
 	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 	  else
 	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
--- a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
+++ b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
@ -0,0 +1,376 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: MultiRHSBlockCGLinalg.h
    Copyright (C) 2024
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 /* Need helper object for BLAS accelerated mrhs blockCG */
 template<class Field>
 class MultiRHSBlockCGLinalg
 {
 public:
  typedef typename Field::scalar_type   scalar;
  typedef typename Field::scalar_object scalar_object;
  typedef typename Field::vector_object vector_object;
  deviceVector<scalar> BLAS_X;      // nrhs x vol -- the sources
  deviceVector<scalar> BLAS_Y;      // nrhs x vol -- the result
  deviceVector<scalar> BLAS_C;      // nrhs x nrhs -- the coefficients 
  deviceVector<scalar> BLAS_Cred;   // nrhs x nrhs x oSites -- reduction buffer
  deviceVector<scalar *> Xdip;
  deviceVector<scalar *> Ydip;
  deviceVector<scalar *> Cdip;
  MultiRHSBlockCGLinalg() {};
  ~MultiRHSBlockCGLinalg(){ Deallocate(); };
  // Shrink every work buffer of this helper to zero length.
  // Called from the destructor; the buffers are resized again on the next
  // MulMatrix / InnerProductMatrix call.
  // NOTE(review): presumably resize(0) releases the device storage -- confirm
  // against the deviceVector implementation.
  void Deallocate(void)
  {
    Xdip.resize(0);
    Ydip.resize(0);
    Cdip.resize(0);
    BLAS_Cred.resize(0);
    BLAS_C.resize(0);
    BLAS_X.resize(0);
    BLAS_Y.resize(0);
  }
  void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
  {
    std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
    for(int r=0;r<AP.size();r++){
      Y_copy[r] = Y[r];
    }
    MulMatrix(AP,m,X);
    for(int r=0;r<AP.size();r++){
      AP[r] = scale*AP[r]+Y_copy[r];
    }
  }
  // Y = X . m : every output field is a linear combination of the input fields,
  // computed with a single batched-BLAS gemm call (batch count 1).
  //
  //   Y : output fields, overwritten; Y.size() fixes nrhs
  //   m : coefficient matrix; nrhs*nrhs entries are copied raw to the device
  //   X : input fields; X[0] supplies the grid, X[r] is read for r < nrhs
  //
  // The fields are packed contiguously into BLAS_X (rhs index slowest), multiplied
  // into BLAS_Y and unpacked into Y. Timings of each stage go to GridLogPerformance.
  // NOTE(review): the raw copy of m copies BLAS_C.size()*sizeof(scalar) bytes from
  // an Eigen::MatrixXcd, so this presumably requires scalar to be double precision
  // complex and m to be nrhs x nrhs -- confirm with callers.
  void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
  {
    typedef typename Field::scalar_type scomplex;
    GridBase *grid;
    uint64_t vol;
    uint64_t words;
    int nrhs = Y.size();
    grid  = X[0].Grid();
    vol   = grid->lSites();
    words = sizeof(scalar_object)/sizeof(scalar);
    int64_t vw = vol * words; // scalars per field on this rank
    RealD t0 = usecond();
    BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
    BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
    BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
    RealD t1 = usecond();
    /////////////////////////////////////////////
    // Copy in the multi-rhs sources
    /////////////////////////////////////////////
    for(int r=0;r<nrhs;r++){
      int64_t offset = r*vw;
      autoView(x_v,X[r],AcceleratorRead);
      acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
    }
    // Assumes Eigen storage contiguous
    acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
  /*
   * in Fortran column major notation (cuBlas order)
   *
   * Xxr = [X1(x)][..][Xn(x)]
   * Yxr = [Y1(x)][..][Ym(x)]
   * Y = X . C
   */
    // One-entry pointer tables: the batched interface takes arrays of matrix pointers
    deviceVector<scalar *> Xd(1);
    deviceVector<scalar *> Yd(1);
    deviceVector<scalar *> Cd(1);
    scalar * Xh = & BLAS_X[0];
    scalar * Yh = & BLAS_Y[0];
    scalar * Ch = & BLAS_C[0];
    acceleratorPut(Xd[0],Xh);
    acceleratorPut(Yd[0],Yh);
    acceleratorPut(Cd[0],Ch);
    RealD t2 = usecond();
    GridBLAS BLAS;
    /////////////////////////////////////////
    // Y = X*C (transpose?)
    /////////////////////////////////////////
    // (vw x nrhs) = (vw x nrhs) . (nrhs x nrhs), no transposition on either operand
    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
    		     vw,nrhs,nrhs,
 		     scalar(1.0),
 		     Xd,
 		     Cd,
 		     scalar(0.0),  // wipe out Y
 		     Yd);
    BLAS.synchronise();
    RealD t3 = usecond();
    // Copy back Y = X . m
    for(int r=0;r<nrhs;r++){
      int64_t offset = r*vw;
      autoView(y_v,Y[r],AcceleratorWrite);
      acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
    }    
    RealD t4 = usecond();
    std::cout <<GridLogPerformance << "MulMatrix alloc    took "<< t1-t0<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "MulMatrix blas     took "<< t3-t2<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "MulMatrix copy     took "<< t4-t3<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
  }
  // m = X^dag Y : the nrhs x nrhs matrix of inner products between the fields of X
  // and the fields of Y, summed over the local volume and then over all ranks with
  // GlobalSumVector.
  //
  //   m : output, overwritten with an nrhs x nrhs matrix
  //   X : fields entering conjugated (GridBLAS_OP_C); X.size() fixes nrhs
  //   Y : fields entering unconjugated; must have the same size as X and be
  //       conformable with it
  //
  // Two implementations are present:
  //   #if 0 (disabled) : one gemm over the whole local volume (K = vw).
  //   #else (active)   : X and Y are repacked as [block][rhs][words]; one gemm per
  //                      block writes a partial nrhs x nrhs result into BLAS_Cred,
  //                      and the partial results are added up on the host.
  // Timings and derived rates of each stage go to GridLogPerformance.
  // NOTE(review): the gemm coefficients are ComplexD and the host reduction
  // reinterprets the result buffer as std::complex<double>, so this presumably
  // supports only fields whose scalar type is double precision complex -- confirm.
  void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
  {
 #if 0    
    int nrhs;
    GridBase *grid;
    uint64_t vol;
    uint64_t words;
    nrhs = X.size();
    assert(X.size()==Y.size());
    conformable(X[0],Y[0]);
    grid  = X[0].Grid();
    vol   = grid->lSites();
    words = sizeof(scalar_object)/sizeof(scalar);
    int64_t vw = vol * words;
    RealD t0 = usecond();
    BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
    BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
    BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
    RealD t1 = usecond();
    /////////////////////////////////////////////
    // Copy in the multi-rhs sources
    /////////////////////////////////////////////
    for(int r=0;r<nrhs;r++){
      int64_t offset = r*vw;
      autoView(x_v,X[r],AcceleratorRead);
      acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
      autoView(y_v,Y[r],AcceleratorRead);
      acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
    }
    RealD t2 = usecond();
  /*
   * in Fortran column major notation (cuBlas order)
   *
   * Xxr = [X1(x)][..][Xn(x)]
   *
   * Yxr = [Y1(x)][..][Ym(x)]
   *
   * C_rs = X^dag Y
   */
    deviceVector<scalar *> Xd(1);
    deviceVector<scalar *> Yd(1);
    deviceVector<scalar *> Cd(1);
    scalar * Xh = & BLAS_X[0];
    scalar * Yh = & BLAS_Y[0];
    scalar * Ch = & BLAS_C[0];
    acceleratorPut(Xd[0],Xh);
    acceleratorPut(Yd[0],Yh);
    acceleratorPut(Cd[0],Ch);
    GridBLAS BLAS;
    RealD t3 = usecond();
    /////////////////////////////////////////
    // C_rs = X^dag Y
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
    		     nrhs,nrhs,vw,
 		     ComplexD(1.0),
 		     Xd,
 		     Yd,
 		     ComplexD(0.0),  // wipe out C
 		     Cd);
    BLAS.synchronise();
    RealD t4 = usecond();
    std::vector<scalar> HOST_C(BLAS_C.size());      // nrhs . nrhs -- the coefficients 
    acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
    grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
    RealD t5 = usecond();
    for(int rr=0;rr<nrhs;rr++){
      for(int r=0;r<nrhs;r++){
 	int off = r+nrhs*rr;
 	m(r,rr)=HOST_C[off];
      }
    }
    RealD t6 = usecond();
    uint64_t M=nrhs;
    uint64_t N=nrhs;
    uint64_t K=vw;
    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
    RealD flops = 8.0*M*N*K;
    flops = flops/(t4-t3)/1.e3;
    bytes = bytes/(t4-t3)/1.e3;
    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix cp   t6 "<< t6-t5<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #else
    int nrhs;
    GridBase *grid;
    uint64_t vol;
    uint64_t words;
    nrhs = X.size();
    assert(X.size()==Y.size());
    conformable(X[0],Y[0]);
    grid  = X[0].Grid();
    // Each batch entry covers rd0 outer sites; vol is the resulting number of batch
    // entries and words the number of scalars per entry and right-hand side.
    int rd0 =  grid->_rdimensions[0] * grid->_rdimensions[1];
    vol   = grid->oSites()/rd0;
    words = rd0*sizeof(vector_object)/sizeof(scalar);
    int64_t vw = vol * words;
    // The blocking must account for exactly the scalars of one local field
    assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
    RealD t0 = usecond();
    BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
    BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
    BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
    RealD t1 = usecond();
    /////////////////////////////////////////////
    // Copy in the multi-rhs sources -- layout batched BLAS ready
    /////////////////////////////////////////////
    // Scalar number ssw of field r lands at [ss][r][w] with ss=ssw/words, w=ssw%words
    for(int r=0;r<nrhs;r++){
      autoView(x_v,X[r],AcceleratorRead);
      autoView(y_v,Y[r],AcceleratorRead);
      scalar *from_x=(scalar *)&x_v[0];
      scalar *from_y=(scalar *)&y_v[0];
      scalar *BX = &BLAS_X[0];
      scalar *BY = &BLAS_Y[0];
      accelerator_for(ssw,vw,1,{
 	  uint64_t ss=ssw/words;
 	  uint64_t  w=ssw%words;
 	  uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
 	  BX[offset] = from_x[ssw];
 	  BY[offset] = from_y[ssw];
 	});
    }
    RealD t2 = usecond();
  /*
   * in Fortran column major notation (cuBlas order)
   *
   * Xxr = [X1(x)][..][Xn(x)]
   *
   * Yxr = [Y1(x)][..][Ym(x)]
   *
   * C_rs = X^dag Y
   */
    // Pointer tables with one entry per batch: built on the host, then copied to the device
    Xdip.resize(vol);
    Ydip.resize(vol);
    Cdip.resize(vol);
    std::vector<scalar *> Xh(vol);
    std::vector<scalar *> Yh(vol);
    std::vector<scalar *> Ch(vol);
    for(uint64_t ss=0;ss<vol;ss++){
      Xh[ss] = & BLAS_X[ss*nrhs*words];
      Yh[ss] = & BLAS_Y[ss*nrhs*words];
      Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
    }
    acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
    acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
    acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
    GridBLAS BLAS;
    RealD t3 = usecond();
    /////////////////////////////////////////
    // C_rs = X^dag Y
    /////////////////////////////////////////
    // One (nrhs x nrhs) = (words x nrhs)^dag . (words x nrhs) product per batch entry
    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
    		     nrhs,nrhs,words,
 		     ComplexD(1.0),
 		     Xdip,
 		     Ydip,
 		     ComplexD(0.0),  // wipe out C
 		     Cdip);
    BLAS.synchronise();
    RealD t4 = usecond();
    std::vector<scalar> HOST_C(BLAS_Cred.size());      // nrhs . nrhs -- the coefficients 
    acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
    RealD t5 = usecond();
    // Local reduction: add up the per-batch partial matrices on the host
    m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    for(int ss=0;ss<vol;ss++){
      Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
      m = m + eC;
    }
    RealD t6l = usecond();
    // Reduction across ranks, performed in place on the storage of m
    grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
    RealD t6 = usecond();
    // Rates below are reported for the equivalent single gemm with K = vw
    uint64_t M=nrhs;
    uint64_t N=nrhs;
    uint64_t K=vw;
    RealD xybytes = grid->lSites()*sizeof(scalar_object);
    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
    RealD flops = 8.0*M*N*K;
    flops = flops/(t4-t3)/1.e3;
    bytes = bytes/(t4-t3)/1.e3;
    xybytes = 4*xybytes/(t2-t1)/1.e3;
    std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix cp    t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< flops<<" GF/s"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix blas    "<< bytes<<" GB/s"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix cp     t5 "<< t5-t4<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix lsum   t6l "<< t6l-t5<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix gsum   t6 "<< t6-t6l<<" us"<<std::endl;
    std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
 #endif
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/deflation/MultiRHSBlockProject.h
+++ b/Grid/algorithms/deflation/MultiRHSBlockProject.h
@ -447,10 +447,10 @@ public:
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
    		     nbasis,nrhs,vw,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Vd,
 		     Fd,
-		     ComplexD(0.0),  // wipe out C
+		     scalar(0.0),  // wipe out C
 		     Cd);
    BLAS.synchronise();
    //    std::cout << "BlockProject done"<<std::endl;
@ -497,10 +497,10 @@ public:
    int64_t vw = block_vol * words;
    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
    		     vw,nrhs,nbasis,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Vd,
 		     Cd,
-		     ComplexD(0.0),  // wipe out C
+		     scalar(0.0),  // wipe out C
 		     Fd);
    BLAS.synchronise();
    //    std::cout << " blas call done"<<std::endl;
--- a/Grid/algorithms/deflation/MultiRHSDeflation.h
+++ b/Grid/algorithms/deflation/MultiRHSDeflation.h
@ -182,10 +182,10 @@ public:
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, 
    		     nev,nrhs,vw,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Ed,
 		     Rd,
-		     ComplexD(0.0),  // wipe out C
+		     scalar(0.0),  // wipe out C
 		     Cd);
    BLAS.synchronise();
@ -210,10 +210,10 @@ public:
    /////////////////////////////////////////
    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
 		     vw,nrhs,nev,
-		     ComplexD(1.0),
+		     scalar(1.0),
 		     Ed, // x . nev
 		     Cd, // nev . nrhs
-		     ComplexD(0.0),
+		     scalar(0.0),
 		     Gd);
    BLAS.synchronise();
--- a/Grid/algorithms/iterative/AdefMrhs.h
+++ b/Grid/algorithms/iterative/AdefMrhs.h
@ -53,6 +53,7 @@ class TwoLevelCGmrhs
  // Fine operator, Smoother, CoarseSolver
  LinearOperatorBase<Field>   &_FineLinop;
  LinearFunction<Field>   &_Smoother;
  MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
  GridStopWatch ProjectTimer;
  GridStopWatch PromoteTimer;
@ -62,7 +63,12 @@ class TwoLevelCGmrhs
  GridStopWatch SmoothTimer;
  GridStopWatch InsertTimer;
-  
+  /*
    Field rrr;
  Field sss;
  Field qqq;
  Field zzz;
  */  
  // more most operator functions
  TwoLevelCGmrhs(RealD tol,
 		 Integer maxit,
@ -73,12 +79,313 @@ class TwoLevelCGmrhs
    MaxIterations(maxit),
    _FineLinop(FineLinop),
    _Smoother(Smoother)
    /*
    rrr(fine),
    sss(fine),
    qqq(fine),
    zzz(fine)
 */
  {
    grid       = fine;
  };
  // Vector case
  // Solve for a whole vector of right-hand sides.
  // Forwards src and x unchanged to SolvePrecBlockCG; the commented-out call
  // to SolveSingleSystem is kept as the alternative solve path.
  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
  {
    //    SolveSingleSystem(src,x);
    SolvePrecBlockCG(src,x);
  }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  //Dimensions
  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
  //
  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
  //
  //   Q  C = R => Q = R C^{-1}
  //
  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
  //
  // Set C = L^{dag}, and then Q^dag Q = ident 
  //
  // Checks:
  // Cdag C = Rdag R ; passes.
  // QdagQ  = 1      ; passes
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Thin QR factorisation of the block Z, using the matrix filled by
  // _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z).
  //
  //  m_zz : overwritten with that inner product matrix, then made manifestly
  //         Hermitian by averaging with its adjoint.
  //  C    : set to L^dag where m_zz = L L^dag (Cholesky).
  //  Cinv : set to C^{-1}.
  //  Q,MQ : filled by MulMatrix(Q,Cinv,Z) and MulMatrix(MQ,Cinv,MZ)
  //         (presumably Q = Z C^{-1} and MQ = MZ C^{-1}; confirm against MulMatrix).
  //  Z,MZ : inputs, not modified.
  //
  // A failed Cholesky factorisation (matrix not positive definite) is reported
  // on the log; the routine still continues as before.
  // The three timing lines are printed on every call.
  void ThinQRfact (Eigen::MatrixXcd &m_zz,
 		   Eigen::MatrixXcd &C,
 		   Eigen::MatrixXcd &Cinv,
 		   std::vector<Field> &  Q,
 		   std::vector<Field> & MQ,
 		   const std::vector<Field> & Z,
 		   const std::vector<Field> & MZ)
  {
    RealD t0=usecond();
    _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
    RealD t1=usecond();

    // Force manifest hermiticity to avoid rounding related asymmetry
    m_zz = 0.5*(m_zz+m_zz.adjoint());

    // Keep the factorisation object so that its status can be inspected
    Eigen::LLT<Eigen::MatrixXcd> cholesky(m_zz);
    if ( cholesky.info() != Eigen::Success ) {
      std::cout << GridLogMessage
		<< "ThinQRfact: Cholesky factorisation failed; inner product matrix is not positive definite"
		<< std::endl;
    }
    Eigen::MatrixXcd L    = cholesky.matrixL();
    C    = L.adjoint();
    Cinv = C.inverse();
    RealD t3=usecond();

    _BlockCGLinalg.MulMatrix( Q,Cinv,Z);
    _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
    RealD t4=usecond();

    std::cout << " ThinQRfact IP    :"<< t1-t0<<" us"<<std::endl;
    std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
    std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
  }
  // Preconditioned block conjugate gradient over all right-hand sides in src.
  //
  //  src : the right-hand sides; each must have non-zero norm2 (asserted below).
  //  X   : passed to Vstart(X,src) first, then updated in place with the solution.
  //
  // The iteration runs for k = 0 .. MaxIterations inclusive.  It returns as soon
  // as the largest relative squared residual over all right-hand sides drops
  // below Tolerance*Tolerance, after printing the timer breakdown and the true
  // residual of every system.  If the loop finishes without converging the
  // routine prints a message and hits assert(0).
  virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
  {
    std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
    src[0].Grid()->Barrier();
    int nrhs = src.size();
    //    std::vector<RealD> f(nrhs);
    //    std::vector<RealD> rtzp(nrhs);
    //    std::vector<RealD> rtz(nrhs);
    //    std::vector<RealD> a(nrhs);
    //    std::vector<RealD> d(nrhs);
    //    std::vector<RealD> b(nrhs);
    //    std::vector<RealD> rptzp(nrhs);
    ////////////////////////////////////////////
    //Initial residual computation & set up
    ////////////////////////////////////////////
    // ssq[rhs] is the squared source norm; it is the denominator of the
    // relative residual in the convergence monitor, hence the non-zero assert.
    std::vector<RealD> ssq(nrhs);
    for(int rhs=0;rhs<nrhs;rhs++){
      ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
    }      
    ///////////////////////////
    // Fields -- eliminate duplicates between fPcg and block cg
    ///////////////////////////
    std::vector<Field> Mtmp(nrhs,grid);
    std::vector<Field> tmp(nrhs,grid);
    std::vector<Field>   Z(nrhs,grid); // Rename Z to R
    std::vector<Field>  MZ(nrhs,grid); // Rename MZ to Z
    std::vector<Field>   Q(nrhs,grid); // 
    std::vector<Field>  MQ(nrhs,grid); // Rename to P
    std::vector<Field>   D(nrhs,grid);
    std::vector<Field>  AD(nrhs,grid);
    /************************************************************************
     * Preconditioned Block conjugate gradient rQ
     * Generalise Sebastien Birk Thesis, after Dubrulle 2001.
     * Introduce preconditioning following Saad Ch9
     ************************************************************************
     * Dimensions:
     *
     *   X,B etc... ==(Nferm x nrhs)
     *  Matrix A==(Nferm x Nferm)
     *  
     * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
     * QC => Thin QR factorisation (google it)
     *
     * R = B-AX
     * Z = Mi R
     * QC = Z
     * D = Q 
     * for k: 
     *   R  = AD
     *   Z  = Mi R
     *   M  = [D^dag R]^{-1}
     *   X  = X + D M C
     *   QS = Q - Z.M
     *   D  = Q + D S^dag
     *   C  = S C
     */
    // Small nrhs x nrhs work matrices for the block recurrences
    Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(nrhs,nrhs);
    Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(nrhs,nrhs);
    Eigen::MatrixXcd m_zz     = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(nrhs,nrhs);
    Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(nrhs,nrhs);
    Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(nrhs,nrhs);
    GridStopWatch HDCGTimer;
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
    //////////////////////////
    Vstart(X,src);
    //////////////////////////
    // R = B-AX
    //////////////////////////
    for(int rhs=0;rhs<nrhs;rhs++){
      // r0 = b -A x0
      _FineLinop.HermOp(X[rhs],tmp[rhs]);
      axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]);    // Computes R=Z=src - A X0
    }
    //////////////////////////////////
    // Compute MZ = M1 Z = M1 B - M1 A x0
    //////////////////////////////////
    PcgM1(Z,MZ);  
    //////////////////////////////////
    // QC = Z
    //////////////////////////////////
    ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
    //////////////////////////////////
    // D=MQ
    //////////////////////////////////
    for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
    std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
    // Clear the member timers so the breakdown printed at convergence
    // only accumulates from this point on.
    ProjectTimer.Reset();
    PromoteTimer.Reset();
    DeflateTimer.Reset();
    CoarseTimer.Reset();
    SmoothTimer.Reset();
    FineTimer.Reset();
    InsertTimer.Reset();
    GridStopWatch M1Timer;
    GridStopWatch M2Timer;
    GridStopWatch M3Timer;
    GridStopWatch LinalgTimer;
    GridStopWatch InnerProdTimer;
    HDCGTimer.Start();
    std::vector<RealD> rn(nrhs);
    for (int k=0;k<=MaxIterations;k++){
      ////////////////////
      // Z  = AD
      ////////////////////
      M3Timer.Start();
      for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);      
      M3Timer.Stop();
      ////////////////////
      // MZ  = M1 Z <==== the Multigrid preconditioner
      ////////////////////
      M1Timer.Start();
      PcgM1(Z,MZ);
      M1Timer.Stop();
      FineTimer.Start();
      ////////////////////
      // M  = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
      ////////////////////
      InnerProdTimer.Start();
      _BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
      InnerProdTimer.Stop();
      m_M       = m_DZ.inverse();
      ///////////////////////////
      // X  = X + D MC
      ///////////////////////////
      m_tmp     = m_M * m_C;
      LinalgTimer.Start();
      _BlockCGLinalg.MaddMatrix(X,m_tmp, D,X);     // D are the search directions and X takes the updates 
      LinalgTimer.Stop();
      ///////////////////////////
      // QS = Q - M Z
      // (MQ) S = MQ - M (M1Z)
      ///////////////////////////
      LinalgTimer.Start();
      _BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
      _BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
      // Re-orthonormalise: Q and MQ are overwritten, m_S receives the new factor
      ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
      LinalgTimer.Stop();
      ////////////////////////////
      // D  = MQ + D S^dag
      ////////////////////////////
      m_tmp = m_S.adjoint();
      LinalgTimer.Start();
      _BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
      LinalgTimer.Stop();
      ////////////////////////////
      // C  = S C
      ////////////////////////////
      m_C = m_S*m_C;
      ////////////////////////////
      // convergence monitor
      ////////////////////////////
      // The diagonal of C^dag C is used as the squared residual of each rhs
      m_rr = m_C.adjoint() * m_C;
      FineTimer.Stop();
      RealD max_resid=0;
      RealD rrsum=0;
      RealD sssum=0;
      RealD rr;
      for(int b=0;b<nrhs;b++) {
 	rrsum+=real(m_rr(b,b));
 	sssum+=ssq[b];
 	rr = real(m_rr(b,b))/ssq[b];
 	if ( rr > max_resid ) max_resid = rr;
      }
      std::cout << GridLogMessage <<
 	  "\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
      // max_resid is a squared relative residual, so compare with Tolerance^2
      if ( max_resid < Tolerance*Tolerance ) { 
 	HDCGTimer.Stop();
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg  "<<LinalgTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H  "<<M3Timer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse  "<<CoarseTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine    "<<FineTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth  "<<SmoothTimer.Elapsed()<<std::endl;;
 	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert  "<<InsertTimer.Elapsed()<<std::endl;;
 	// Recompute A X - src explicitly to report the true residual of each system
 	for(int rhs=0;rhs<nrhs;rhs++){
 	  _FineLinop.HermOp(X[rhs],tmp[rhs]);			  
 	  Field mytmp(grid);
 	  axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
 	  RealD  xnorm   = sqrt(norm2(X[rhs]));
 	  RealD  srcnorm = sqrt(norm2(src[rhs]));
 	  RealD  tmpnorm = sqrt(norm2(mytmp));
 	  RealD  true_residual = tmpnorm/srcnorm;
 	  std::cout<<GridLogMessage
 		   <<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
 		   <<" solution "<<xnorm
 		   <<" source "<<srcnorm
 		   <<std::endl;
 	}
 	return;
      }
    }
    // Falling out of the loop means the tolerance was never reached
    HDCGTimer.Stop();
    std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
    assert(0);
  }
  virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
  {
    std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
    src[0].Grid()->Barrier();
@ -361,15 +668,26 @@ public:
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
-    for(int rhs=0;rhs<nrhs;rhs++) {
+    //    this->rrr=in[0];
 #undef SMOOTHER_BLOCK_SOLVE
 #if SMOOTHER_BLOCK_SOLVE
    this->SmoothTimer.Start();
    this->_Smoother(in,Min);
    this->SmoothTimer.Stop();
 #else
    for(int rhs=0;rhs<nrhs;rhs++) {
      this->SmoothTimer.Start();
      this->_Smoother(in[rhs],Min[rhs]);
      this->SmoothTimer.Stop();
    }
 #endif
    //    this->sss=Min[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      this->FineTimer.Start();
      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
      axpy(tmp[rhs],-1.0,out[rhs],in[rhs]);          // resid  = in - A Min
      this->FineTimer.Stop();
@ -401,9 +719,11 @@ public:
    this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]  
    this->PromoteTimer.Stop();
    this->FineTimer.Start();
    //    this->qqq=tmp[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
    }
    //    this->zzz=out[0];
    this->FineTimer.Stop();
  }
 };
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@ -31,6 +31,58 @@ directory
 NAMESPACE_BEGIN(Grid);
 // Fill m with every pairwise inner product of the two blocks:
 //   m(b,bp) = innerProduct(X[b],Y[bp])   for 0 <= b,bp < X.size().
 // m is written element by element and is not resized here, so the caller
 // must supply a matrix of at least X.size() x X.size(); Y must hold at
 // least X.size() fields.
 template<class Field>
 void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
  int Nblock = X.size();
  for(int b=0;b<Nblock;b++){
    for(int bp=0;bp<Nblock;bp++) {
      m(b,bp) = innerProduct(X[b],Y[bp]);
    }
  }
 }
 // AP[b] = Y[b] + sum_bp scale*m(bp,b) * X[bp]   for every block index b.
 // The result is accumulated in a scratch vector and copied into AP only at
 // the end, so AP may alias X or Y.
 template<class Field>
 void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
  using scomplex = typename Field::scalar_type;
  const int Nblock = AP.size();

  // Scratch fields, copy-constructed from X[0]
  std::vector<Field> result(Nblock,X[0]);

  for(int col=0; col<Nblock; ++col){
    result[col] = Y[col];
    for(int row=0; row<Nblock; ++row){
      result[col] = result[col] +scomplex(scale*m(row,col))*X[row];
    }
  }

  // Publish the finished block
  for(int col=0; col<Nblock; ++col){
    AP[col] = result[col];
  }
 }
 // AP[b] = sum_bp m(bp,b) * X[bp]   for every block index b.
 // AP is written while X is still being read, so AP must not alias X.
 template<class Field>
 void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
  using scomplex = typename Field::scalar_type;
  const int Nblock = AP.size();

  for(int col=0; col<Nblock; ++col){
    AP[col] = Zero();
    for(int row=0; row<Nblock; ++row){
      AP[col] += scomplex(m(row,col))*X[row];
    }
  }
 }
 // Sum of norm2 over every field in the block P (0.0 for an empty block).
 template<class Field>
 double normv(const std::vector<Field> &P){
  double total = 0.0;
  for(const Field &field : P){
    total+=norm2(field);
  }
  return total;
 }
 enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
 //////////////////////////////////////////////////////////////////////////
@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
  // Force manifest hermitian to avoid rounding related
  /*
  int rank=m_rr.rows();
  for(int r=0;r<rank;r++){
  for(int s=0;s<rank;s++){
    std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
  }}
  */
  m_rr = 0.5*(m_rr+m_rr.adjoint());
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
 //  ComplexD det = L.determinant();
 //  std::cout << " Det m_rr "<<det<<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
 		 const std::vector<Field> & R)
 {
  InnerProductMatrix(m_rr,R,R);
-
+  /*
  int rank=m_rr.rows();
  for(int r=0;r<rank;r++){
  for(int s=0;s<rank;s++){
    std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
  }}
  */
  m_rr = 0.5*(m_rr+m_rr.adjoint());
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
  //  ComplexD det = L.determinant();
  //  std::cout << " Det m_rr "<<det<<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();
@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  sliceNorm(ssq,B,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
  sliceNorm(residuals,B,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  Linop.HermOp(X, AD);
  tmp = B - AD;  
  sliceNorm(residuals,tmp,Orthog);
  for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  D=Q;
@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  RealD max_resid=0;
  int k;
  for (k = 1; k <= MaxIterations; k++) {
@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
     */
    m_rr = m_C.adjoint() * m_C;
-    RealD max_resid=0;
+    max_resid=0;
    RealD rrsum=0;
    RealD rr;
@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    }
  }
-  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
 	    <<" residual "<< std::sqrt(max_resid)<< std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  IterationsToComplete = k;
 }
 // Fill m(row,col) = innerProduct(X[row],Y[col]) for all block index pairs.
 // Nblock is not declared in this method; presumably a member of the
 // enclosing class -- confirm against the class definition.
 void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
  for(int row=0; row<Nblock; ++row){
    for(int col=0; col<Nblock; ++col){
      m(row,col) = innerProduct(X[row],Y[col]);
    }
  }
 }
 // AP[b] = Y[b] + sum_bp scale*m(bp,b) * X[bp]   for 0 <= b < Nblock.
 // The sum is built in a scratch vector first, so AP may alias X or Y.
 // Nblock and scomplex come from outside this method.
 void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
  // Scratch fields, copy-constructed from X[0]
  std::vector<Field> result(Nblock,X[0]);

  for(int col=0; col<Nblock; ++col){
    result[col] = Y[col];
    for(int row=0; row<Nblock; ++row){
      result[col] = result[col] + scomplex(scale*m(row,col))*X[row];
    }
  }

  // Publish the finished block
  for(int col=0; col<Nblock; ++col){
    AP[col] = result[col];
  }
 }
 // AP[b] = sum_bp m(bp,b) * X[bp]   for 0 <= b < Nblock.
 // AP is overwritten while X is still being read, so AP must not alias X.
 // Nblock and scomplex come from outside this method.
 void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
  for(int col=0; col<Nblock; ++col){
    AP[col] = Zero();
    for(int row=0; row<Nblock; ++row){
      AP[col] += scomplex(m(row,col))*X[row];
    }
  }
 }
 // Sum of norm2 over the first Nblock fields of P.
 // Nblock comes from outside this method.
 double normv(const std::vector<Field> &P){
  double total = 0.0;
  for(int blk=0; blk<Nblock; ++blk){
    total+=norm2(P[blk]);
  }
  return total;
 }
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQvec implementation:
 //--------------------------
@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  RealD sssum=0;
  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
  for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  for(int b=0;b<Nblock;b++) {
    Linop.HermOp(X[b], AD[b]);
    tmp[b] = B[b] - AD[b];  
    std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
  }
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
 // single input vec, single output vec.
 /////////////////////////////////////////////////////////////
 template <class Field>
 class ConjugateGradient : public OperatorFunction<Field> {
 public:
@ -57,10 +58,22 @@ public:
      ErrorOnNoConverge(err_on_no_conv)
  {};
  // Per-iteration hook, called from operator() as LogIteration(k,a,b).
  // The base implementation does nothing; it is virtual so that a derived
  // solver can record the iteration number k and the coefficients a and b.
  virtual void LogIteration(int k,RealD a,RealD b){
    //    std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
  };
  // Start-of-solve hook, called first thing in operator().
  // The base implementation only prints a marker line; it is virtual so
  // that a derived solver can reset its own per-solve state here.
  virtual void LogBegin(void){
    std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
  };
    void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
      this->LogBegin();
      GRID_TRACE("ConjugateGradient");
    GridStopWatch PreambleTimer;
    GridStopWatch ConstructTimer;
    GridStopWatch NormTimer;
    GridStopWatch AssignTimer;
    PreambleTimer.Start();
    psi.Checkerboard() = src.Checkerboard();
@ -70,14 +83,19 @@ public:
    //RealD b_pred;
    // Was doing copies
    ConstructTimer.Start();
    Field p  (src.Grid());
    Field mmp(src.Grid());
    Field r  (src.Grid());
    ConstructTimer.Stop();
    // Initial residual computation & set up
    NormTimer.Start();
    ssq = norm2(src);
    RealD guess = norm2(psi);
    NormTimer.Stop();
    assert(std::isnan(guess) == 0);
    AssignTimer.Start();
    if ( guess == 0.0 ) {
      r = src;
      p = r;
@ -89,6 +107,7 @@ public:
      a = norm2(p);
    }
    cp = a;
    AssignTimer.Stop();
    // Handle trivial case of zero src
    if (ssq == 0.){
@ -164,6 +183,7 @@ public:
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
      LogIteration(k,a,b);
      IterationTimer.Stop();
      if ( (k % 500) == 0 ) {
@ -220,6 +240,9 @@ public:
    	      <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
    SolverTimer.Stop();
    std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tConstruct  " << ConstructTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tNorm       " << NormTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tAssign     " << AssignTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tSolver     " << SolverTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
@ -233,5 +256,118 @@ public:
  }
 };
 ////////////////////////////////////////////////////////////////////////
 // Conjugate gradient that records the polynomial generated by a solve.
 //
 // The LogBegin/LogIteration hooks of ConjugateGradient are overridden to
 // store the coefficients a and b of every iteration and to track, as
 // coefficient vectors indexed by power of the operator, the polynomials
 // for p, r, A p and the accumulated solution x.  The bookkeeping follows
 // the zero-guess recurrence (p = r = src), which is what Solve() sets up.
 //
 // After a solve, PolyHermOp applies the recorded solution polynomial and
 // CGsequenceHermOp replays the recorded (a,b) recurrence on a new source.
 ////////////////////////////////////////////////////////////////////////
 template <class Field>
 class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
 public:
  // Optionally record the CG polynomial
  std::vector<double> ak;          // a passed to LogIteration, one entry per iteration
  std::vector<double> bk;          // b passed to LogIteration, one entry per iteration
  std::vector<double> poly_p;      // coefficients of the search direction p
  std::vector<double> poly_r;      // coefficients of the residual r
  std::vector<double> poly_Ap;     // coefficients of A p (poly_p shifted up by one power)
  std::vector<double> polynomial;  // coefficients of the accumulated solution x
 public:
  ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
  { };

  // psi = sum_n polynomial[n] * A^n src, with A applied through Linop.HermOp.
  // Requires a recorded polynomial, i.e. at least one logged iteration.
  void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  {
    // polynomial[0] is read unconditionally below
    assert(polynomial.size()>0);
    Field tmp(src.Grid());
    Field AtoN(src.Grid());
    AtoN = src;
    psi=AtoN*polynomial[0];
    for(size_t n=1;n<polynomial.size();n++){
      tmp = AtoN;               // HermOp needs distinct input and output fields
      Linop.HermOp(tmp,AtoN);   // AtoN <- A^n src
      psi = psi + polynomial[n]*AtoN;
    }
  }

  // Replay the recorded CG recurrence on src, starting from x = 0, p = r = src:
  //   x = x + a_k p ;  r = r - a_k A p ;  p = r + b_k p
  // for every recorded pair (ak[k],bk[k]).
  void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
  {
    Field Ap(src.Grid());
    Field r(src.Grid());
    Field p(src.Grid());
    p=src;
    r=src;
    x=Zero();
    x.Checkerboard()=src.Checkerboard();
    // ak and bk are pushed together in LogIteration, so they have equal length
    for(size_t k=0;k<ak.size();k++){
      x = x + ak[k]*p;
      Linop.HermOp(p,Ap);
      r = r - ak[k] * Ap;
      p = r + bk[k] * p;
    }
  }

  // Run the inherited CG from a zero initial guess, which is the starting
  // point assumed by the polynomial bookkeeping in LogIteration.
  void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  {
    psi=Zero();
    this->operator ()(Linop,src,psi);
  }

  // Reset all recorded state at the start of a solve: no coefficients, an
  // empty solution polynomial, and p = r = 1 (the source itself).
  virtual void LogBegin(void)
  {
    std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
    ak.resize(0);
    bk.resize(0);
    polynomial.resize(0);
    poly_Ap.resize(0);
    poly_p.assign(1,1.0);
    poly_r.assign(1,1.0);
  };

  // Record iteration k (counted from 1) with coefficients a and b, and
  // advance the polynomial representations of x, r and p by one step.
  virtual void LogIteration(int k,RealD a,RealD b)
  {
    // With zero guess,
    // p = r = src
    //
    // iterate:
    //   x =  x + a p
    //   r =  r - a A p
    //   p =  r + b p
    //
    // [0]
    // r = x
    // p = x
    // Ap=0
    //
    // [1]
    // Ap = A x + 0  ==> shift poly P right by 1 and add 0.
    // x  = x + a p  ==> add polynomials term by term 
    // r  = r - a A p  ==> add polynomials term by term
    // p  = r + b p  ==> add polynomials term by term
    //
    std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;

    // polynomial[k-1] is written and poly_p[0..k-1] is read below
    assert(k>=1);
    assert(poly_p.size()>=(size_t)k);

    ak.push_back(a);
    bk.push_back(b);

    //  Ap= right_shift(p)
    poly_Ap.resize(k+1);
    poly_Ap[0]=0.0;
    for(int i=0;i<k;i++){
      poly_Ap[i+1]=poly_p[i];
    }

    //  x = x + a p
    polynomial.resize(k);
    polynomial[k-1]=0.0;
    for(int i=0;i<k;i++){
      polynomial[i] = polynomial[i] + a * poly_p[i];
    }

    //  r = r - a Ap
    //  p = r + b p
    poly_r.resize(k+1);
    poly_p.resize(k+1);
    poly_r[k] = poly_p[k] = 0.0;
    for(int i=0;i<k+1;i++){
      poly_r[i] = poly_r[i] - a * poly_Ap[i];
      poly_p[i] = poly_r[i] + b * poly_p[i];
    }
  }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
-      
+      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
      if(norm < OuterLoopNormMult * stop){
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 	break;
      }
-      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -102,11 +102,11 @@ public:
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
-    // dynamic sized arrays on stack; 2d is a pain with vector
+    // remove dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
+    std::vector<RealD>  bs(nshift);
-    RealD  rsq[nshift];
+    std::vector<RealD>  rsq(nshift);
-    RealD  z[nshift][2];
+    std::vector<std::array<RealD,2> >  z(nshift);
-    int     converged[nshift];
+    std::vector<int>     converged(nshift);
    const int       primary =0;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
@ -123,11 +123,11 @@ public:
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
+    std::vector<RealD>  bs(nshift);
-    RealD  rsq[nshift];
+    std::vector<RealD>  rsq(nshift);
-    RealD  rsqf[nshift];
+    std::vector<RealD>  rsqf(nshift);
-    RealD  z[nshift][2];
+    std::vector<std::array<RealD,2> >  z(nshift);
-    int     converged[nshift];
+    std::vector<int>     converged(nshift);
    const int       primary =0;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -156,11 +156,11 @@ public:
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
+    std::vector<RealD>  bs(nshift);
-    RealD  rsq[nshift];
+    std::vector<RealD>  rsq(nshift);
-    RealD  rsqf[nshift];
+    std::vector<RealD>  rsqf(nshift);
-    RealD  z[nshift][2];
+    std::vector<std::array<RealD,2> >  z(nshift);
-    int     converged[nshift];
+    std::vector<int>     converged(nshift);
    const int       primary =0;
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h
@ -279,16 +279,16 @@ public:
      Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
      diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
      _sort.push(eval2,Nm);
-      Glog << "#Ritz value before shift: "<< std::endl;
+      //      Glog << "#Ritz value before shift: "<< std::endl;
      for(int i=0; i<Nm; ++i){
-	std::cout.precision(13);
+	//	std::cout.precision(13);
-	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	//	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	//	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
      }
      //----------------------------------------------------------------------
      if ( Nm>Nk ) {
-        Glog <<" #Apply shifted QR transformations "<<std::endl;
+	//        Glog <<" #Apply shifted QR transformations "<<std::endl;
        //int k2 = Nk+Nu;
        int k2 = Nk;
@ -326,7 +326,7 @@ public:
        Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
        diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
        _sort.push(eval2,Nk);
-	Glog << "#Ritz value after shift: "<< std::endl;
+	//	Glog << "#Ritz value after shift: "<< std::endl;
        for(int i=0; i<Nk; ++i){
 	  //	  std::cout.precision(13);
 	  //	  std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@ -644,7 +644,7 @@ private:
      //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
      k_start +=mrhs;
    }
-    Glog << "LinAlg "<< std::endl;
+    //    Glog << "LinAlg "<< std::endl;
    if (b>0) {
      for (int u=0; u<Nu; ++u) {
@ -678,7 +678,7 @@ private:
      }
      w_copy[u] = w[u];
    }
-    Glog << "LinAlg done"<< std::endl;
+    //    Glog << "LinAlg done"<< std::endl;
    // In block version, the steps 6 and 7 in Lanczos construction is
    // replaced by the QR decomposition of new basis block.
@ -691,15 +691,15 @@ private:
    }
    // re-orthogonalization for numerical stability
-    Glog << "Gram Schmidt"<< std::endl;
+    //    Glog << "Gram Schmidt"<< std::endl;
    orthogonalize(w,Nu,evec,R);
    // QR part
    for (int u=1; u<Nu; ++u) {
      orthogonalize(w[u],w,u);
    }
-    Glog << "Gram Schmidt done "<< std::endl;
+    //    Glog << "Gram Schmidt done "<< std::endl;
-    Glog << "LinAlg "<< std::endl;
+    //    Glog << "LinAlg "<< std::endl;
    for (int u=0; u<Nu; ++u) {
      //for (int v=0; v<Nu; ++v) {
      for (int v=u; v<Nu; ++v) {
@ -716,7 +716,7 @@ private:
 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
      }
    }
-    Glog << "LinAlg done "<< std::endl;
+    //    Glog << "LinAlg done "<< std::endl;
    if (b < Nm/Nu-1) {
      for (int u=0; u<Nu; ++u) {
@ -935,7 +935,7 @@ if (1){
         int Nu, int Nb, int Nk, int Nm,
         Eigen::MatrixXcd& M)
  {
-    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
+    //    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
    assert( Nk%Nu == 0 && Nm%Nu == 0 );
    assert( Nk <= Nm );
    M = Eigen::MatrixXcd::Zero(Nk,Nk);
@ -953,7 +953,7 @@ if (1){
        M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
      }
    }
-    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; 
+    //    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; 
  }
@ -963,7 +963,7 @@ if (1){
         int Nu, int Nb, int Nk, int Nm,
         Eigen::MatrixXcd& M)
  {
-    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
+    //    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
    assert( Nk%Nu == 0 && Nm%Nu == 0 );
    assert( Nk <= Nm );
@ -979,7 +979,7 @@ if (1){
        lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
      }
    }
-    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; 
+    //    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; 
  }
@ -988,7 +988,7 @@ if (1){
 		            RealD Dsh,
 		            Eigen::MatrixXcd& Qprod)
  {
-    Glog << "shiftedQRDecompEigen() begin" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() begin" << '\n'; 
    Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
    Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
    Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@ -1004,7 +1004,7 @@ if (1){
                        // lower triangular part used to represent series
                        // of Q sequence.
-    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; 
    // equivalent operation of Qprod *= Q
    //M = Eigen::MatrixXcd::Zero(Nm,Nm);
@ -1025,7 +1025,7 @@ if (1){
    Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
-    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; 
    for (int i=0; i<Nm; ++i) {
      for (int j=0; j<Nm-(Nu+1); ++j) {
        for (int k=0; k<Nu+1+j; ++k) {
@ -1033,7 +1033,7 @@ if (1){
        }
      }
    }
-    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; 
    for (int i=0; i<Nm; ++i) {
      for (int j=Nm-(Nu+1); j<Nm; ++j) {
        for (int k=0; k<Nm; ++k) {
@ -1041,7 +1041,7 @@ if (1){
        }
      }
    }
-    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; 
    //static int ntimes = 2;
    //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@ -1067,13 +1067,13 @@ if (1){
        Mtmp(j,i) = conj(Mtmp(i,j));
      }
    }
-    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; 
    for (int i=0; i<Nm; ++i) {
      Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
    }
-    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; 
+    //    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; 
    M = Mtmp;
    //M = Q.adjoint()*(M*Q);
@ -1085,7 +1085,7 @@ if (1){
    //  }
    //}
-    Glog << "shiftedQRDecompEigen() end" <<std::endl; 
+    //    Glog << "shiftedQRDecompEigen() end" <<std::endl; 
  }
  void exampleQRDecompEigen(void)
--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@ -60,6 +60,32 @@ public:
  }     
 };
 /////////////////////////////////////////////////////
 // NormalResidual: LinearFunction that, given `in`, seeds `res` with _Guess,
 // hands (M Mdag, in, res) to the supplied Hermitian solver and finally
 // returns out = Mdag res.
 // All three collaborators are held by reference and must outlive this object.
 /////////////////////////////////////////////////////
 template<class Field> class NormalResidual : public LinearFunction<Field>{
 private:
  SparseMatrixBase<Field> & _Matrix;          // supplies Mdag and is wrapped in the MMdag operator
  OperatorFunction<Field> & _HermitianSolver; // invoked as _HermitianSolver(MMdagOp,in,res)
  LinearFunction<Field>   & _Guess;           // invoked as _Guess(in,res) before the solve
 public:
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
 NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
 		 LinearFunction<Field> &Guess) 
   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
  // in  : right-hand side
  // out : overwritten with Mdag applied to the solver result
  void operator() (const Field &in, Field &out){
    // Only one work field is needed; the former second temporary was never used.
    Field res(in.Grid());
    MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
    _Guess(in,res);                    // initial guess for the solve
    _HermitianSolver(MMdagOp,in,res);  // M Mdag res = in ;
    _Matrix.Mdag(res,out);             // out = Mdag res
  }     
 };
 template<class Field> class HPDSolver : public LinearFunction<Field> {
 private:
  LinearOperatorBase<Field> & _Matrix;
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@ -20,7 +20,7 @@ template<class Field> class PowerMethod
    RealD evalMaxApprox = 0.0; 
    auto src_n = src; 
    auto tmp = src; 
-    const int _MAX_ITER_EST_ = 100; 
+    const int _MAX_ITER_EST_ = 200; 
    for (int i=0;i<_MAX_ITER_EST_;i++) { 
@ -30,18 +30,17 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
-      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
+      std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
-      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
+      //      if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) { 
- 	evalMaxApprox = na; 
+	// 	evalMaxApprox = na; 
-	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+	// 	return evalMaxApprox; 
- 	return evalMaxApprox; 
+      //      } 
      } 
      evalMaxApprox = na; 
      src_n = tmp;
    }
-    assert(0);
+    std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
-    return 0;
+    return evalMaxApprox;
  }
 };
 }
--- a/Grid/algorithms/iterative/PowerSpectrum.h
+++ b/Grid/algorithms/iterative/PowerSpectrum.h
@ -0,0 +1,76 @@
 #pragma once
 namespace Grid {
 // Indicator function of the open interval (lo,hi): evaluates to 1.0 for
 // lo < x < hi and to 0.0 otherwise (both end points are excluded).
 class Band
 {
  RealD lo, hi;
 public:
  Band(RealD _lo,RealD _hi) : lo(_lo), hi(_hi) { }
  RealD operator() (RealD x){
    const bool inside = (x>lo) && (x<hi);
    return inside ? 1.0 : 0.0;
  }
 };
 // PowerSpectrum: for each band [previous edge, ranges[b]] (the first band
 // starts at 0) builds a Chebyshev polynomial from the Band indicator on
 // [0, ranges.back()], applies it through HermOp to src and logs
 // norm2(result)/norm2(src), followed by the sum over bands.
 // NOTE(review): ranges is presumably sorted ascending so that ranges.back()
 // is the largest edge -- confirm against callers.
 class PowerSpectrum
 { 
 public: 
  // Scales v in place by 1/sqrt(norm2(v)) and returns sqrt(norm2(v)) as it
  // was on entry.  No guard against a zero norm.
  template<typename T>  static RealD normalise(T& v) 
  {
    RealD nn = norm2(v);
    nn = sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }
  std::vector<RealD> ranges;  // upper edge of each band; last entry is used as the upper bound of the fit interval
  std::vector<int> order;     // Chebyshev order used for band b; needs one entry per band
  // Both arguments are copied, so const references (and temporaries) are accepted.
  PowerSpectrum(  const std::vector<RealD> &bins, const std::vector<int> &_order ) : ranges(bins), order(_order)  { };
  // Logs the per-band and total power fractions of src.
  // Always returns 0; the results are reported only through GridLogMessage.
  template<class Field>
  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src) 
  { 
    int N=ranges.size();
    // ranges[N-1] and order[b] below would be out of bounds otherwise.
    if ( N==0 || order.size() < ranges.size() ) {
      std::cout << GridLogMessage << " PowerSpectrum: need at least one band and one order per band; nothing computed"<<std::endl;
      return 0;
    }
    RealD hi = ranges[N-1];
    RealD lo_band = 0.0;
    RealD hi_band;
    RealD nn=norm2(src);
    RealD ss=0.0;
    Field tmp = src;
    for(int b=0;b<N;b++){
      hi_band = ranges[b];
      Band Notch(lo_band,hi_band);
      Chebyshev<Field> polynomial;
      polynomial.Init(0.0,hi,order[b],Notch);
      polynomial.JacksonSmooth();
      polynomial(HermOp,src,tmp) ;
      // Evaluate the reduction once and reuse it for both the sum and the log line.
      RealD p=norm2(tmp);
      ss=ss+p;
      std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<p/nn<<std::endl;
      lo_band=hi_band;
    }
    std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
    std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
    return 0;
  };
 };
 }
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@ -74,7 +74,7 @@ public:
  void operator() (const Field &src, Field &psi){
-    psi=Zero();
+    //    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 NAMESPACE_BEGIN(Grid);
 inline RealD AggregatePowerLaw(RealD x)
@ -124,6 +126,53 @@ public:
    }
  }
  // Fills subspace[0..nn-1]: each vector starts as unit-norm gaussian noise and
  // is then passed three times through the GCR solver (solve on the current
  // vector from a zero guess, renormalise the solution, repeat).  The matrix
  // element <v|Op|v> is logged before and after the filtering.
  // NOTE(review): the GCR constructor arguments (0.001,30,...,12,12) are
  // presumably tolerance / iteration limits / restart lengths -- confirm
  // against the solver's constructor.
  virtual void CreateSubspaceGCR(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
  {
    const int npass = 3; // number of successive solves applied to every vector
    TrivialPrecon<FineField> simple_fine;
    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
    FineField noise(FineGrid);
    FineField src(FineGrid);
    FineField guess(FineGrid);
    FineField Mn(FineGrid);
    for(int vec=0; vec<nn; vec++){
      subspace[vec] = Zero();
      // Fresh unit-norm noise vector
      gaussian(RNG,noise);
      {
        RealD scale = std::pow(norm2(noise),-0.5);
        noise = noise*scale;
      }
      DiracOp.Op(noise,Mn);
      std::cout<<GridLogMessage << "noise   ["<<vec<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
      for(int pass=0; pass<npass; pass++){
        //  void operator() (const Field &src, Field &psi){
 #if 1
        std::cout << GridLogMessage << " inverting on noise "<<std::endl;
        src = noise;
        guess=Zero();
        GCR(src,guess);
        subspace[vec] = guess;
 #else
        std::cout << GridLogMessage << " inverting on zero "<<std::endl;
        src=Zero();
        guess = noise;
        GCR(src,guess);
        subspace[vec] = guess;
 #endif
        // The renormalised solution becomes the source of the next pass
        noise = subspace[vec];
        RealD scale = std::pow(norm2(noise),-0.5);
        noise = noise*scale;
      }
      DiracOp.Op(noise,Mn);
      std::cout<<GridLogMessage << "filtered["<<vec<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
      subspace[vec]   = noise;
    }
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
@ -160,14 +209,21 @@ public:
    int b =0;
    {
      ComplexD ip;
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      ip= innerProduct(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      hermop.AdjOp(Mn,tmp); 
      ip = innerProduct(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      b++;
    }
@ -213,8 +269,18 @@ public:
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
 	  ComplexD ip;
 	  hermop.Op(Mn,tmp);
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  ip= innerProduct(Mn,tmp); 
 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
 	  hermop.AdjOp(Mn,tmp); 
 	  ip = innerProduct(Mn,tmp); 
 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
 	  b++;
 	}
@ -228,6 +294,70 @@ public:
    }
    assert(b==nn);
  }
  // Builds subspace[0..nn-1] from a single gaussian noise vector:
  //   subspace[0] = normalised Chebyshev(lo1,hi,orderfilter) applied to the noise
  //   subspace[n] = Chebyshev(lo2,hi,orderstep) applied to subspace[n-1], with the
  //                 components along subspace[0..n-1] subtracted one after another
  //                 and the remainder normalised.
  // norm2(hermop.Op(v)) and norm2(v) are logged for every vector produced.
  // No guard is applied if a remainder has zero norm.
  virtual void CreateSubspacePolyCheby(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo1,
 				       int orderfilter,
 				       double lo2,
 				       int orderstep)
  {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
    // New normalised noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5); 
    noise=noise*scale;
    std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
    // Initial matrix element
    hermop.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
    int b =0;
    {
      // Filter
      // Report the order this filter is actually built with (orderfilter, not orderstep).
      std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl;
      Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
    }
    // Generate a full sequence of Chebyshevs
    for(int n=1;n<nn;n++){
      std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
      Chebyshev<FineField> Cheb(lo2,hi,orderstep);
      Cheb(hermop,subspace[n-1],Mn);
      // Remove the components along all previously accepted vectors
      for(int m=0;m<n;m++){
        ComplexD c = innerProduct(subspace[m],Mn);
        Mn = Mn - c*subspace[m];
      }
      // normalise
      scale = std::pow(norm2(Mn),-0.5);
      Mn=Mn*scale;
      subspace[n]=Mn;
      hermop.Op(Mn,tmp); 
      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
      std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
    }
  }
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
--- a/Grid/algorithms/multigrid/CoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h
@ -99,7 +99,7 @@ public:
  CoarseMatrix AselfInvEven;
  CoarseMatrix AselfInvOdd;
-  Vector<RealD> dag_factor;
+  deviceVector<RealD> dag_factor;
  ///////////////////////
  // Interface
@ -124,9 +124,13 @@ public:
    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
-    Vector<Aview> AcceleratorViewContainer;
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    for(int p=0;p<geom.npoint;p++) {
      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
    }
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
@ -161,7 +165,7 @@ public:
      coalescedWrite(out_v[ss](b),res);
      });
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  };
  void Mdag (const CoarseVector &in, CoarseVector &out)
@ -190,9 +194,14 @@ public:
    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
    for(int p=0;p<geom.npoint;p++) {
      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
    }
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
@ -201,10 +210,10 @@ public:
    int osites=Grid()->oSites();
-    Vector<int> points(geom.npoint, 0);
+    deviceVector<int> points(geom.npoint);
-    for(int p=0; p<geom.npoint; p++)
+    for(int p=0; p<geom.npoint; p++) { 
-      points[p] = geom.points_dagger[p];
+      acceleratorPut(points[p],geom.points_dagger[p]);
-
+    }
    auto points_p = &points[0];
    RealD* dag_factor_p = &dag_factor[0];
@ -236,7 +245,7 @@ public:
      coalescedWrite(out_v[ss](b),res);
      });
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  void MdirComms(const CoarseVector &in)
@ -251,8 +260,14 @@ public:
    out.Checkerboard() = in.Checkerboard();
    typedef LatticeView<Cobj> Aview;
-    Vector<Aview> AcceleratorViewContainer;
+
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
    for(int p=0;p<geom.npoint;p++) {
      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
    }
    Aview *Aview_p = & AcceleratorViewContainer[0];
    autoView( out_v , out, AcceleratorWrite);
@ -285,7 +300,7 @@ public:
      }
      coalescedWrite(out_v[ss](b),res);
    });
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
@ -469,14 +484,20 @@ public:
    // determine in what order we need the points
    int npoint = geom.npoint-1;
-    Vector<int> points(npoint, 0);
+    deviceVector<int> points(npoint);
-    for(int p=0; p<npoint; p++)
+    for(int p=0; p<npoint; p++) {
-      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
+      int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
-
+      acceleratorPut(points[p], val);
    }
    auto points_p = &points[0];
-    Vector<Aview> AcceleratorViewContainer;
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
    for(int p=0;p<geom.npoint;p++) {
      hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
    }
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
@ -539,7 +560,7 @@ public:
      });
    }
-    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
@ -590,11 +611,13 @@ public:
    }
    // GPU readable prefactor
    std::vector<RealD> h_dag_factor(nbasis*nbasis);
    thread_for(i, nbasis*nbasis, {
      int j = i/nbasis;
      int k = i%nbasis;
-      dag_factor[i] = dag_factor_eigen(j, k);
+      h_dag_factor[i] = dag_factor_eigen(j, k);
    });
    acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
  }
  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@ -69,7 +69,7 @@ public:
  }
  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
@ -174,19 +174,10 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef ACCELERATOR_CSHIFT
+template<class T> using hostVector          = std::vector<T,alignedAllocator<T> >;           // Needs autoview
-// Cshift on device
+template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // Really want to deprecate
-template<class T> using cshiftAllocator = devAllocator<T>;
+template<class T> using uvmVector           = std::vector<T,uvmAllocator<T> >;               // auto migrating page
-#else
+template<class T> using deviceVector        = std::vector<T,devAllocator<T> >;               // device vector
 // Cshift on host
 template<class T> using cshiftAllocator = std::allocator<T>;
 #endif
 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector    = std::vector<T,devAllocator<T> >;
 template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
 template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
 /*
 template<class T> class vecView
@ -197,8 +188,9 @@ template<class T> class vecView
  ViewMode mode;
  void * cpu_ptr;
 public:
  // Rvalue accessor
  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
-  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
+  vecView(Vector<T> &refer_to_me,ViewMode _mode)
  {
    cpu_ptr = &refer_to_me[0];
    size = refer_to_me.size();
@ -214,22 +206,12 @@ template<class T> class vecView
  }
 };
-template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
+template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 {
  vecView<T> ret(vec,_mode); // does the open
  return ret;                // must be closed
 }
 // Little autoscope assister
 template<class View> 
 class VectorViewCloser
 {
  View v;  // Take a copy of view and call view close when I go out of scope automatically
 public:
  VectorViewCloser(View &_v) : v(_v) {};
  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
 };
 #define autoVecView(v_v,v,mode)					\
  auto v_v = VectorView(v,mode);				\
  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -1,16 +1,15 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];
-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug  << print_buffer << std::endl;
 //#define dprintf(...) 
-
+//#define mprintf(...) 
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
    AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx",
 	  (uint64_t)AccCache.bytes,
 	  (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
  if(bytes>=DeviceMaxBytes) {
    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
  }
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error
  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
  } else {
    assert(0);
  }
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
    LRUremove(AccCache);
  }
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
--- a/Grid/allocator/MemoryStats.cc
+++ b/Grid/allocator/MemoryStats.cc
@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
-  uint64_t pagedata[npages];
+  std::vector<uint64_t> pagedata(npages);
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
-  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
+  ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@ -82,6 +82,7 @@ public:
  bool _isCheckerBoarded; 
  int        LocallyPeriodic;
  Coordinate _checker_dim_mask;
  int              _checker_dim;
 public:
@ -91,7 +92,6 @@ public:
  ////////////////////////////////////////////////////////////////
  virtual int CheckerBoarded(int dim) =0;
  virtual int CheckerBoard(const Coordinate &site)=0;
  virtual int CheckerDim(void){ return 0; };
  virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
  virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
  virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
--- a/Grid/cartesian/Cartesian_full.h
+++ b/Grid/cartesian/Cartesian_full.h
@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
 public:
  int dummy;
-  Coordinate _checker_dim_mask;
+  //  Coordinate _checker_dim_mask;
  virtual int  CheckerBoardFromOindexTable (int Oindex) {
    return 0;
  }
@ -106,6 +106,7 @@ public:
    _rdimensions.resize(_ndimension);
    _simd_layout.resize(_ndimension);
    _checker_dim_mask.resize(_ndimension);;
    _checker_dim = -1;
    _lstart.resize(_ndimension);
    _lend.resize(_ndimension);
--- a/Grid/cartesian/Cartesian_red_black.h
+++ b/Grid/cartesian/Cartesian_red_black.h
@ -57,10 +57,10 @@ class GridRedBlackCartesian : public GridBase
 {
 public:
  //  Coordinate _checker_dim_mask;
-  int              _checker_dim;
+  //  int              _checker_dim;
  std::vector<int> _checker_board;
-  virtual int CheckerDim(void){ return _checker_dim; };
+  virtual int isCheckerBoarded(void) const { return 1; };
  virtual int CheckerBoarded(int dim){
    if( dim==_checker_dim) return 1;
    else return 0;
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  GlobalSumP2P(c);
 }
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  GlobalSumP2P(c);
 }
 #else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  GlobalSumVector((float *)&c,2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
  GlobalSumVector((float *)c,2*N);
 }
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  GlobalSumVector((double *)&c,2);
 }
 #endif
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
  GlobalSumVector((float *)c,2*N);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>
 #define NVLINK_GET
 NAMESPACE_BEGIN(Grid);
 extern bool Stencil_force_mpi ;
@ -128,6 +130,35 @@ public:
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
  template<class obj> void GlobalSumP2P(obj &o)
  {
    std::vector<obj> column;
    obj accum = o;
    int source,dest;
    for(int d=0;d<_ndimension;d++){
      column.resize(_processors[d]);
      column[0] = accum;
      std::vector<MpiCommsRequest_t> list;
      for(int p=1;p<_processors[d];p++){
 	ShiftedRanks(d,p,source,dest);
 	SendToRecvFromBegin(list,
 			    &column[0],
 			    dest,
 			    &column[p],
 			    source,
 			    sizeof(obj),d*100+p);
      }
      if (!list.empty()) // avoid triggering assert in comms == none
 	CommsComplete(list);
      for(int p=1;p<_processors[d];p++){
 	accum = accum + column[p];
      }
    }
    Broadcast(0,accum);
    o=accum;
  }
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
@ -138,8 +169,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 			   void *xmit,
 			   int dest,
 			   void *recv,
@ -158,6 +189,17 @@ public:
 			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);
  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 				      void *xmit,
 				      int xmit_to_rank,int do_xmit,
 				      void *recv,
 				      int recv_from_rank,int do_recv,
 				      int xbytes,int rbytes,int dir);
  // Could do a PollHtoD and have a CommsMerge dependence
  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 ////////////////////////////////////////////
@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
    }
  }
 }
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(float &f){
  CartesianCommunicator::GlobalSumP2P(f);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  CartesianCommunicator::GlobalSumP2P(d);
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(ierr==0);
  list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
  int nreq=list.size();
@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int from,
 					   int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
+  std::vector<MpiCommsRequest_t> reqs(0);
  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
  int myrank = _processor;
  int ierr;
@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);
  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -381,12 +387,25 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
-#undef NVLINK_GET // Define to use get instead of put DMA
+
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
 void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
 // Prepare phase for the ACCELERATOR_AWARE_MPI build.
 // This variant ignores all of its arguments, leaves `list` untouched and
 // reports 0.0 bytes.
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int dest,int dox,
 							   void *recv,
 							   int from,int dor,
 							   int xbytes,int rbytes,int dir)
 {
   return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@ -419,15 +438,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
 #ifdef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
 #endif
  }
  if (dox) {
    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@ -435,17 +448,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
 #ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 #endif
    }
  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  int nreq=list.size();
@ -453,12 +463,326 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
  acceleratorCopySynchronise();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
  this->StencilBarrier(); 
 }
 #else /* NOT     ... ACCELERATOR_AWARE_MPI */
 ///////////////////////////////////////////
 // Pipeline mode through host memory
 ///////////////////////////////////////////
  /*
   * In prepare (phase 1):
   * PHASE 1: (prepare)
   * - post MPI receive buffers asynch
   * - post device - host send buffer transfer asynch
   * PHASE 2: (Begin)
   * - complete all copies
   * - post MPI send asynch
   * - post device - device transfers
   * PHASE 3: (Complete)
   * - MPI_waitall
   * - host-device transfers
   *
   *********************************
   * NB could split this further:
   *--------------------------------
   * PHASE 1: (Prepare)
   * - post MPI receive buffers asynch
   * - post device - host send buffer transfer asynch
   * PHASE 2: (BeginInterNode)
   * - complete all copies 
   * - post MPI send asynch
   * PHASE 3: (BeginIntraNode)
   * - post device - device transfers
   * PHASE 4: (Complete)
   * - MPI_waitall
   * - host-device transfers asynch
   * - (complete all copies) 
   */
 /*
 * Phase 1 of the host-staged halo exchange (build without ACCELERATOR_AWARE_MPI).
 *
 * For a receive (dor) on the MPI path: allocates a host buffer, posts an
 * MPI_Irecv into it and appends an InterNodeRecv packet to `list`.
 * For a send (dox) on the MPI path: allocates a host buffer, starts an
 * asynchronous device-to-host copy of `xmit` into it and appends an
 * InterNodeXmit packet carrying the arguments needed to send it later.
 * No MPI send is posted here.
 *
 * The MPI path is taken when the peer's ShmRanks entry is MPI_UNDEFINED or
 * when Stencil_force_mpi is set; otherwise this function does nothing for
 * that direction.
 *
 * Returns the number of bytes for which an MPI receive was posted; bytes
 * staged for sending are not counted (see the commented-out accumulation).
 */
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int dest,int dox,
 							   void *recv,
 							   int from,int dor,
 							   int xbytes,int rbytes,int dir)
 {
 /*
 * Bring sequence from Stencil.h down to lower level.
 * Assume using XeLink is ok
 */  
   // Select the halo communicator for this direction
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
   MPI_Request xrq; // only referenced by the commented-out MPI_Isend below
   MPI_Request rrq;
   int ierr;
   int gdest = ShmRanks[dest];
   int gfrom = ShmRanks[from];
   int gme   = ShmRanks[_processor];
   // Exchanging with ourselves is not supported by this routine
   assert(dest != _processor);
   assert(from != _processor);
   assert(gme  == ShmRank);
   double off_node_bytes=0.0;
   int tag;
   void * host_recv = NULL;
   void * host_xmit = NULL;
   /*
    * PHASE 1: (Prepare)
    * - post MPI receive buffers asynch
    * - post device - host send buffer transfer asynch
    */
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       // Tag is built from the sender's rank, matching the send-side tag below
       tag= dir+from*32;
       host_recv = this->HostBufferMalloc(rbytes);
       ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
       assert(ierr==0);
       // Record the posted receive: data lands in host_recv and is destined for recv
       CommsRequest_t srq;
       srq.PacketType = InterNodeRecv;
       srq.bytes      = rbytes;
       srq.req        = rrq;
       srq.host_buf   = host_recv;
       srq.device_buf = recv;
       list.push_back(srq);
       off_node_bytes+=rbytes;
     }
   }
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
       host_xmit = this->HostBufferMalloc(xbytes);
       CommsRequest_t srq;
       // Start staging the send data on the host; the returned event is kept in the packet
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
       // The send itself is not posted here; tag/dest/commdir are stored instead
       //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       //      assert(ierr==0);
       //      off_node_bytes+=xbytes;
       srq.PacketType = InterNodeXmit;
       srq.bytes      = xbytes;
       //      srq.req        = xrq;
       srq.host_buf   = host_xmit;
       srq.device_buf = xmit;
       srq.tag        = tag;
       srq.dest       = dest;
       srq.commdir    = commdir;
       list.push_back(srq);
     }
   }
   return off_node_bytes;
 }
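 /*
 * In the interest of better pipelining, poll for completion rather than block:
 * - PollIRecv: as each MPI_Irecv completes, start its host-to-device copy
 * - PollDtoH : as each device-to-host copy completes, start its MPI_Isend
 */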
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
 {
  int pending = 0;
  do {
    pending = 0;
    for(int idx = 0; idx<list.size();idx++){
      if ( list[idx].PacketType==InterNodeRecv ) {
 	int flag = 0;
 	MPI_Status status;
 	int ierr = MPI_Test(&list[idx].req,&flag,&status);
 	assert(ierr==0);
 	if ( flag ) {
 	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
 	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
 	  list[idx].PacketType=InterNodeReceiveHtoD;
 	} else {
 	  pending ++;
 	}
      }
    }
    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
  } while ( pending );
 }
 void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
 {
  int pending = 0;
  do {
    pending = 0;
    for(int idx = 0; idx<list.size();idx++){
      if ( list[idx].PacketType==InterNodeXmit ) {
 	if ( acceleratorEventIsComplete(list[idx].ev) ) {
 	  void *host_xmit = list[idx].host_buf;
 	  uint32_t xbytes = list[idx].bytes;
 	  int dest        = list[idx].dest;
 	  int tag         = list[idx].tag;
 	  int commdir     = list[idx].commdir;
 	  ///////////////////
 	  // Send packet
 	  ///////////////////
 	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
 	  MPI_Request xrq;
 	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 	  assert(ierr==0);
 	  list[idx].req        = xrq; // Update the MPI request in the list
 	  list[idx].PacketType=InterNodeXmitISend;
 	} else {
 	  // not done, so return to polling loop
 	  pending++;
 	}
      }
    }
  } while (pending);
 }  
 /*
 * Phase 2 of the host-staged halo exchange (build without ACCELERATOR_AWARE_MPI).
 *
 * Starts the intra-node device-to-device transfer for this direction and
 * appends a packet describing it to `list`:
 *  - with NVLINK_GET defined: if `dor` is set and the peer `from` is on the
 *    intra-node path, copy rbytes from the peer's translated `xmit` buffer
 *    into our `recv` buffer (IntraNodeRecv);
 *  - otherwise: if `dox` is set and the peer `dest` is on the intra-node
 *    path, copy xbytes from our `xmit` buffer into the peer's translated
 *    `recv` buffer (IntraNodeXmit).
 * The intra-node path is taken when the peer's ShmRanks entry is not
 * MPI_UNDEFINED and Stencil_force_mpi is not set.
 *
 * No MPI call is made here and nothing is waited on.
 * Returns 0.0: this phase never accumulates off-node bytes.
 */
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
 							 void *recv,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
   int gdest = ShmRanks[dest];
   int gfrom = ShmRanks[from];
   int gme   = ShmRanks[_processor];
   // Exchanging with ourselves is not supported by this routine
   assert(dest != _processor);
   assert(from != _processor);
   assert(gme  == ShmRank);
   double off_node_bytes=0.0;
   ////////////////////////////////
   // Receives already posted
   // Copies already started
   ////////////////////////////////
   /*  
    * PHASE 2: (Begin)
    * - post device - device transfers
    */
 #ifdef NVLINK_GET
   if ( dor ) {
     if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
       // Intranode: pull from the peer's send buffer into our receive buffer
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
       assert(shm!=NULL);
       CommsRequest_t srq;
       srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
       srq.PacketType = IntraNodeRecv;
       // Describe the copy actually issued: rbytes landing in recv,
       // consistent with how InterNodeRecv packets are recorded.
       srq.bytes      = rbytes;
       srq.host_buf   = NULL;   // no host staging on the intra-node path
       srq.device_buf = recv;
       srq.tag        = -1;     // no MPI message is associated with this packet
       srq.dest       = dest;
       srq.commdir    = dir;
       list.push_back(srq);
     }
   }  
 #else
   if (dox) {
     if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
       // Intranode: push our send buffer into the peer's receive buffer
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
       assert(shm!=NULL);
       CommsRequest_t srq;
       srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
       srq.PacketType = IntraNodeXmit;
       srq.bytes      = xbytes;
       srq.host_buf   = NULL;   // no host staging on the intra-node path
       srq.device_buf = xmit;
       srq.tag        = -1;     // no MPI message is associated with this packet
       srq.dest       = dest;
       srq.commdir    = dir;
       list.push_back(srq);
     }
   }
 #endif
   return off_node_bytes;
 }
 /*
 * Phase 3 of the host-staged halo exchange (build without ACCELERATOR_AWARE_MPI).
 *
 * Synchronises the accelerator copy queue, waits for every MPI send that
 * was posted (packets in state InterNodeXmitISend), then empties `list`
 * and releases all host staging buffers. When NVLINK_GET is not defined a
 * StencilBarrier() follows.
 *
 * Packets still in any other state are not waited on here.
 * NOTE(review): this appears to rely on StencilSendToRecvFromPollDtoH and
 * StencilSendToRecvFromPollIRecv having been run on `list` beforehand, so
 * that no InterNodeXmit / InterNodeRecv packets remain -- confirm against callers.
 *
 * `dir` is not used by this variant.
 */
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
   std::vector<MPI_Status> status;
   std::vector<MPI_Request> MpiRequests;
   // Collect (copies of) the requests of all posted sends
   for(int r=0;r<list.size();r++){
     // Must check each Send buf is clear to reuse
     if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
     //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
   }
   int nreq=MpiRequests.size();
   // Skip the wait entirely when nothing was sent, so &MpiRequests[0] is never taken on an empty vector
   if (nreq>0) {
     status.resize(MpiRequests.size());
     int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
     assert(ierr==0);
   }
   //  for(int r=0;r<nreq;r++){
   //    if ( list[r].PacketType==InterNodeRecv ) {
   //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
   //    }
   //  }
   list.resize(0);               // Delete the list
   this->HostBufferFreeAll();    // Clean up the buffer allocs
 #ifndef NVLINK_GET
   this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
 #endif   
 }
 #endif
 ////////////////////////////////////////////
 // END PIPELINE MODE / NO CUDA AWARE MPI
 ////////////////////////////////////////////
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
 void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
 // Prepare phase of the stencil exchange in Communicator_none.cc:
 // ignores all arguments, leaves `list` untouched and reports 0.0 bytes.
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int xmit_to_rank,int dox,
 							   void *recv,
 							   int recv_from_rank,int dor,
 							   int xbytes,int rbytes, int dir)
 {
   return 0.0;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,int dox,
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
 #if defined (GRID_COMMS_MPI3) 
 typedef MPI_Comm    Grid_MPI_Comm;
 typedef MPI_Request MpiCommsRequest_t;
 #ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
 #else
 /*
 * Enable state transitions as each packet flows.
 */
 // State of a halo packet as it moves through the host-staged pipeline
 // (used when ACCELERATOR_AWARE_MPI is not defined).
 enum PacketType_t {
   FaceGather,           // NOTE(review): not referenced in the communicator code reviewed here
   InterNodeXmit,        // device-to-host copy of the send data started; MPI send not yet posted
   InterNodeRecv,        // MPI_Irecv posted into a host buffer
   IntraNodeXmit,        // device-to-device copy into the peer's buffer started
   IntraNodeRecv,        // device-to-device copy from the peer's buffer started
   InterNodeXmitISend,   // MPI_Isend of the host buffer posted
   InterNodeReceiveHtoD  // MPI receive completed; host-to-device copy started
 };
 /*
 *Package arguments needed for various actions along packet flow
 */
 // One in-flight halo packet. Not every field is set for every PacketType;
 // fields that a given stage does not assign are left uninitialised.
 typedef struct {
   PacketType_t PacketType;   // current pipeline state of this packet
   void *host_buf;            // host staging buffer, or NULL for intra-node packets
   void *device_buf;          // device-side buffer associated with the packet
   int dest;                  // destination rank passed to MPI_Isend
   int tag;                   // MPI tag passed to MPI_Isend (-1 for intra-node packets)
   int commdir;               // for InterNodeXmit: index into communicator_halo used for the send
   unsigned long bytes;       // size of the transfer in bytes
   acceleratorEvent_t ev;     // event returned by the asynchronous accelerator copy
   MpiCommsRequest_t req;     // MPI request of the posted Irecv / Isend
 } CommsRequest_t;
 #endif
 #else 
 typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
 #else
 #ifdef HAVE_NUMAIF_H
  #warning " Using NUMAIF "
 #include <numaif.h>
 #endif 
 #endif 
 #include <syscall.h>
 #endif
@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
+  // printf("Host buffer allocate for GPU non-aware MPI\n");
 #if 0
  HostCommBuf= acceleratorAllocHost(bytes);
 #else 
  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
 #ifdef HAVE_NUMAIF_H
  #warning "Moving host buffers to specific NUMA domain"
  int numa;
  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
  if(numa_name) {
    unsigned long page_size = sysconf(_SC_PAGESIZE);
    numa = atoi(numa_name);
    unsigned long page_count = bytes/page_size;
    std::vector<void *> pages(page_count);
    std::vector<int>    nodes(page_count,numa);
    std::vector<int>    status(page_count,-1);
    for(unsigned long p=0;p<page_count;p++){
      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
    }
    int ret = move_pages(0,
 			 page_count,
 			 &pages[0],
 			 &nodes[0],
 			 &status[0],
 			 MPOL_MF_MOVE);
    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
    if (ret) perror(" move_pages failed for reason:");
  }
 #endif  
  acceleratorPin(HostCommBuf,bytes);
 #endif  
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
--- a/Grid/cshift/Cshift.h
+++ b/Grid/cshift/Cshift.h
@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #endif 
 NAMESPACE_BEGIN(Grid);
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
 auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
 {
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 extern std::vector<std::pair<int,int> > Cshift_table; 
-extern commVector<std::pair<int,int> > Cshift_table_device; 
+extern deviceVector<std::pair<int,int> > Cshift_table_device; 
 inline std::pair<int,int> *MapCshiftTable(void)
 {
  // GPU version
 #ifdef ACCELERATOR_CSHIFT    
  uint64_t sz=Cshift_table.size();
  if (Cshift_table_device.size()!=sz )    {
    Cshift_table_device.resize(sz);
@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
 			  sizeof(Cshift_table[0])*sz);
  return &Cshift_table_device[0];
 #else 
  return &Cshift_table[0];
 #endif
  // CPU version uses the identity map
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];
@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  {
    auto buffer_p = & buffer[0];
    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
 #else
    autoView(rhs_v , rhs, CpuRead);
    thread_for(i,ent,{
      buffer_p[table[i].first]=rhs_v[table[i].second];
    });
 #endif
  }
 }
@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int n1=rhs.Grid()->_slice_stride[dimension];
  if ( cbmask ==0x3){
 #ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
 #else
    autoView(rhs_v , rhs, CpuRead);
    thread_for2d(n,e1,b,e2,{
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
 #endif
  } else { 
    Coordinate rdim=rhs.Grid()->_rdimensions;
    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
 #else
    autoView(rhs_v , rhs, CpuRead);
    thread_for2d(n,e1,b,e2,{
 	Coordinate coor;
 	int o=n*n1;
 	int oindex = o+b;
       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
 	int ocb=1<<cb;
 	int offset = b+n*e2;
 	if ( ocb & cbmask ) {
 	  vobj temp =rhs_v[so+o+b];
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
 #endif
  }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];
@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  {
    auto buffer_p = & buffer[0];
    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
 #else
    autoView( rhs_v, rhs, CpuWrite);
    thread_for(i,ent,{
      rhs_v[table[i].first]=buffer_p[table[i].second];
    });
 #endif
  }
 }
@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  if(cbmask ==0x3 ) {
    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
    int _slice_block = rhs.Grid()->_slice_block[dimension];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v , rhs, AcceleratorWrite);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
      });
 #else
    autoView( rhs_v , rhs, CpuWrite);
    thread_for2d(n,e1,b,e2,{
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
    });
 #endif
  } else { 
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  {
    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
 #else
    autoView(rhs_v , rhs, CpuRead);
    autoView(lhs_v , lhs, CpuWrite);
    thread_for(i,ent,{
      lhs_v[table[i].first]=rhs_v[table[i].second];
    });
 #endif
  }
 }
@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  {
    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
    accelerator_for(i,ent,1,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
 #else
    autoView( rhs_v, rhs, CpuRead);
    autoView( lhs_v, lhs, CpuWrite);
    thread_for(i,ent,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
 #endif
  }
 }
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid); 
-
+const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
  typedef typename vobj::vector_type vector_type;
@ -65,7 +65,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
    Cshift_comms(ret,rhs,dimension,shift);
  }
  t1=usecond();
-  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }
@ -94,7 +94,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
-  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@ -104,8 +104,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
 }
 #define ACCELERATOR_CSHIFT_NO_COPY
 #ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
@ -125,8 +123,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@ -158,18 +160,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      tcomms-=usecond();
-      //      grid->Barrier();
+      grid->Barrier();
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
 #else
      // bouncy bouncy
      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
      grid->SendToRecvFrom((void *)&hsend_buf[0],
 			   xmit_to_rank,
 			   (void *)&hrecv_buf[0],
 			   recv_from_rank,
 			   bytes);
      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
 #endif
      xbytes+=bytes;
-      //      grid->Barrier();
+      grid->Barrier();
      tcomms+=usecond();
      tscatter-=usecond();
@ -177,13 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      tscatter+=usecond();
    }
  }
-  /*
+  if (Cshift_verbose){
    std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
+  }
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -224,8 +239,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);
-  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
@ -233,6 +248,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
 #ifndef ACCELERATOR_AWARE_MPI
  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
  int bytes = buffer_size*sizeof(scalar_object);
@ -281,246 +300,31 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 	tcomms-=usecond();
-	//	grid->Barrier();
+	grid->Barrier();
 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
 #ifdef ACCELERATOR_AWARE_MPI
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
 	xbytes+=bytes;
 	//	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }
    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
  /*
  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 #else
-template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+      // bouncy bouncy
-{
+	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
-  typedef typename vobj::vector_type vector_type;
+	grid->SendToRecvFrom((void *)&hsend_buf[0],
  typedef typename vobj::scalar_type scalar_type;
  GridBase *grid=rhs.Grid();
  Lattice<vobj> temp(rhs.Grid());
  int fd              = rhs.Grid()->_fdimensions[dimension];
  int rd              = rhs.Grid()->_rdimensions[dimension];
  int pd              = rhs.Grid()->_processors[dimension];
  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  assert(simd_layout==1);
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
    grid->ShmBufferFreeAll();
    size_t bytes = buffer_size*sizeof(vobj);
    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
  }
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  for(int x=0;x<rd;x++){       
    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    if (comm_proc==0) {
      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
      tcopy+=usecond();
    } else {
      int words = buffer_size;
      if (cbmask != 0x3) words=words>>1;
      int bytes = words * sizeof(vobj);
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      tgather+=usecond();
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      tcomms-=usecond();
      //      grid->Barrier();
      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
 			     xmit_to_rank,
-			   (void *)&recv_buf[0],
+			     (void *)&hrecv_buf[0],
 			     recv_from_rank,
 			     bytes);
-      xbytes+=bytes;
+	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
-      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
+#endif
-      //      grid->Barrier();
+	xbytes+=bytes;
 	grid->Barrier();
 	tcomms+=usecond();
      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  /*
  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid=rhs.Grid();
  const int Nsimd = grid->Nsimd();
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  int fd = grid->_fdimensions[dimension];
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int pd = grid->_processors[dimension];
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;
  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int permute_type=grid->PermuteType(dimension);
  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);
  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
    size_t bytes = sizeof(scalar_object)*buffer_size;
    grid->ShmBufferFreeAll();
    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
  }
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
  int bytes = buffer_size*sizeof(scalar_object);
  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
  ///////////////////////////////////////////
  // Work out what to send where
  ///////////////////////////////////////////
  int cb    = (cbmask==0x2)? Odd : Even;
  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  // loop over outer coord planes orthog to dim
  for(int x=0;x<rd;x++){       
    // FIXME call local permute copy if none are offnode.
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();
    for(int i=0;i<Nsimd;i++){
      int inner_bit = (Nsimd>>(permute_type+1));
      int ic= (i&inner_bit)? 1:0;
      int my_coor          = rd*ic + x;
      int nbr_coor         = my_coor+sshift;
      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
      int nbr_lane = (i&(~inner_bit));
      int recv_from_rank;
      int xmit_to_rank;
      if (nbr_ic) nbr_lane|=inner_bit;
      assert (sx == nbr_ox);
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 	tcomms-=usecond();
 	//	grid->Barrier();
 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
 	xbytes+=bytes;
 	//	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
@ -530,17 +334,16 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
-  /*
+  if(Cshift_verbose){
    std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
    std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
  }
-#endif
+}
 NAMESPACE_END(Grid); 
 #endif
--- a/Grid/cshift/Cshift_table.cc
+++ b/Grid/cshift/Cshift_table.cc
@ -1,5 +1,5 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
 // Global Cshift tables of (int,int) pairs, defined once here inside namespace Grid.
 // Cshift_table is a plain std::vector; Cshift_table_device holds the same element
 // type in a device-side vector (this hunk switches it from commVector to deviceVector).
 // NOTE(review): presumably the device table mirrors the host one — confirm at the use sites.
 std::vector<std::pair<int,int> > Cshift_table; 
-commVector<std::pair<int,int> > Cshift_table_device; 
+deviceVector<std::pair<int,int> > Cshift_table_device; 
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  });
 }
 #define FAST_AXPY_NORM
 // axpy_norm: writes a*x + y into ret and returns norm2(ret).
 // When FAST_AXPY_NORM is defined the whole operation is forwarded to
 // axpy_norm_fast (presumably a fused update+norm — confirm against its definition);
 // otherwise the update and the norm are evaluated as two separate steps.
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
 #ifndef FAST_AXPY_NORM
  // Reference path: form the linear combination, then reduce it.
  ret = a*x+y;
  return norm2(ret);
 #else
  return axpy_norm_fast(ret,a,x,y);
 #endif
 }
 // axpby_norm: writes a*x + b*y into ret and returns norm2(ret).
 // When FAST_AXPY_NORM is defined the whole operation is forwarded to
 // axpby_norm_fast; otherwise the update and the norm are evaluated as
 // two separate steps.
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
 #ifndef FAST_AXPY_NORM
  // Reference path: form the linear combination, then reduce it.
  ret = a*x+b*y;
  return norm2(ret);
 #else
  return axpby_norm_fast(ret,a,b,x,y);
 #endif
 }
 /// Trace product
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -236,17 +236,20 @@ public:
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
    vobj vtmp;
    vtmp = r;
-#if 1
+#if 0
    deviceVector<vobj> vvtmp(1);
    acceleratorPut(vvtmp[0],vtmp);
    vobj *vvtmp_p = & vvtmp[0];
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 	auto stmp=coalescedRead(*vvtmp_p);
 	coalescedWrite(me[ss],stmp);
    });
 #else    
    auto me  = View(CpuWrite);
    thread_for(ss,me.size(),{
       me[ss]= r;
      });
 #else    
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 	auto stmp=coalescedRead(vtmp);
 	coalescedWrite(me[ss],stmp);
    });
 #endif    
    me.ViewClose();
    return *this;
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  typedef decltype(basis[0]) Field;
  typedef decltype(basis[0].View(AcceleratorRead)) View;
-  Vector<View> basis_v; basis_v.reserve(basis.size());
+  hostVector<View>  h_basis_v(basis.size());
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+  deviceVector<View> d_basis_v(basis.size());
  typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
  GridBase* grid = basis[0].Grid();
  for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorWrite));
+    h_basis_v[k] = basis[k].View(AcceleratorWrite);
    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }
-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
+  View *basis_vp = &d_basis_v[0];
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
    {
      vobj* B = &Bt[Nm * thread_num()];
      thread_for_in_region(ss, grid->oSites(),{
 	  for(int j=j0; j<j1; ++j) B[j]=0.;
 	  for(int j=j0; j<j1; ++j){
 	    for(int k=k0; k<k1; ++k){
 	      B[j] +=Qt(j,k) * basis_v[k][ss];
 	    }
 	  }
 	  for(int j=j0; j<j1; ++j){
 	    basis_v[j][ss] = B[j];
 	  }
 	});
    }
 #else
  View *basis_vp = &basis_v[0];
  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
-  Vector <vobj> Bt(siteBlock * nrot); 
+  deviceVector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];
  // GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
  deviceVector<Coeff_t> Qt_jv(Nm*Nm);
  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
-      Qt_p[i]=Qt(j,k);
+      h_Qt_jv[i]=Qt(j,k);
  });
  acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
  // Block the loop to keep storage footprint down
  for(uint64_t s=0;s<oSites;s+=siteBlock){
@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
 #endif
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }
 // Extract a single rotated vector
@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
  result.Checkerboard() = basis[0].Checkerboard();
-  Vector<View> basis_v; basis_v.reserve(basis.size());
+  hostVector<View>  h_basis_v(basis.size());
  deviceVector<View> d_basis_v(basis.size());
  for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorRead));
+    h_basis_v[k]=basis[k].View(AcceleratorRead);
    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }
  vobj zz=Zero();
  Vector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
-  auto basis_vp=& basis_v[0];
+  vobj zz=Zero();
  deviceVector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
  auto basis_vp=& d_basis_v[0];
  autoView(result_v,result,AcceleratorWrite);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    vobj zzz=Zero();
@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
    }
    coalescedWrite(result_v[ss], B);
  });
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }
 template<class Field>
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== grid->CheckerBoard(site));
+  //  assert( l.Checkerboard()== grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
  static const int words=sizeof(vobj)/sizeof(vector_type);
@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  for(int w=0;w<words;w++){
    pt[w] = getlane(vp[w],idx);
  }
-      
+  //  std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
  return;
 };
 template<class vobj,class sobj>
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== grid->CheckerBoard(site));
+  //  assert( l.Checkerboard()== grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
  static const int words=sizeof(vobj)/sizeof(vector_type);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
  //  const int Nsimd = vobj::Nsimd();
  const int nthread = GridThread::GetThreads();
-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  const int nthread = GridThread::GetThreads();
-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
@ -290,8 +290,10 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
  bool ok;
 #ifdef GRID_SYCL
  uint64_t csum=0;
  uint64_t csum2=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // Hack
@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
    ok = FlightRecorder::CsumLog(csum);
    if ( !ok ) {
      csum2=svm_xor(base,words);
      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
    } else {
      //      csum2=svm_xor(base,words);
      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
    }
    assert(ok);
  }
  FlightRecorder::CsumLog(csum);
 #endif
  FlightRecorder::StepLog("rank inner product");
  ComplexD nrm = rankInnerProduct(left,right);
  //  ComplexD nrmck=nrm;
  RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm)); 
+  ok = FlightRecorder::NormLog(real(nrm));
  if ( !ok ) {
    ComplexD nrm2 = rankInnerProduct(left,right);
    RealD local2 = real(nrm2);
    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
    assert(ok);
  }
  FlightRecorder::StepLog("Start global sum");
  //  grid->GlobalSumP2P(nrm);
  grid->GlobalSum(nrm);
  FlightRecorder::StepLog("Finished global sum");
  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }
@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
 #if 0
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  deviceVector<inner_t> inner_tmp;
  inner_tmp.resize(sites);
@ -365,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
-  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  bool ok;
 #ifdef GRID_SYCL
  uint64_t csum=0;
  uint64_t csum2=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // z_v
    {
      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
      uint64_t *base= (uint64_t *)&z_v[0];
      csum=svm_xor(base,words);
      ok = FlightRecorder::CsumLog(csum);
      if ( !ok ) {
 	csum2=svm_xor(base,words);
 	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
      }
      assert(ok);
    }
    // inner_v
    {
      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
      csum=svm_xor(base,words);
      ok = FlightRecorder::CsumLog(csum);
      if ( !ok ) {
 	csum2=svm_xor(base,words);
 	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
      }
      assert(ok);
    }
  }
 #endif
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
  ok = FlightRecorder::NormLog(real(nrm));
  assert(ok);
  RealD local = real(nrm);
  grid->GlobalSum(nrm);
  FlightRecorder::ReductionLog(local,real(nrm));
  return nrm; 
 }
@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  conformable(left,right);
  typedef typename vobj::vector_typeD vector_type;
-  Vector<ComplexD> tmp(2);
+  std::vector<ComplexD> tmp(2);
  GridBase *grid = left.Grid();
@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  // GPU
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  typedef decltype(innerProductD(vobj(),vobj())) norm_t;
-  Vector<inner_t> inner_tmp(sites);
+  deviceVector<inner_t> inner_tmp(sites);
-  Vector<norm_t>  norm_tmp(sites);
+  deviceVector<norm_t>  norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  {
@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
 					  std::vector<typename vobj::scalar_object> &result,
 					  int orthogdim)
 {
  ///////////////////////////////////////////////////////
  // FIXME precision promoted summation
@ -460,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];
-  Vector<vobj> lvSum(rd); // will locally sum vectors first
+  std::vector<vobj> lvSum(rd); // will locally sum vectors first
-  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
+  std::vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
  result.resize(fd); // And then global sum to return the same vector to every node 
@ -509,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
  //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
@ -519,7 +568,20 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
  return result;
 }
 /*
 Reimplement
 1)
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 2)
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 3)
 -- Make Slice Mul Matrix call sliceMaddMatrix
 */
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
@ -539,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];
-  Vector<vector_type> lvSum(rd); // will locally sum vectors first
+  std::vector<vector_type> lvSum(rd); // will locally sum vectors first
-  Vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
+  std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@ -670,203 +732,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };
 /*
 // makeSubSliceGrid: build a grid with one fewer dimension than BlockSolverGrid by
 // dropping dimension Orthog.
 //   BlockSolverGrid : source grid; its _fdimensions, _processors, _checker_dim_mask,
 //                     _checker_dim and _isCheckerBoarded members are read.
 //   Orthog          : index of the dimension to leave out.
 // Returns a heap-allocated grid (GridCartesian, or GridRedBlackCartesian when the
 // source grid is checkerboarded); it is created with new, so the caller must delete it.
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
  int nsimd = BlockSolverGrid->Nsimd();
-  std::vector<int> latt_phys(0);
+  std::vector<int> latt_phys(NN-1);
-  std::vector<int> simd_phys(0);
+  Coordinate simd_phys;
-  std::vector<int>  mpi_phys(0);
+  std::vector<int>  mpi_phys(NN-1);
  Coordinate checker_dim_mask(NN-1);
  int checker_dim=-1;
  // dd is the write index into the (NN-1)-sized arrays. It must start at zero:
  // it was previously left uninitialised, so the subscripts below read an
  // indeterminate value (undefined behaviour, possible out-of-bounds writes).
  int dd=0;
  for(int d=0;d<NN;d++){
    if( d!=Orthog ) { 
-      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+      latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
-      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+      mpi_phys[dd] =BlockSolverGrid->_processors[d];
-      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+      checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
      // Remember where the checkerboard dimension lands after Orthog is removed.
      if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
      dd++;
    }
  }
-  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
  // The SIMD layout is not copied from the source grid; it is recomputed for the
  // reduced dimensionality.
+  simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
  GridCartesian *tmp         = new GridCartesian(latt_phys,simd_phys,mpi_phys);
  if(BlockSolverGrid->_isCheckerBoarded) {
    GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
    // NOTE(review): assumes GridRedBlackCartesian keeps no pointer to tmp after
    // construction — confirm before relying on this delete.
    delete tmp;
    return (GridBase *) ret;
  } else { 
    return (GridBase *) tmp;
  }
 }
 */
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  GridBase *FullGrid = X.Grid();
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  Lattice<vobj> Ys(SliceGrid);
  Lattice<vobj> Rs(SliceGrid);
  Lattice<vobj> Xs(SliceGrid);
  Lattice<vobj> RR(FullGrid);
  RR = R; // Copies checkerboard for insert
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::vector_type vector_type;
-
+  int Nslice = X.Grid()->GlobalDimensions()[Orthog];
-  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+  for(int i=0;i<Nslice;i++){
-
+    ExtractSlice(Ys,Y,i,Orthog);
-  GridBase *FullGrid  = X.Grid();
+    ExtractSlice(Rs,R,i,Orthog);
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+    Rs=Ys;
-
+    for(int j=0;j<Nslice;j++){
-  //  Lattice<vobj> Xslice(SliceGrid);
+      ExtractSlice(Xs,X,j,Orthog);
-  //  Lattice<vobj> Rslice(SliceGrid);
+      Rs = Rs + Xs*(scale*aa(j,i));
  assert( FullGrid->_simd_layout[Orthog]==1);
  //  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  //  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  autoView( X_v, X, CpuRead);
  autoView( Y_v, Y, CpuRead);
  autoView( R_v, R, CpuWrite);
  thread_region
  {
    Vector<vobj> s_x(Nblock);
    thread_for_collapse_in_region(2, n,nblock, {
     for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	s_x[i] = X_v[o+i*ostride];
    }
-
+    InsertSlice(Rs,RR,i,Orthog);
      vobj dot;
      for(int i=0;i<Nblock;i++){
 	dot = Y_v[o+i*ostride];
 	for(int j=0;j<Nblock;j++){
 	  dot = dot + s_x[j]*(scale*aa(j,i));
 	}
 	R_v[o+i*ostride]=dot;
      }
    }});
  }
  R=RR; // Copy back handles arguments aliasing case
  delete SliceGrid;
 };
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
-  typedef typename vobj::scalar_object sobj;
+  R=Zero();
-  typedef typename vobj::vector_type vector_type;
+  sliceMaddMatrix(R,aa,X,R,Orthog,scale);
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
  GridBase *FullGrid  = X.Grid();
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  //  Lattice<vobj> Xslice(SliceGrid);
  //  Lattice<vobj> Rslice(SliceGrid);
  assert( FullGrid->_simd_layout[Orthog]==1);
  //  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  //  int nl=1;
  //FIXME package in a convenient iterator
  // thread_for2d_in_region
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  autoView( R_v, R, CpuWrite);
  autoView( X_v, X, CpuRead);
  thread_region
  {
    std::vector<vobj> s_x(Nblock);
    thread_for_collapse_in_region( 2 ,n,nblock,{
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	s_x[i] = X_v[o+i*ostride];
      }
      vobj dot;
      for(int i=0;i<Nblock;i++){
 	dot = s_x[0]*(scale*aa(0,i));
 	for(int j=1;j<Nblock;j++){
 	  dot = dot + s_x[j]*(scale*aa(j,i));
 	}
 	R_v[o+i*ostride]=dot;
      }
    }});
  }
 };
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
  Lattice<vobj> ls(SliceGrid);
  Lattice<vobj> rs(SliceGrid);
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::vector_type vector_type;
-  
+  int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
-  GridBase *FullGrid  = lhs.Grid();
+  mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  for(int s=0;s<Nslice;s++){
-  
+    ExtractSlice(ls,lhs,s,Orthog);
-  int Nblock = FullGrid->GlobalDimensions()[Orthog];
+    for(int ss=0;ss<Nslice;ss++){
-  
+      ExtractSlice(rs,rhs,ss,Orthog);
-  //  Lattice<vobj> Lslice(SliceGrid);
+      mat(s,ss) = innerProduct(ls,rs);
  //  Lattice<vobj> Rslice(SliceGrid);
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  assert( FullGrid->_simd_layout[Orthog]==1);
  //  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  //  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  typedef typename vobj::vector_typeD vector_typeD;
  autoView( lhs_v, lhs, CpuRead);
  autoView( rhs_v, rhs, CpuRead);
  thread_region
  {
    std::vector<vobj> Left(Nblock);
    std::vector<vobj> Right(Nblock);
    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
    thread_for_collapse_in_region( 2, n,nblock,{
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	Left [i] = lhs_v[o+i*ostride];
 	Right[i] = rhs_v[o+i*ostride];
      }
      for(int i=0;i<Nblock;i++){
      for(int j=0;j<Nblock;j++){
 	auto tmp = innerProduct(Left[i],Right[j]);
 	auto rtmp = TensorRemove(tmp);
 	auto red  =  Reduce(rtmp);
 	mat_thread(i,j) += std::complex<double>(real(red),imag(red));
      }}
    }});
    thread_critical
    {
      mat += mat_thread;
    }
  }
-
+  delete SliceGrid;
  for(int i=0;i<Nblock;i++){
  for(int j=0;j<Nblock;j++){
    ComplexD sum = mat(i,j);
    FullGrid->GlobalSum(sum);
    mat(i,j)=sum;
  }}
  return;
 }
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
-#undef UVM_BLOCK_BUFFER  
+  deviceVector<sobj> buffer(numBlocks);
 #ifndef UVM_BLOCK_BUFFER  
  commVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 #else
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  result = *buffer_v;
 #endif
  return result;
 }
@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
  const int words = sizeof(vobj)/sizeof(vector);
-  Vector<vector> buffer(osites);
+  deviceVector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Sums Reduce(lat[osite]) over the `osites` outer sites of `lat` with a
 // std::plus reduction submitted to theGridAccelerator, using a zeroed sobj as
 // the identity, and returns the total after convertType() into sobjD.
 // The removed lines reduce through a raw pointer (mysum_p) with the
 // initialize_to_identity property and an explicit queue wait; the added lines
 // reduce into a sycl::buffer that wraps the local `ret` inside an inner scope.
 // NOTE(review): presumably the result is written back to `ret` when `abuff`
 // goes out of scope -- confirm against the SYCL buffer destruction rules.
 // NOTE(review): as rendered, unmarked lines still use `mysum` although its
 // declaration is removed, and `ret` is declared twice; this looks like a
 // diff-rendering artifact -- verify against the checked-out header.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;
-  static Vector<sobj> mysum;
+
  mysum.resize(1);
  sobj *mysum_p = & mysum[0];
  sobj identity; zeroit(identity);
-  mysum[0] = identity;
+  sobj ret; zeroit(ret);
  sobj ret ; 
  // nsimd is not read again in the lines visible in this hunk.
  Integer nsimd= vobj::Nsimd();
-
+  { 
-  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
+    sycl::buffer<sobj, 1> abuff(&ret, {1});
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
-    auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{osites},
+      cgh.parallel_for(sycl::range<1>{osites},
                      Reduction,
-		      [=] (cl::sycl::id<1> item, auto &sum) {
+                      [=] (sycl::id<1> item, auto &sum) {
                        // one work-item per outer site
                        auto osite   = item[0];
                        sum +=Reduce(lat[osite]);
                      });
    });
-  theGridAccelerator->wait();
+  }
  ret = mysum[0];
  //  free(mysum,*theGridAccelerator);
  sobjD dret; convertType(dret,ret);
  return dret;
 }
@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
 // XOR-reduces the L words starting at `vec` with a std::bit_xor reduction
 // (identity 0) submitted to theGridAccelerator, and returns the result.
 // The removed lines reduce through the raw pointer d_sum_p with the
 // initialize_to_identity property; the added lines reduce into a sycl::buffer
 // wrapping the local `ret` inside an inner scope.
 // NOTE(review): as rendered, `ret` is declared both by an added line and by an
 // unmarked line near the end, and `xorResult` is assigned but never read in
 // the visible lines; this looks like a diff-rendering artifact -- verify
 // against the checked-out header.
 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
  Word xorResult; xorResult = 0;
  static Vector<Word> d_sum;
  d_sum.resize(1);
  Word *d_sum_p=&d_sum[0];
  Word identity;  identity=0;
-  d_sum[0] = identity;
+  Word ret = 0;
-  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
+  { 
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+    sycl::buffer<Word, 1> abuff(&ret, {1});
-    auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
-     cgh.parallel_for(cl::sycl::range<1>{L},
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
      cgh.parallel_for(sycl::range<1>{L},
                      Reduction,
-		      [=] (cl::sycl::id<1> index, auto &sum) {
+                      [=] (sycl::id<1> index, auto &sum) {
                        // one work-item per word
                        sum ^=vec[index];
                      });
    });
  }
  theGridAccelerator->wait();
  Word ret = d_sum[0];
  //  free(d_sum,*theGridAccelerator);
  return ret;
 }
 NAMESPACE_END(Grid);
 /*
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_type  scalar;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobjD;
  sobjD ret;
  scalarD *ret_p = (scalarD *)&ret;
  const int nsimd = vobj::Nsimd();
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<scalar> buffer(osites*nsimd);
  scalar *buf = &buffer[0];
  vector *dat = (vector *)lat;
  for(int w=0;w<words;w++) {
    accelerator_for(ss,osites,nsimd,{
 	int lane = acceleratorSIMTlane(nsimd);
 	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
    });
 	//Precision change at this point is too late to gain precision
    ret_p[w] = svm_reduce(buf,nsimd*osites);
  }
  return ret;
 }
 */
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
 #if defined(GRID_CUDA) || defined(GRID_HIP)
-template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+template<class vobj>
 inline void sliceSumReduction_cub_small(const vobj *Data,
 					std::vector<vobj> &lvSum,
 					const int rd,
 					const int e1,
 					const int e2,
 					const int stride,
 					const int ostride,
 					const int Nsimd)
 {
  size_t subvol_size = e1*e2;
-  commVector<vobj> reduction_buffer(rd*subvol_size);
+  deviceVector<vobj> reduction_buffer(rd*subvol_size);
  auto rb_p = &reduction_buffer[0];
  vobj zero_init;
  zeroit(zero_init);
@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
 #if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+template<class vobj>
 inline void sliceSumReduction_sycl_small(const vobj *Data,
 					 std::vector <vobj> &lvSum,
 					 const int  &rd,
 					 const int &e1,
 					 const int &e2,
 					 const int &stride,
 					 const int &ostride,
 					 const int &Nsimd)
 {
  size_t subvol_size = e1*e2;
@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
    mysum[r] = vobj_zero; 
  }
-  commVector<vobj> reduction_buffer(rd*subvol_size);    
+  deviceVector<vobj> reduction_buffer(rd*subvol_size);    
  auto rb_p = &reduction_buffer[0];
@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
  });
  for (int r = 0; r < rd; r++) {
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      theGridAccelerator->submit([&](sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+          cgh.parallel_for(sycl::range<1>{subvol_size},
          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
+          [=](sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
 }
 #endif
-template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+template<class vobj>
 inline void sliceSumReduction_large(const vobj *Data,
 				    std::vector<vobj> &lvSum,
 				    const int rd,
 				    const int e1,
 				    const int e2,
 				    const int stride,
 				    const int ostride,
 				    const int Nsimd)
 {
  typedef typename vobj::vector_type vector;
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
-  commVector<vector>buffer(osites);
+  deviceVector<vector>buffer(osites);
  vector *dat = (vector *)Data;
  vector *buf = &buffer[0];
-  Vector<vector> lvSum_small(rd);
+  std::vector<vector> lvSum_small(rd);
  vector *lvSum_ptr = (vector *)&lvSum[0];
  for (int w = 0; w < words; w++) {
@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
    }
-
+  }
 }
-  
+template<class vobj>
-}
+inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
-
+				  std::vector<vobj> &lvSum,
-template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+				  const int rd,
 				  const int e1,
 				  const int e2,
 				  const int stride,
 				  const int ostride,
 				  const int Nsimd)
 {
  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
    if constexpr (sizeof(vobj) <= 256) { 
@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
 }
-template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+template<class vobj>
 inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
 				  std::vector<vobj> &lvSum,
 				  const int &rd,
 				  const int &e1,
 				  const int &e2,
 				  const int &stride,
 				  const int &ostride,
 				  const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
@ -208,15 +247,19 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
  });
 }
 // Entry point for the slice-sum reduction: forwards every argument unchanged
 // to sliceSumReduction_gpu when GRID_CUDA, GRID_HIP or GRID_SYCL is defined,
 // and to sliceSumReduction_cpu otherwise.
 // The hunk changes the type of lvSum from Vector<vobj> to std::vector<vobj>
 // and reflows the parameter list; parameter names and order are unchanged.
-template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
+template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
 						   std::vector<vobj> &lvSum,
 						   const int &rd,
 						   const int &e1,
 						   const int &e2,
 						   const int &stride,
 						   const int &ostride,
 						   const int &Nsimd) 
 {
 #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #else
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #endif
 }
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -43,20 +43,49 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // remove and insert a half checkerboard
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Copies into `half` every outer site of `full` whose checkerboard, as
 // reported by half.Grid()->CheckerBoard(coor), equals `cb`.
 // The removed line delegated to acceleratorPickCheckerboard; the added line
 // records `cb` as the checkerboard of `half` ahead of the loop below, which
 // works on CpuWrite/CpuRead views.
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
 {
-  acceleratorPickCheckerboard(cb,half,full);
+  half.Checkerboard() = cb;
  autoView( half_v, half, CpuWrite);
  autoView( full_v, full, CpuRead);
  // Visit every outer site of the full grid.
  thread_for(ss, full.Grid()->oSites(),{
    int cbos;
    Coordinate coor;
    full.Grid()->oCoorFromOindex(coor,ss);
    cbos=half.Grid()->CheckerBoard(coor);
    if (cbos==cb) {
      // Destination index is computed on the half grid from the same coordinate.
      int ssh=half.Grid()->oIndex(coor);
      half_v[ssh] = full_v[ss];
    }
  });
 }
 // Writes the sites of `half` back into `full`: for every outer site of `full`
 // whose checkerboard (half.Grid()->CheckerBoard(coor)) equals
 // half.Checkerboard(), full_v[ss] is overwritten from the half lattice.
 // Sites of the other checkerboard are not touched by this loop.
 // The removed line delegated to acceleratorSetCheckerboard; the added line
 // reads `cb` from `half` for the CpuRead/CpuWrite loop below.
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
 {
-  acceleratorSetCheckerboard(full,half);
+  int cb = half.Checkerboard();
  autoView( half_v , half, CpuRead);
  autoView( full_v , full, CpuWrite);
  // Visit every outer site of the full grid.
  thread_for(ss,full.Grid()->oSites(),{
    Coordinate coor;
    int cbos;
    full.Grid()->oCoorFromOindex(coor,ss);
    cbos=half.Grid()->CheckerBoard(coor);
    if (cbos==cb) {
      // Source index is computed on the half grid from the same coordinate.
      int ssh=half.Grid()->oIndex(coor);
      full_v[ss]=half_v[ssh];
    }
  });
 }
-template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
+template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
@ -66,7 +95,6 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  int checker_dim_half             = half.Grid()->CheckerDim();
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
@ -91,7 +119,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
    }
  });
 }
-template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
+template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
@ -101,7 +129,6 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  int checker_dim_half             = half.Grid()->CheckerDim();
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
@ -954,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
    hcoor[orthog] = slice;
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) { 
-	hcoor[d]=lcoor[ddl++];
+	hcoor[d]=lcoor[ddl];
 	if ( hg->_checker_dim == d ) {
 	  hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
 	  lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
 	}
 	ddl++;
      }
    }
    peekLocalSite(s,lowDimv,lcoor);
    pokeLocalSite(s,higherDimv,hcoor);
@ -976,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
  assert(orthog<nh);
  assert(orthog>=0);
  assert(hg->_processors[orthog]==1);
  lowDim.Checkerboard() = higherDim.Checkerboard();
  int dl; dl = 0;
  for(int d=0;d<nh;d++){
@ -993,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
    Coordinate lcoor(nl);
    Coordinate hcoor(nh);
    lg->LocalIndexToLocalCoor(idx,lcoor);
    int ddl=0;
    hcoor[orthog] = slice;
    int ddl=0;
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) { 
-	hcoor[d]=lcoor[ddl++];
+	hcoor[d]=lcoor[ddl];
 	if ( hg->_checker_dim == d ) {
 	  hcoor[d]=hcoor[d]*2;     // factor in the full gridd coor for peekLocalSite
 	  lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
 	}
 	ddl++;
      }
    }
    peekLocalSite(s,higherDimv,hcoor);
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
 *
 */
-template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
+template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
 					      Lattice<vobj> &lat,
 					      int x,
 					      int dim,
@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
  });
 }
-template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
+template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
 					     const Lattice<vobj> &lat,
 					     int x,
 					     int dim,
@ -462,13 +462,19 @@ public:
    int rNsimd = Nsimd / simd[dimension];
    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
-    static cshiftVector<vobj> send_buf; 
+    static deviceVector<vobj> send_buf; 
-    static cshiftVector<vobj> recv_buf;
+    static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);    
    recv_buf.resize(buffer_size*2*depth);
 #ifndef ACCELERATOR_AWARE_MPI
    static hostVector<vobj> hsend_buf; 
    static hostVector<vobj> hrecv_buf;
    hsend_buf.resize(buffer_size*2*depth);    
    hrecv_buf.resize(buffer_size*2*depth);
 #endif    
-    std::vector<CommsRequest_t> fwd_req;   
+    std::vector<MpiCommsRequest_t> fwd_req;   
-    std::vector<CommsRequest_t> bwd_req;   
+    std::vector<MpiCommsRequest_t> bwd_req;   
    int words = buffer_size;
    int bytes = words * sizeof(vobj);
@ -495,9 +501,17 @@ public:
      t_gather+=usecond()-t;
      t=usecond();
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&send_buf[d*buffer_size], xmit_to_rank,
 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
 #else
      acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
 				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
      acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
 #endif
      t_comms+=usecond()-t;
     }
    for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +522,17 @@ public:
      t_gather+= usecond() - t;
      t=usecond();
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
 #else
      acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
      acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
 #endif      
      t_comms+=usecond()-t;
    }
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -132,6 +132,10 @@ public:
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
  using Action<GaugeField>::refresh;
  using Action<GaugeField>::Sinitial;
  using Action<GaugeField>::deriv;
  // The empty action contributes S = 0.0; refresh() and deriv() are not
  // supported here and both trip assert(0) if called.
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative
--- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h
+++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h
@ -55,6 +55,11 @@ public:
  RealD alpha; // Mobius scale
  RealD k;     // EOFA normalization constant
  // Device resident
  deviceVector<Coeff_t> d_shift_coefficients;
  deviceVector<Coeff_t> d_MooeeInv_shift_lc;
  deviceVector<Coeff_t> d_MooeeInv_shift_norm;
  virtual void Instantiatable(void) = 0;
  // EOFA-specific operations
@ -92,6 +97,11 @@ public:
    this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
      ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
      ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
    d_shift_coefficients.resize(Ls);
    d_MooeeInv_shift_lc.resize(Ls);
    d_MooeeInv_shift_norm.resize(Ls);
  };
 };
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -90,16 +90,16 @@ public:
  void M5D(const FermionField &psi,
 	   const FermionField &phi,
 	   FermionField &chi,
-	   Vector<Coeff_t> &lower,
+	   std::vector<Coeff_t> &lower,
-	   Vector<Coeff_t> &diag,
+	   std::vector<Coeff_t> &diag,
-	   Vector<Coeff_t> &upper);
+	   std::vector<Coeff_t> &upper);
  void M5Ddag(const FermionField &psi,
 	      const FermionField &phi,
 	      FermionField &chi,
-	      Vector<Coeff_t> &lower,
+	      std::vector<Coeff_t> &lower,
-	      Vector<Coeff_t> &diag,
+	      std::vector<Coeff_t> &diag,
-	      Vector<Coeff_t> &upper);
+	      std::vector<Coeff_t> &upper);
  virtual void   Instantiatable(void)=0;
@ -119,35 +119,51 @@ public:
  RealD mass_plus, mass_minus;
  // Save arguments to SetCoefficientsInternal
-  Vector<Coeff_t> _gamma;
+  std::vector<Coeff_t> _gamma;
  RealD                _zolo_hi;
  RealD                _b;
  RealD                _c;
  // possible boost
  std::vector<ComplexD> qmu;
  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
  void addQmu(const FermionField &in, FermionField &out, int dag);
  // Cayley form Moebius (tanh and zolotarev)
-  Vector<Coeff_t> omega;
+  std::vector<Coeff_t> omega;
-  Vector<Coeff_t> bs;    // S dependent coeffs
+  std::vector<Coeff_t> bs;    // S dependent coeffs
-  Vector<Coeff_t> cs;
+  std::vector<Coeff_t> cs;
-  Vector<Coeff_t> as;
+  std::vector<Coeff_t> as;
  // For preconditioning Cayley form
-  Vector<Coeff_t> bee;
+  std::vector<Coeff_t> bee;
-  Vector<Coeff_t> cee;
+  std::vector<Coeff_t> cee;
-  Vector<Coeff_t> aee;
+  std::vector<Coeff_t> aee;
-  Vector<Coeff_t> beo;
+  std::vector<Coeff_t> beo;
-  Vector<Coeff_t> ceo;
+  std::vector<Coeff_t> ceo;
-  Vector<Coeff_t> aeo;
+  std::vector<Coeff_t> aeo;
  // LDU factorisation of the eeoo matrix
-  Vector<Coeff_t> lee;
+  std::vector<Coeff_t> lee;
-  Vector<Coeff_t> leem;
+  std::vector<Coeff_t> leem;
-  Vector<Coeff_t> uee;
+  std::vector<Coeff_t> uee;
-  Vector<Coeff_t> ueem;
+  std::vector<Coeff_t> ueem;
-  Vector<Coeff_t> dee;
+  std::vector<Coeff_t> dee;
  // Device memory
  deviceVector<Coeff_t> d_diag;
  deviceVector<Coeff_t> d_upper;
  deviceVector<Coeff_t> d_lower;
  deviceVector<Coeff_t> d_lee;
  deviceVector<Coeff_t> d_dee;
  deviceVector<Coeff_t> d_uee;
  deviceVector<Coeff_t> d_leem;
  deviceVector<Coeff_t> d_ueem;
  // Matrices of 5d ee inverse params
-  Vector<iSinglet<Simd> >  MatpInv;
+  //  std::vector<iSinglet<Simd> >  MatpInv;
-  Vector<iSinglet<Simd> >  MatmInv;
+  //  std::vector<iSinglet<Simd> >  MatmInv;
-  Vector<iSinglet<Simd> >  MatpInvDag;
+  //  std::vector<iSinglet<Simd> >  MatpInvDag;
-  Vector<iSinglet<Simd> >  MatmInvDag;
+  //  std::vector<iSinglet<Simd> >  MatmInvDag;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
@ -187,7 +203,7 @@ public:
 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
-  virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
+  virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@ -60,6 +60,50 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;
  // Applies the free propagator to `in` and stores the result in `out`:
  // forward FFT over all dimensions, MomentumSpacePropagatorHw(mass, twist) in
  // momentum space, backward FFT, then multiplication by exp(i*ph).
  //   boundary : one entry per direction, must have Nd entries (asserted);
  //              only its real part is used, through acos().
  //   twist    : must have Nd entries (asserted); taken by value and shifted
  //              by boundary_phase/(2*pi) in each direction before use.
  // NOTE(review): the message printed below says "PartialFraction" although
  // this hunk belongs to ContinuedFractionFermion5D.h -- confirm the wording.
  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
  {
    std::cout << "Free Propagator for PartialFraction"<<std::endl;
    FermionField in_k(in.Grid());
    FermionField prop_k(in.Grid());
    FFT theFFT((GridCartesian *) in.Grid());
    //phase for boundary condition
    ComplexField coor(in.Grid());
    ComplexField ph(in.Grid());  ph = Zero();
    FermionField in_buf(in.Grid()); in_buf = Zero();
    typedef typename Simd::scalar_type Scalar;
    Scalar ci(0.0,1.0);
    assert(twist.size() == Nd);//check that twist is Nd
    assert(boundary.size() == Nd);//check that boundary conditions is Nd
    // NOTE(review): shift stays 0 here although the loop comment below talks
    // about shifting the coordinate index by 1 -- confirm which is intended.
    int shift = 0;
    for(unsigned int nu = 0; nu < Nd; nu++)
      {
 	// Shift coordinate lattice index by 1 to account for 5th dimension.
 	LatticeCoordinate(coor, nu + shift);
 	double boundary_phase = ::acos(real(boundary[nu]));
 	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
 	//momenta for propagator shifted by twist+boundary
 	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
      }
    // NOTE(review): in_buf receives the phase-rotated source, but the forward
    // FFT on the next line reads `in`, so in_buf is never consumed in this
    // function -- confirm whether in_buf was meant to be transformed.
    in_buf = exp(ci*ph*(-1.0))*in;
    theFFT.FFT_all_dim(in_k,in,FFT::forward);
    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
    //phase for boundary condition
    out = out * exp(ci*ph);
  };
  // Convenience overload: calls the five-argument FreePropagator with a zero
  // twist and a boundary factor of 1 in each of the Nd directions.
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
    std::vector<double> twist(Nd,0.0); //default: periodic boundaries in all directions
    std::vector<Complex> boundary;
    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
    FreePropagator(in,out,mass,boundary,twist);
  };
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
@ -90,12 +134,12 @@ protected:
  RealD mass;
  RealD R;
  RealD ZoloHiInv;
-  Vector<double> Beta;
+  std::vector<double> Beta;
-  Vector<double> cc;;
+  std::vector<double> cc;;
-  Vector<double> cc_d;;
+  std::vector<double> cc_d;;
-  Vector<double> sqrt_cc;
+  std::vector<double> sqrt_cc;
-  Vector<double> See;
+  std::vector<double> See;
-  Vector<double> Aee;
+  std::vector<double> Aee;
 };
--- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
@ -69,10 +69,10 @@ public:
  // Instantiate different versions depending on Impl
  /////////////////////////////////////////////////////
  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
  virtual void RefreshShiftCoefficients(RealD new_shift);
@ -83,7 +83,7 @@ public:
 			RealD _M5, const ImplParams& p=ImplParams());
 protected:
-  void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
+  void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@ -102,11 +102,11 @@ public:
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);
-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
  //////////////////////////////////////////////////////////////////////////
@ -164,8 +164,6 @@ public:
  DoubledGaugeField UUUmuEven;
  DoubledGaugeField UUUmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@ -100,7 +100,6 @@ public:
 		     int dag);
  void DhopInternal(StencilImpl & st,
 		    LebesgueOrder &lo,
 		    DoubledGaugeField &U,
 		    DoubledGaugeField &UUU,
 		    const FermionField &in, 
@ -108,7 +107,6 @@ public:
 		    int dag);
    void DhopInternalOverlappedComms(StencilImpl & st,
 		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      DoubledGaugeField &UUU,
 		      const FermionField &in, 
@ -116,7 +114,6 @@ public:
 		      int dag);
    void DhopInternalSerialComms(StencilImpl & st,
 		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      DoubledGaugeField &UUU,
 		      const FermionField &in, 
@ -192,8 +189,6 @@ public:
  DoubledGaugeField UUUmuEven;
  DoubledGaugeField UUUmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
--- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
@ -42,11 +42,11 @@ public:
 public:
  // Shift operator coefficients for red-black preconditioned Mobius EOFA
-  Vector<Coeff_t> Mooee_shift;
+  std::vector<Coeff_t> Mooee_shift;
-  Vector<Coeff_t> MooeeInv_shift_lc;
+  std::vector<Coeff_t> MooeeInv_shift_lc;
-  Vector<Coeff_t> MooeeInv_shift_norm;
+  std::vector<Coeff_t> MooeeInv_shift_norm;
-  Vector<Coeff_t> MooeeInvDag_shift_lc;
+  std::vector<Coeff_t> MooeeInvDag_shift_lc;
-  Vector<Coeff_t> MooeeInvDag_shift_norm;
+  std::vector<Coeff_t> MooeeInvDag_shift_norm;
  virtual void Instantiatable(void) {};
@ -74,18 +74,18 @@ public:
  // Instantiate different versions depending on Impl
  /////////////////////////////////////////////////////
  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
  void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-		 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+		 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-		 Vector<Coeff_t>& shift_coeffs);
+		 std::vector<Coeff_t>& shift_coeffs);
  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
  void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-		    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+		    std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-		    Vector<Coeff_t>& shift_coeffs);
+		    std::vector<Coeff_t>& shift_coeffs);
  virtual void RefreshShiftCoefficients(RealD new_shift);
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@ -102,11 +102,11 @@ public:
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);
-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
 			       const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
 				   const FermionField &in, FermionField &out, int dag);
  //////////////////////////////////////////////////////////////////////////
@ -152,9 +152,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@ -41,6 +41,10 @@ public:
 public:
  // Constructors
  virtual void   Instantiatable(void){};
  // Momentum-space propagator for this action: forwards out, in, _m and twist
  // unchanged to MomentumSpacePropagatorHw.
  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
    this->MomentumSpacePropagatorHw(out,in,_m,twist);
  };
  OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 				      GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@ -41,6 +41,9 @@ public:
 public:
  virtual void   Instantiatable(void){};
  // Momentum-space propagator for this action: forwards out, in, _m and twist
  // unchanged to MomentumSpacePropagatorHw.
  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
    this->MomentumSpacePropagatorHw(out,in,_m,twist);
  };
  // Constructors
  OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@ -40,6 +40,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  virtual void   Instantiatable(void){};
  // Momentum-space propagator for this action: forwards out, in, _m and twist
  // unchanged to MomentumSpacePropagatorHw.
  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
    this->MomentumSpacePropagatorHw(out,in,_m,twist);
  };
  // Constructors
  OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@ -41,6 +41,9 @@ public:
 public:
  virtual void   Instantiatable(void){};
  // Momentum-space propagator for this action: forwards out, in, _m and twist
  // unchanged to MomentumSpacePropagatorHw.
  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
    this->MomentumSpacePropagatorHw(out,in,_m,twist);
  };
  // Constructors
  OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@ -40,6 +40,11 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  virtual void   Instantiatable(void){};
  // Momentum-space propagator for this action: forwards out, in, _m and twist
  // unchanged to MomentumSpacePropagatorHw.
  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
    this->MomentumSpacePropagatorHw(out,in,_m,twist);
  };
  // Constructors
  OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
 					       GridCartesian         &FiveDimGrid,
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);
-  const int part_frac_chroma_convention=1;
+  const int part_frac_chroma_convention=0;
  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@ -83,19 +83,78 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());
  PartialFractionFermion5D(GaugeField &_Umu,
 			   GridCartesian         &FiveDimGrid,
 			   GridRedBlackCartesian &FiveDimRedBlackGrid,
 			   GridCartesian         &FourDimGrid,
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
  // Free-field propagator with boundary phases and twists, evaluated via FFT.
  //   in       : source field; its grid is used for every temporary and for the FFT
  //   out      : receives the propagated field
  //   mass     : forwarded to the momentum-space propagator
  //   boundary : Nd boundary factors; the phase angle used is acos(real(boundary[nu]))
  //   twist    : Nd twists, taken by value; each entry is shifted by boundary_phase/(2*pi)
  // The source is multiplied by exp(-i*ph), transformed, passed through
  // MomentumSpacePropagatorHwQ (when qmu is non-empty) or MomentumSpacePropagatorHw,
  // transformed back and multiplied by exp(+i*ph).
  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
  {
    std::cout << "Free Propagator for PartialFraction"<<std::endl;
    FermionField in_k(in.Grid());
    FermionField prop_k(in.Grid());
    FFT theFFT((GridCartesian *) in.Grid());
    //phase for boundary condition
    ComplexField coor(in.Grid());
    ComplexField ph(in.Grid());  ph = Zero();
    FermionField in_buf(in.Grid()); in_buf = Zero();
    typedef typename Simd::scalar_type Scalar;
    Scalar ci(0.0,1.0);
    assert(twist.size() == Nd);//check that twist is Nd
    assert(boundary.size() == Nd);//check that boundary conditions is Nd
    // shift stays 0 in this routine, so direction nu indexes lattice coordinate nu directly.
    int shift = 0;
    for(unsigned int nu = 0; nu < Nd; nu++)
      {
        LatticeCoordinate(coor, nu + shift);
        double boundary_phase = ::acos(real(boundary[nu]));
        ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
        //momenta for propagator shifted by twist+boundary
        twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
      }
    // Strip the boundary phase from the source before going to momentum space.
    in_buf = exp(ci*ph*(-1.0))*in;
    // Transform the phase-rotated source. The original code transformed `in` and left
    // in_buf unused, so the rotation was only applied on the output side.
    theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
    if ( this->qmu.size() ){
      this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
    } else {
      this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
    }
    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
    //phase for boundary condition
    out = out * exp(ci*ph);
  };
  // Convenience overload: free propagator with periodic boundaries and no twist.
  // Builds the Nd default boundary factors (all 1) and Nd zero twists, then
  // delegates to the five-argument FreePropagator.
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
    // default: periodic boundary conditions in every direction
    std::vector<Complex> boundary;
    for(int mu=0;mu<Nd;mu++) {
      boundary.push_back(1);
    }
    // default: zero twist in every direction
    std::vector<double> twist(Nd,0.0);
    FreePropagator(in,out,mass,boundary,twist);
  };
  // Store a copy of _qmu in the qmu member, then assert that it holds exactly Nd entries.
  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
  void addQmu(const FermionField &in, FermionField &out, int dag);
 protected:
  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
  std::vector<RealD> qmu;
  // Part frac
  RealD mass;
  RealD dw_diag;
  RealD R;
  RealD amax;
  RealD scale;
-  Vector<double> p; 
+  std::vector<double> p; 
-  Vector<double> q;
+  std::vector<double> q;
 };
--- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
+++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
@ -35,7 +35,7 @@ template<class Matrix, class Field>
 class KappaSimilarityTransform {
 public:
  INHERIT_IMPL_TYPES(Matrix);
-  Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+  std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
  KappaSimilarityTransform (Matrix &zmob) {
    for (int i=0;i<(int)zmob.bs.size();i++) {
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
 public:
-  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
+  void DhopImproved(StencilImpl &st,
 		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
-  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+  void DhopNaive(StencilImpl &st,
 		 DoubledGaugeField &U,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -47,7 +47,7 @@ public:
  static int PartialCompressionFactor(GridBase *grid) { return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 				   const Lattice<vobj> &rhs,
 				   cobj *buffer,
 				   compressor &compress,
@ -109,7 +109,7 @@ public:
  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@ -197,7 +197,7 @@ public:
 #endif
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 					 const Lattice<vobj> &rhs,
 					 cobj *buffer,
 					 compressor &compress,
@ -208,7 +208,7 @@ public:
    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
  }
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@ -402,7 +402,6 @@ public:
  typedef CartesianStencil<vobj,cobj,Parameters> Base;
  typedef typename Base::View_type View_type;
  typedef typename Base::StencilVector StencilVector;
  //  Vector<int> surface_list;
  WilsonStencil(GridBase *grid,
@ -416,29 +415,6 @@ public:
    this->same_node.resize(npoints);
  };
  /*
  void BuildSurfaceList(int Ls,int vol4){
    // find same node for SHM
    // Here we know the distance is 1 for WilsonStencil
    for(int point=0;point<this->_npoints;point++){
      this->same_node[point] = this->SameNode(point);
    }
    for(int site = 0 ;site< vol4;site++){
      int local = 1;
      for(int point=0;point<this->_npoints;point++){
 	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ 
 	  local = 0;
 	}
      }
      if(local == 0) { 
 	surface_list.push_back(site);
      }
    }
  }
  */
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
  {
@ -508,6 +484,11 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
 #ifdef NVLINK_GET
    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
    // Sync shared memory on a single node; could use an asynchronous barrier here and defer the check
    // Or issue barrier AFTER the DMA is running
 #endif    
  }
 };
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@ -126,13 +126,16 @@ public:
  void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
                     const FermionField &A, const FermionField &B, int dag);
-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st,
 		    DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalSerial(StencilImpl &st,
 			  DoubledGaugeField &U,
 			  const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalOverlappedComms(StencilImpl &st,
 				   DoubledGaugeField &U,
 				   const FermionField &in, FermionField &out, int dag);
  // Constructor
@ -168,9 +171,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  WilsonAnisotropyCoefficients anisotropyCoeff;
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -109,6 +109,8 @@ public:
  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
 				  std::vector<double> qmu) ;
  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@ -117,6 +119,9 @@ public:
  void DhopOE(const FermionField &in, FermionField &out,int dag);
  void DhopEO(const FermionField &in, FermionField &out,int dag);
  void DhopComms  (const FermionField &in, FermionField &out);
  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
@ -135,21 +140,18 @@ public:
 		     int dag);
  void DhopInternal(StencilImpl & st,
 		    LebesgueOrder &lo,
 		    DoubledGaugeField &U,
 		    const FermionField &in, 
 		    FermionField &out,
 		    int dag);
  void DhopInternalOverlappedComms(StencilImpl & st,
 				   LebesgueOrder &lo,
 				   DoubledGaugeField &U,
 				   const FermionField &in, 
 				   FermionField &out,
 				   int dag);
  void DhopInternalSerialComms(StencilImpl & st,
 			       LebesgueOrder &lo,
 			       DoubledGaugeField &U,
 			       const FermionField &in, 
 			       FermionField &out,
@ -203,9 +205,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@ -57,6 +57,10 @@ public:
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 uint64_t *ids);
  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;
--- a/Grid/qcd/action/fermion/ZMobiusFermion.h
+++ b/Grid/qcd/action/fermion/ZMobiusFermion.h
@ -58,7 +58,7 @@ public:
  {
    //    RealD eps = 1.0;
    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
-    Vector<Coeff_t> zgamma(this->Ls);
+    std::vector<Coeff_t> zgamma(this->Ls);
    for(int s=0;s<this->Ls;s++){
      zgamma[s] = gamma[s];
    }
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
@ -1,3 +1,5 @@
 #if 0
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/deprecated/Lebesgue.cc
+++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc
@ -1,3 +1,4 @@
 #if 0
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void)
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/deprecated/Lebesgue.h
+++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.h
@ -72,7 +72,7 @@ public:
  void ThreadInterleave(void);
 private:
-  Vector<IndexInteger> _LebesgueReorder;
+  deviceVector<IndexInteger> _LebesgueReorder;
 };    
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -49,6 +49,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
 {
  // qmu defaults to zero size;
 }
 ///////////////////////////////////////////////////////////////
@ -156,18 +157,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag (Ls,1.0);
+  std::vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bs;
+  std::vector<Coeff_t> diag = bs;
-  Vector<Coeff_t> upper= cs;
+  std::vector<Coeff_t> upper= cs;
-  Vector<Coeff_t> lower= cs; 
+  std::vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass_minus*upper[Ls-1];
  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@ -176,9 +177,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = beo;
+  std::vector<Coeff_t> diag = beo;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@ -191,9 +192,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@ -206,9 +207,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@ -236,9 +237,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass_plus*upper[Ls-1];
  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@ -248,9 +249,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag =bs;
+  std::vector<Coeff_t> diag =bs;
-  Vector<Coeff_t> upper=cs;
+  std::vector<Coeff_t> upper=cs;
-  Vector<Coeff_t> lower=cs; 
+  std::vector<Coeff_t> lower=cs; 
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }
 // Accumulate sum_mu (i*qmu[mu]) * Gamma_mu * psi into chi, over the X, Y, Z, T
 // gamma matrices. Each coefficient is conjugated when dag is non-zero.
 // Does nothing when qmu is empty.
 template<class Impl>
 void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
 {
   // No q_mu configured: leave chi untouched.
   if ( qmu.size() == 0 ) return;

   assert(qmu.size()==Nd);

   Gamma::Algebra directions [] = {
     Gamma::Algebra::GammaX,
     Gamma::Algebra::GammaY,
     Gamma::Algebra::GammaZ,
     Gamma::Algebra::GammaT
   };
   const ComplexD iunit(0,1);

   for(int mu=0;mu<Nd;mu++){
     // i*q_mu, conjugated for the daggered operator
     ComplexD weight = iunit*qmu[mu];
     if ( dag ) weight = conjugate(weight);
     chi = chi + Gamma(directions[mu])*psi*weight;
   }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@ -279,6 +308,10 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  Meooe5D(psi,Din);
  this->DW(Din,chi,DaggerNo);
  // add i q_mu gamma_mu here
  addQmu(Din,chi,DaggerNo);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
@ -296,6 +329,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gamma_5 hermitian, otherwise not.
  addQmu(psi,Din,DaggerYes);
  MeooeDag5D(Din,chi);
  M5Ddag(psi,chi);
@ -394,7 +430,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  Vector<Coeff_t> gamma(this->Ls);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
@ -402,13 +438,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  Vector<Coeff_t> gamma(this->Ls);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;
@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
    dee[Ls-1] += delta_d;
  }  
  //////////////////////////////////////////
  // Device buffers
  //////////////////////////////////////////
  d_diag.resize(Ls);
  d_upper.resize(Ls);
  d_lower.resize(Ls);
  d_dee.resize(Ls);
  d_lee.resize(Ls);
  d_uee.resize(Ls);
  d_leem.resize(Ls);
  d_ueem.resize(Ls);
  //  int inv=1;
  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@ -43,9 +43,9 @@ void
 CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
 			       const FermionField &phi_i, 
 			       FermionField &chi_i,
-			       Vector<Coeff_t> &lower,
+			       std::vector<Coeff_t> &lower,
-			       Vector<Coeff_t> &diag,
+			       std::vector<Coeff_t> &diag,
-			       Vector<Coeff_t> &upper)
+			       std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard()=psi_i.Checkerboard();
@ -55,12 +55,16 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
  auto pdiag = &diag[0];
  auto pupper = &upper[0];
  auto plower = &lower[0];
  int Ls =this->Ls;
  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
  auto pdiag = &d_diag[0];
  auto pupper = &d_upper[0];
  auto plower = &d_lower[0];
  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
  uint64_t nloop = grid->oSites();
@ -82,9 +86,9 @@ void
 CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
 			      const FermionField &phi_i, 
 			      FermionField &chi_i,
-			      Vector<Coeff_t> &lower,
+			      std::vector<Coeff_t> &lower,
-			      Vector<Coeff_t> &diag,
+			      std::vector<Coeff_t> &diag,
-			      Vector<Coeff_t> &upper)
+			      std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
@ -93,12 +97,16 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
  auto pdiag = &diag[0];
  auto pupper = &upper[0];
  auto plower = &lower[0];
  int Ls=this->Ls;
  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
  auto pdiag = &d_diag[0];
  auto pupper = &d_upper[0];
  auto plower = &d_lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -126,11 +134,17 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  int Ls=this->Ls;
-  auto plee  = & lee [0];
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  auto pdee  = & dee [0];
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  auto puee  = & uee [0];
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  auto pleem = & leem[0];
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  auto pueem = & ueem[0];
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
  auto plee  = & d_lee [0];
  auto pdee  = & d_dee [0];
  auto puee  = & d_uee [0];
  auto pleem = & d_leem[0];
  auto pueem = & d_ueem[0];
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -182,11 +196,17 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  autoView(psi , psi_i,AcceleratorRead);
  autoView(chi , chi_i,AcceleratorWrite);
-  auto plee  = & lee [0];
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  auto pdee  = & dee [0];
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  auto puee  = & uee [0];
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  auto pleem = & leem[0];
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  auto pueem = & ueem[0];
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
  auto plee  = & d_lee [0];
  auto pdee  = & d_dee [0];
  auto puee  = & d_uee [0];
  auto pleem = & d_leem[0];
  auto pueem = & d_ueem[0];
  assert(psi.Checkerboard() == psi.Checkerboard());
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs
  R=(1+this->mass)/(1-this->mass);
@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
 // Pplus  backwards..
 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+				      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  int Ls = this->Ls;
@ -50,9 +50,15 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  autoView( psi , psi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
+
-  auto pupper = &upper[0];
+  auto pdiag  = &this->d_diag[0];
-  auto plower = &lower[0];
+  auto pupper = &this->d_upper[0];
  auto plower = &this->d_lower[0];
  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  auto nloop=grid->oSites()/Ls;
@ -73,7 +79,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+					 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase* grid = psi_i.Grid();
@ -83,9 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  autoView( phi , phi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
+  
-  auto pupper = &upper[0];
+  auto pdiag  = &this->d_diag[0];
-  auto plower = &lower[0];
+  auto pupper = &this->d_upper[0];
  auto plower = &this->d_lower[0];
  acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -114,12 +125,17 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  autoView( chi, chi_i, AcceleratorWrite);
  int Ls = this->Ls;
-  auto plee  = & this->lee[0];
+  auto plee  = & this->d_lee [0];
-  auto pdee  = & this->dee[0];
+  auto pdee  = & this->d_dee [0];
-  auto puee  = & this->uee[0];
+  auto puee  = & this->d_uee [0];
  auto pleem = & this->d_leem[0];
  auto pueem = & this->d_ueem[0];
-  auto pleem = & this->leem[0];
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
-  auto pueem = & this->ueem[0];
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
    else{ shiftm = -shift*(mq3-mq2); }
  }
-  Vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
 #if(0)
  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
    else{ shiftm = -shift*(mq3-mq2); }
  }
-  Vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
  this->M5Ddag(psi, chi, chi, lower, diag, upper);
 }
@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
 {
  int Ls = this->Ls;
-  Vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
 {
  int Ls = this->Ls;
-  Vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
 //Zolo
 template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
 {
  int   Ls    = this->Ls;
  int   pm    = this->pm;
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
  UUUmu(&FourDimGrid),
  UUUmuEven(&FourDimRedBlackGrid),
  UUUmuOdd(&FourDimRedBlackGrid),
  Lebesgue(&FourDimGrid),
  LebesgueEvenOdd(&FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid)
 {
@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 /*CHANGE */
 // DhopInternal: entry point for the hopping-term application on stencil `st`.
 // Selects between the two implementations based on the static
 // StaggeredKernelsStatic::Comms setting and forwards all arguments unchanged:
 //   - CommsAndCompute  -> DhopInternalOverlappedComms
 //   - any other value  -> DhopInternalSerialComms
 // The LebesgueOrder argument is no longer part of the signature or of the
 // forwarded argument lists.
 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, 
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
  // Dispatch on the globally configured comms strategy.
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,U,UUU,in,out,dag);
 }
 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, 
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
  st.CommsMerge(compressor);
@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }
 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }
 /*CHANGE END*/
@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
  assert(in.Checkerboard()==Even);
  out.Checkerboard() = Odd;
-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
+  DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
  assert(in.Checkerboard()==Odd);
  out.Checkerboard() = Even;
-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
+  DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  out.Checkerboard() = in.Checkerboard();
-  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
+  DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
 }
 /////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@ -48,8 +48,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
@ -339,7 +337,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
  out.Checkerboard() = in.Checkerboard();
-  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
+  DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
 }
 template <class Impl>
@ -351,7 +349,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
 }
 template <class Impl>
@ -363,7 +361,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
 }
 template <class Impl>
@ -394,19 +392,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
 // DhopInternal: entry point for the hopping-term application on stencil `st`.
 // Forwards (st, U, UUU, in, out, dag) to one of two implementations chosen by
 // the static StaggeredKernelsStatic::Comms setting:
 //   - CommsAndCompute  -> DhopInternalOverlappedComms
 //   - any other value  -> DhopInternalSerialComms
 // The LebesgueOrder argument is no longer part of the signature or of the
 // forwarded argument lists.
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, 
 						  DoubledGaugeField &U,
 						  DoubledGaugeField &UUU,
 						  const FermionField &in,
 						  FermionField &out, int dag) 
 {
  // Dispatch on the globally configured comms strategy.
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,U,UUU,in,out,dag);
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, 
 								 DoubledGaugeField &U,
 								 DoubledGaugeField &UUU,
 								 const FermionField &in,
@ -429,7 +427,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
  st.CommunicateComplete(requests);
@ -440,13 +438,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, 
 							     DoubledGaugeField &U,
 							     DoubledGaugeField &UUU,
 							     const FermionField &in,
@ -460,7 +458,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 };
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+				  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@ -50,9 +50,13 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
+  auto pdiag  = &this->d_diag[0];
-  auto pupper = &upper[0];
+  auto pupper = &this->d_upper[0];
-  auto plower = &lower[0];
+  auto plower = &this->d_lower[0];
  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@ -74,8 +78,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-					Vector<Coeff_t> &shift_coeffs)
+					std::vector<Coeff_t> &shift_coeffs)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@ -89,10 +93,15 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
+  auto pdiag  = &this->d_diag[0];
-  auto pupper = &upper[0];
+  auto pupper = &this->d_upper[0];
-  auto plower = &lower[0];
+  auto plower = &this->d_lower[0];
-  auto pshift_coeffs = &shift_coeffs[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];
  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@ -119,7 +128,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+				     std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@ -130,9 +139,13 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
+  auto pdiag  = &this->d_diag[0];
-  auto pupper = &upper[0];
+  auto pupper = &this->d_upper[0];
-  auto plower = &lower[0];
+  auto plower = &this->d_lower[0];
  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@ -154,8 +167,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+					   std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-					   Vector<Coeff_t> &shift_coeffs)
+					   std::vector<Coeff_t> &shift_coeffs)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@ -167,10 +180,15 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
+  auto pdiag  = &this->d_diag[0];
-  auto pupper = &upper[0];
+  auto pupper = &this->d_upper[0];
-  auto plower = &lower[0];
+  auto plower = &this->d_lower[0];
-  auto pshift_coeffs = &shift_coeffs[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];
  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  auto pm = this->pm;
@ -212,11 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);
-  auto plee = & this->lee [0];
+  auto plee  = & this->d_lee [0];
-  auto pdee = & this->dee [0];
+  auto pdee  = & this->d_dee [0];
-  auto puee = & this->uee [0];
+  auto puee  = & this->d_uee [0];
-  auto pleem= & this->leem[0];
+  auto pleem = & this->d_leem[0];
-  auto pueem= & this->ueem[0];
+  auto pueem = & this->d_ueem[0];
  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
@ -268,14 +292,23 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);
  // Move into object and constructor
  auto pm = this->pm;
-  auto plee = & this->lee [0];
+  auto plee  = & this->d_lee [0];
-  auto pdee = & this->dee [0];
+  auto pdee  = & this->d_dee [0];
-  auto puee = & this->uee [0];
+  auto puee  = & this->d_uee [0];
-  auto pleem= & this->leem[0];
+  auto pleem = & this->d_leem[0];
-  auto pueem= & this->ueem[0];
+  auto pueem = & this->d_ueem[0];
-  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_lc   = &this->d_MooeeInv_shift_lc[0];
-  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -333,11 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);
-  auto plee = & this->lee [0];
+  auto plee  = &this->d_lee [0];
-  auto pdee = & this->dee [0];
+  auto pdee  = &this->d_dee [0];
-  auto puee = & this->uee [0];
+  auto puee  = &this->d_uee [0];
-  auto pleem= & this->leem[0];
+  auto pleem = &this->d_leem[0];
-  auto pueem= & this->ueem[0];
+  auto pueem = &this->d_ueem[0];
  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -387,13 +426,25 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  int Ls = this->Ls;
  auto pm = this->pm;
-  auto plee = & this->lee [0];
+  auto plee  = & this->d_lee [0];
-  auto pdee = & this->dee [0];
+  auto pdee  = & this->d_dee [0];
-  auto puee = & this->uee [0];
+  auto puee  = & this->d_uee [0];
-  auto pleem= & this->leem[0];
+  auto pleem = & this->d_leem[0];
-  auto pueem= & this->ueem[0];
+  auto pueem = & this->d_ueem[0];
-  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
+
-  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
+  auto pMooeeInvDag_shift_lc   = &this->d_MooeeInv_shift_lc[0];
  auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
  acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
  //  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
  //  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 {
  int Ls = this->Ls;
-  Vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
  // no shift term
  if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 {
  int Ls = this->Ls;
-  Vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
  // no shift term
  if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
  int Ls = this->Ls;
  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
    lower[s] = -this->cee[s];
@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
  int Ls = this->Ls;
  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    if(s==0) {
      upper[s] = -this->cee[s+1];
@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
  // Tridiagonal solve for MooeeInvDag_shift_lc
  {
    Coeff_t m(0.0);
-    Vector<Coeff_t> d = Mooee_shift;
+    std::vector<Coeff_t> d = Mooee_shift;
-    Vector<Coeff_t> u(Ls,0.0);
+    std::vector<Coeff_t> u(Ls,0.0);
-    Vector<Coeff_t> y(Ls,0.0);
+    std::vector<Coeff_t> y(Ls,0.0);
-    Vector<Coeff_t> q(Ls,0.0);
+    std::vector<Coeff_t> q(Ls,0.0);
    if(pm == 1){ u[0] = 1.0; }
    else{ u[Ls-1] = 1.0; }
--- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
@ -48,8 +48,6 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
@ -268,7 +266,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
  out.Checkerboard() = in.Checkerboard();
-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+  DhopInternal(Stencil, Umu, in, out, dag);
 }
 template <class Impl>
@ -280,7 +278,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, in, out, dag);
 }
 template <class Impl>
@ -292,7 +290,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, in, out, dag);
 }
 template <class Impl>
@ -323,18 +321,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
 // DhopInternal: entry point for the naive staggered hopping term on stencil `st`.
 // Forwards (st, U, in, out, dag) to one of two implementations chosen by the
 // static StaggeredKernelsStatic::Comms setting:
 //   - CommsAndCompute  -> DhopInternalOverlappedComms
 //   - any other value  -> DhopInternalSerialComms
 // The LebesgueOrder argument is no longer part of the signature or of the
 // forwarded argument lists.
 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
 					       DoubledGaugeField &U,
 					       const FermionField &in,
 					       FermionField &out, int dag) 
 {
  // Dispatch on the globally configured comms strategy.
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,in,out,dag);
+    DhopInternalSerialComms(st,U,in,out,dag);
 }
 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
 							      DoubledGaugeField &U,
 							      const FermionField &in,
 							      FermionField &out, int dag) 
@ -356,7 +354,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
  st.CommunicateComplete(requests);
@ -367,12 +365,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
 }
 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
 							  DoubledGaugeField &U,
 							  const FermionField &in,
 							  FermionField &out, int dag) 
@ -385,7 +383,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
 };
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@ -239,6 +239,31 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  this->DW(psi,D,DaggerNo);
  // DW -> DW + i qslash
  //  (g5 Dw)^dag = g5 Dw
  //  (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
  if ( qmu.size() ) {
    std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
    assert(qmu.size()==Nd);
    FermionField qslash_psi(psi.Grid());
    Gamma::Algebra Gmu [] = {
 			     Gamma::Algebra::GammaX,
 			     Gamma::Algebra::GammaY,
 			     Gamma::Algebra::GammaZ,
 			     Gamma::Algebra::GammaT
    };
    qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
    for(int mu=1;mu<Nd;mu++){
      qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
    }
    ComplexD ci(0.0,1.0);
    qslash_psi = ci*qslash_psi ; // i qslash
    D = D + qslash_psi;
  }
  int nblock=(Ls-1)/2;
  for(int b=0;b<nblock;b++){
@ -255,8 +280,47 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
  {
    // The 'conventional' Cayley overlap operator is
    //
    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
    //
    //
    // With massless limit 1/2(1+g5 sgnHw)
    //
    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
    //
    // However, the conventional normalisation has both a leading order factor of 2 in Zq
    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
    //
    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
    //
    // num = -i sin kmu gmu
    //
    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
    //    b_k = sk2 - M5;
    //     
    //    w_k = sqrt(sk + b_k*b_k);
    //
    //    denom= ( w_k + b_k + mass*mass) ;
    //
    //    denom= one/denom;
    //    out = num*denom;
    //
    // Chroma, and Grid define partial fraction via 4d operator
    //
    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
    //
    // Now since:
    //
    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
    //
    // This corresponds to a modified mass parameter
    //
    // It has an annoying 
    //
    // 
    double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] H
+    //R g5 psi[Ls] + p[0] Hw
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
    for(int b=0;b<nblock;b++){
@ -264,6 +328,7 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
  }
 }
@ -411,17 +476,18 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
   // ImportPhysicalFermionSource: build the 5d source `imported5d` from the 4d
   // field `input4d`.
   // Steps, in order:
   //   1. check that both fields live on the expected grids (conformable),
   //   2. zero a 5d temporary and insert input4d into it with InsertSlice,
   //   3. multiply the temporary by Gamma5,
   //   4. apply Dminus to the temporary, writing the result to imported5d.
   template<class Impl>
   void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
   {
     //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
     int Ls = this->Ls;
     conformable(imported5d.Grid(),this->FermionGrid());
     conformable(input4d.Grid()   ,this->GaugeGrid());
     FermionField tmp(this->FermionGrid());
     tmp=Zero();
     // Per the signature quoted above, the last two arguments are (slice, orthog):
     // the 4d field goes into slice Ls-1 of orthogonal dimension 0.
     // NOTE(review): dimension 0 is presumably the fifth (s) direction of the 5d grid - confirm.
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
     tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
     this->Dminus(tmp,imported5d);
   }
@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
 {
  int Ls = this->Ls;
-
+  qmu.resize(0);
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;
@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);
 }
 // Constructor overload that additionally takes a vector `_qmu`.
 // It delegates all other arguments to the constructor without `_qmu`
 // (which resets qmu to size zero) and then stores a copy of `_qmu` in the
 // `qmu` member. M_internal adds an i*qslash term built from qmu when
 // qmu.size() is non-zero, and asserts qmu.size()==Nd in that case.
 template<class Impl>
 PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
 							 GridCartesian         &FiveDimGrid,
 							 GridRedBlackCartesian &FiveDimRedBlackGrid,
 							 GridCartesian         &FourDimGrid,
 							 GridRedBlackCartesian &FourDimRedBlackGrid,
 							 RealD _mass,RealD M5,
 							 std::vector<RealD> &_qmu,
 							 const ImplParams &p)
  : PartialFractionFermion5D<Impl>(_Umu,
 			     FiveDimGrid,FiveDimRedBlackGrid,
 			     FourDimGrid,FourDimRedBlackGrid,
 			     _mass,M5,p)
 {
  // Assigned after delegation so the delegated constructor's qmu.resize(0) is overridden.
  qmu=_qmu;
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@ -375,23 +375,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  }
 }
 /*
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
 #undef HAND_DECLARATIONS
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
  });
 template <class Impl> 
-void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  assert(0 && " Kernel optimisation case not covered ");
 }
 template <class Impl> 
-void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -58,15 +58,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid),
  Dirichlet(0)
 {
  Stencil.lo     = &Lebesgue;
  StencilEven.lo = &LebesgueEvenOdd;
  StencilOdd.lo  = &LebesgueEvenOdd;
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
@ -305,19 +299,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 }
 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
 {
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else 
-    DhopInternalSerialComms(st,lo,U,in,out,dag);
+    DhopInternalSerialComms(st,U,in,out,dag);
 }
 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
@ -331,21 +325,21 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
  {
    //    std::cout << " WilsonFermion5D gather " <<std::endl;
    GRID_TRACE("Gather");
    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }
  //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
  std::vector<std::vector<CommsRequest_t> > requests;
  auto id=traceStart("Communicate overlapped");
  st.CommunicateBegin(requests);
 #if 1
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  {
+  st.CommunicateBegin(requests);
    GRID_TRACE("MergeSHM");
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
-  }
+#endif
  /////////////////////////////
  // do the compute interior
@ -359,21 +353,34 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
  //ifdef GRID_ACCELERATED
 #if 0
  /////////////////////////////
  // Overlap with comms -- on GPU the interior kernel call is nonblocking
  /////////////////////////////
  st.CommunicateBegin(requests);
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 #endif
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
  st.CommunicateComplete(requests);
-  traceStop(id);
+  //  traceStop(id);
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  {
    //    std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
  }
  //  std::cout << " WilsonFermion5D Exterior " <<std::endl;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
@ -381,11 +388,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 }
 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
 						    DoubledGaugeField & U,
 						    const FermionField &in, 
 						    FermionField &out,int dag)
@ -395,11 +403,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
  int LLs = in.Grid()->_rdimensions[0];
  //  std::cout << " WilsonFermion5D Halo exch " <<std::endl;
  {
    GRID_TRACE("HaloExchange");
    st.HaloExchangeOpt(in,compressor);
  }
  //  std::cout << " WilsonFermion5D Dhop " <<std::endl;
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
@ -408,6 +418,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 }
@ -420,7 +431,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
  assert(in.Checkerboard()==Even);
  out.Checkerboard() = Odd;
-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
+  DhopInternal(StencilEven,UmuOdd,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@ -431,8 +442,31 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
  assert(in.Checkerboard()==Odd);
  out.Checkerboard() = Even;
-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
+  DhopInternal(StencilOdd,UmuEven,in,out,dag);
 }
 // Communication-only entry point: performs the grid checks, propagates the
 // checkerboard from `in` to `out`, and runs Stencil.HaloExchangeOpt on `in`
 // with a Compressor constructed with dag = 0. No Dhop kernel is called here.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
 {
  // The input must live on the full fermion grid and match the output grid.
  conformable(in.Grid(),FermionGrid());
  conformable(in.Grid(),out.Grid());
  out.Checkerboard() = in.Checkerboard();
  // Halo exchange with the dag flag fixed to zero.
  int dagFlag = 0;
  Compressor haloCompressor(dagFlag);
  Stencil.HaloExchangeOpt(in,haloCompressor);
 }
 // Compute-only entry point: after the grid checks it calls the DhopKernel
 // overload that takes `ids`, using the full-grid Stencil and Umu together
 // with Stencil.CommBuf(). No halo exchange is performed in this routine.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
 {
  // The input must live on the full fermion grid and match the output grid.
  conformable(in.Grid(),FermionGrid());
  conformable(in.Grid(),out.Grid());
  out.Checkerboard() = in.Checkerboard();
  // Kernel variant selector and the leading reduced dimension of the input grid.
  int kernelOpt = WilsonKernelsStatic::Opt;
  int leadingRdim = in.Grid()->_rdimensions[0];
  Kernels::DhopKernel(kernelOpt,
 		      Stencil,Umu,Stencil.CommBuf(),
 		      leadingRdim,Umu.oSites(),
 		      in,out,ids);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
@ -441,7 +475,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d
  out.Checkerboard() = in.Checkerboard();
-  DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
+  DhopInternal(Stencil,Umu,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
@ -735,6 +769,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
 // Convenience wrapper: forwards to MomentumSpacePropagatorHwQ with an
 // all-zero qmu vector of length Nd.
 template<class Impl>
 void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
 {
  MomentumSpacePropagatorHwQ(out,in,mass,twist,std::vector<double>(Nd,0.0));
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
 						       RealD mass,
 						       std::vector<double> twist,
 						       std::vector<double> qmu)
 {
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
@ -750,6 +793,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    typedef typename FermionField::scalar_type ScalComplex;
    typedef Lattice<iSinglet<vector_type> > LatComplex;
    typedef iSpinMatrix<ScalComplex> SpinMat;
    Coordinate latt_size   = _grid->_fdimensions;
@ -767,6 +811,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    LatComplex kmu(_grid); 
    ScalComplex ci(0.0,1.0);
    std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
    for(int mu=0;mu<Nd;mu++) {
      LatticeCoordinate(kmu,mu);
@ -777,9 +823,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
      kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
      sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
      sk  = sk  + sin(kmu)*sin(kmu); 
-      num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
+      sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]); 
      // Terms for boosted Fermion
      // 1/2 [ -i gamma.(sin p + q )     ]
      //     [ --------------------- + 1 ]
      //     [         wq + b            ]
      //
      // wq = sqrt( (sinp+q)^2 + b^2 )
      //
      num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);
    }
    num = num + mass * in ;
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -52,17 +52,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
  Stencil.lo     = &Lebesgue;
  StencilEven.lo = &LebesgueEvenOdd;
  StencilOdd.lo  = &LebesgueEvenOdd;
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  if  (anisotropyCoeff.isAnisotropic){
@ -314,7 +309,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
  out.Checkerboard() = in.Checkerboard();
-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+  DhopInternal(Stencil, Umu, in, out, dag);
 }
 template <class Impl>
@ -326,7 +321,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;
-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, in, out, dag);
 }
 template <class Impl>
@ -338,7 +333,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;
-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, in, out, dag);
 }
 template <class Impl>
@ -391,21 +386,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
 };
 template <class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, 
                                       DoubledGaugeField &U,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else
 #endif
-    DhopInternalSerial(st,lo,U,in,out,dag);
+    DhopInternalSerial(st,U,in,out,dag);
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, 
 						      DoubledGaugeField &U,
 						      const FermionField &in,
 						      FermionField &out, int dag)
@ -474,7 +469,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 template <class Impl>
-void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
+void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, 
 					     DoubledGaugeField &U,
 					     const FermionField &in,
 					     FermionField &out, int dag)
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /// Switch off the 5d vectorised code optimisations
 #undef DWFVEC5D
-static Vector<vComplexF> signsF;
+static std::vector<vComplexF> signsF;
  template<typename vtype>    
-  int setupSigns(Vector<vtype>& signs ){
+  int setupSigns(std::vector<vtype>& signs ){
-    Vector<vtype> bother(2);
+    std::vector<vtype> bother(2);
    signs = bother;
    vrsign(signs[0]);
    visign(signs[1]);
@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
 #include <simd/Intel512double.h>
-static Vector<vComplexD> signsD;
+static std::vector<vComplexD> signsD;
 static int signInitD = setupSigns(signsD);
 #define MAYBEPERM(A,perm) if (perm) { A ; }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }
 // MAKE_ID(A): identifier recorded per work item by KERNEL_CALL_ID.
 //  - GRID_SYCL && GRID_SIMT : eu id, slice id and subslice id packed as
 //                             (eu<<16)|(slice<<8)|(subslice)
 //  - otherwise              : constant 0
 // The argument A is not used by any of the definitions.
 #ifdef GRID_SYCL
 // Declarations of the Intel device intrinsics; only the eu/slice/subslice
 // queries are used by MAKE_ID below.
 extern "C" {
    ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
    void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
 }
 #ifdef GRID_SIMT
 // Whole expansion is parenthesised so the macro stays a single operand when
 // used next to other operators (e.g. MAKE_ID()+1 or MAKE_ID()==x).
 #define MAKE_ID(A) ((intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()))
 #else
 #define MAKE_ID(A) (0)
 #endif
 #else
 #define MAKE_ID(A) (0)
 #endif
 // KERNEL_CALL_ID(A): launch site kernel A and record MAKE_ID() per work item.
 //  - Expands to an accelerator_forNB loop with bound NN = Nsite*Ls, followed
 //    by accelerator_barrier().
 //  - Inside the loop sF = ss and sU = ss/Ls; WilsonKernels<Impl>::A is called
 //    with (st_v,U_v,buf,sF,sU,in_v,out_v).
 //  - The value of MAKE_ID() is then stored at ids[sF*Nsimd+lane], with
 //    Nsimd = SiteHalfSpinor::Nsimd() and lane = acceleratorSIMTlane(Nsimd).
 // The names Nsite, Ls, st_v, U_v, buf, in_v, out_v and ids are not macro
 // parameters: they must be in scope at the point of expansion.
 // NOTE(review): idx is an int while NN is uint64_t; presumably
 // sF*Nsimd+lane stays within int range for the intended sizes -- confirm.
 #define KERNEL_CALL_ID(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
      int sF = ss;							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
      const int Nsimd = SiteHalfSpinor::Nsimd();			\
      const int lane=acceleratorSIMTlane(Nsimd);                        \
      int idx=sF*Nsimd+lane;						\
      uint64_t id = MAKE_ID();						\
      ids[idx]=id;							\
    });									\
  accelerator_barrier();
 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@ -434,7 +474,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define ASM_CALL(A)							\
  thread_for( sss, Nsite, {						\
-    int ss = st.lo->Reorder(sss);					\
+    int ss = sss; /*st.lo->Reorder(sss);*/			\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
    });}
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@ -462,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);
   if( interior && exterior ) {
-     acceleratorFenceComputeStream();
+     //     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
-     // dependent on result of merge
+     //     // dependent on result of merge
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
 // DhopKernel overload that also fills `ids`.
 // It opens accelerator views on U, in, out and st and expands
 // KERNEL_CALL_ID(GenericDhopSite), which runs GenericDhopSite and writes one
 // MAKE_ID() value per (site, SIMD lane) into ids.
 // Opt is accepted but not consulted: GenericDhopSite is always used here.
 // NOTE(review): ids is indexed up to Nsite*Ls*SiteHalfSpinor::Nsimd() by the
 // macro; presumably the caller sizes it accordingly -- confirm.
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
 				     uint64_t *ids)
 {
    // The view names (U_v, in_v, out_v, st_v) are referenced by name inside
    // KERNEL_CALL_ID and must not be renamed.
    autoView(U_v  ,  U,AcceleratorRead);
    autoView(in_v , in,AcceleratorRead);
    autoView(out_v,out,AcceleratorWrite);
    autoView(st_v , st,AcceleratorRead);
    KERNEL_CALL_ID(GenericDhopSite);
 }
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 					  int Ls, int Nsite, const FermionField &in, FermionField &out,
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@ -40,6 +40,11 @@ public:
  INHERIT_GIMPL_TYPES(Gimpl);
  using Action<GaugeField>::S;
  using Action<GaugeField>::Sinitial;
  using Action<GaugeField>::deriv;
  using Action<GaugeField>::refresh;
 private:
  RealD c_plaq;
  RealD c_rect;
--- a/Grid/qcd/action/gauge/WilsonGaugeAction.h
+++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h
@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 public:  
  INHERIT_GIMPL_TYPES(Gimpl);
  using Action<GaugeField>::S;
  using Action<GaugeField>::Sinitial;
  using Action<GaugeField>::deriv;
  using Action<GaugeField>::refresh;
  /////////////////////////// constructors
  explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
--- a/Grid/qcd/representations/adjoint.h
+++ b/Grid/qcd/representations/adjoint.h
@ -40,7 +40,7 @@ public:
    U = Zero();
    LatticeColourMatrix tmp(Uin.Grid());
-    Vector<typename SU<ncolour>::Matrix> ta(Dimension);
+    std::vector<typename SU<ncolour>::Matrix> ta(Dimension);
    // Debug lines
    // LatticeMatrix uno(Uin.Grid());
--- a/Grid/qcd/representations/two_index.h
+++ b/Grid/qcd/representations/two_index.h
@ -43,7 +43,7 @@ public:
    U = Zero();
    LatticeColourMatrix tmp(Uin.Grid());
-    Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
+    std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
    for (int a = 0; a < Dimension; a++)
      GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@ -32,9 +32,7 @@ private:
  //  Smear_Stout<Gimpl> *StoutSmearing;
  //  std::vector<GaugeField> SmearedSet;
  GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
  std::vector<LatticeLorentzComplex> masks;
  std::vector<int> cbs;
  typedef typename SU3Adjoint::AMatrix AdjMatrix;
  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
@ -149,25 +147,6 @@ private:
    }
    pokeLorentz(Fdet, Fdet_pol, nu);
  }
  void Compute_MpInvJx_dNxxdSy(int cb,
 			       const GaugeLinkField &PlaqL,
 			       const GaugeLinkField &PlaqR,
 			       AdjMatrixField MpInvJx,
 			       AdjVectorField &Fdet2 )
  {
    GaugeLinkField PlaqLeo(UrbGrid);
    GaugeLinkField PlaqReo(UrbGrid);
    AdjMatrixField MpInvJxeo(UrbGrid);
    AdjVectorField Fdet2eo(UrbGrid);
    pickCheckerboard(cb,PlaqLeo,PlaqL);
    pickCheckerboard(cb,PlaqReo,PlaqR);
    pickCheckerboard(cb,MpInvJxeo,MpInvJx);
    Fdet2eo.Checkerboard()=cb;
    Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
    setCheckerboard(Fdet2,Fdet2eo);
  }
  void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
  {
    GaugeLinkField UtaU(PlaqL.Grid());
@ -299,7 +278,6 @@ public:
    ////////////////////////////////////////////////////////////////////////////////
    // Mask the gauge field
    ////////////////////////////////////////////////////////////////////////////////
    int cb = cbs[smr];
    auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
    Umsk = U;
@ -464,7 +442,7 @@ public:
    AdjMatrixField MpInvJx_nu(grid);
    MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
-    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
+    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
    Fdet2_mu=FdetV;
    Fdet1_mu=Zero();
@ -521,7 +499,7 @@ public:
 	time=-usecond();
 	PlaqR=(-1.0)*PlaqR;
-	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
 	Fdet2_nu = FdetV;
 	time+=usecond();
 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
@ -542,7 +520,7 @@ public:
 	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
-	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 	Fdet2_nu = Fdet2_nu+FdetV;
 	///////////////// -ve nu /////////////////
@ -561,7 +539,7 @@ public:
 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
 	MpInvJx_nu = Cshift(MpInvJx,nu,1);
-	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 	Fdet2_nu = Fdet2_nu+FdetV;
 	// x==
@ -582,7 +560,7 @@ public:
 	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
-	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 	Fdet2_nu = Fdet2_nu+FdetV;
 	/////////////////////////////////////////////////////////////////////
@ -611,7 +589,7 @@ public:
 	MpInvJx_nu = Cshift(MpInvJx,nu,-1);
-	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 	Fdet2_mu = Fdet2_mu+FdetV;
 	//  __
@ -631,7 +609,7 @@ public:
 	MpInvJx_nu = Cshift(MpInvJx,nu,1);
-	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
+	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 	Fdet2_mu = Fdet2_mu+FdetV;
      }
@ -953,10 +931,6 @@ private:
 public:
  /* Standard constructor */
  // Destructor: deletes the grid pointed to by UrbGrid.
  virtual ~SmearedConfigurationMasked()
  {
    delete UrbGrid;
  }
  SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
    : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
  {
@ -965,6 +939,7 @@ public:
    // was resized in base class
    assert(this->SmearedSet.size()==Nsmear);
    GridRedBlackCartesian * UrbGrid;
    UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
    LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
    LatticeComplex tmp(_UGrid);
@ -972,12 +947,11 @@ public:
    for (unsigned int i = 0; i < this->smearingLevels; ++i) {
      masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
      int mu= (i/2) %Nd;
      int cb= (i%2);
      LatticeComplex tmpcb(UrbGrid);
      cbs.push_back(cb);
      masks[i]=Zero();
      ////////////////////
      // Setup the mask
@ -988,6 +962,7 @@ public:
      PokeIndex<LorentzIndex>(masks[i],tmp, mu);
    }
    delete UrbGrid;
  }
  virtual void smeared_force(GaugeField &SigmaTilde) 
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	199818bd6c	Merge pull request #475 from lehner/feature-aurora Sync with GPT on Aurora	2025-03-13 08:55:55 -04:00
Christoph Lehner	fe66c7ca30	verbosity	2025-03-13 12:49:36 +00:00
Christoph Lehner	e9177e4af3	Blas compatibility	2025-03-13 08:48:23 +00:00
Christoph Lehner	d15a6c5933	Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora	2025-03-13 07:29:55 +00:00
Peter Boyle	25ab9325e7	Use hostVector but remove construct resize	2025-03-11 15:02:32 +00:00
Peter Boyle	19f9378b98	Should work on Aurora nowb	2025-03-11 13:50:43 +00:00
Christoph Lehner	9ffd1ed4ce	Merged	2025-03-08 15:30:08 +00:00
Peter Boyle	3d014864e2	Makinig LLVM happy	2025-03-06 14:19:25 -05:00
Peter Boyle	1d22841811	Working on aurora, GPT issue turned up is fixed	2025-03-06 03:20:18 +00:00
Peter Boyle	a1cdda833f	Update WorkArounds.txt	2025-03-05 14:04:23 -05:00
Peter Boyle	ad6db92690	Update WorkArounds.txt	2025-03-05 14:00:26 -05:00
Peter Boyle	e8ff9d8e50	Update WorkArounds.txt	2025-03-05 14:00:04 -05:00
Peter Boyle	795769c636	Update WorkArounds.txt	2025-03-05 13:50:41 -05:00
Peter Boyle	267a39d943	Update WorkArounds.txt	2025-03-05 13:49:43 -05:00
Peter Boyle	3624bd3d22	Update WorkArounds.txt	2025-03-05 13:45:09 -05:00
Peter Boyle	bc12dbbb38	Update WorkArounds.txt	2025-03-05 12:48:56 -05:00
Peter Boyle	eb8a008a8f	Create WorkArounds.txt	2025-03-05 12:41:59 -05:00
Peter Boyle	c4d9aa1a21	Config command that makes GPT happier	2025-02-27 20:12:49 +00:00
Peter Boyle	6ae809ed40	Print not liked on GPT compile	2025-02-27 20:12:49 +00:00
Peter Boyle	311e2aab3f	Update Accelerator.h	2025-02-26 11:42:52 -05:00
Peter Boyle	438dfbdb83	Only throw if there is a pending list entry in CommsComplete	2025-02-25 16:57:27 +00:00
Peter Boyle	b2ce760cf4	Verbose issue with GPT	2025-02-25 16:55:23 +00:00
Peter Boyle	ba9bbe0221	Bounce MPI through host	2025-02-12 19:34:59 +00:00
Peter Boyle	4c3dd82d84	CSHIFT with bounce throuhgh Host memory on MPI packets	2025-02-12 19:09:53 +00:00
Peter Boyle	44e911b5b7	Comment change	2025-02-12 17:37:55 +00:00
Peter Boyle	a7a16df9d0	GET not put has kinder barrier sequence for NVLINK type access as when GET is done, I can use it without barrier. Moves a barrier to a nicer place, overlapped with DtoH DMA	2025-02-12 14:59:28 +00:00
Peter Boyle	382e0abefd	Was issueing a double fence -- the gather also fences	2025-02-12 14:57:28 +00:00
Peter Boyle	6fdefe5b90	Barrier sequencing if doing "GET" not "PUT" is different. This is somewhat better timing for Barriers	2025-02-12 14:55:20 +00:00
Peter Boyle	4788dd8e2e	More states in packet progression for GPU non aware MPI	2025-02-12 14:53:57 +00:00
Peter Boyle	1cc5f221f3	GET not put ordering is better as I know when I've got all MY data	2025-02-12 14:53:05 +00:00
Peter Boyle	93251bfba0	GET not put for better ordering in the downstream dependent kernels -- I know when I'm done, so we can move a barrier / handshake between ranks intranode to a point off critical path	2025-02-12 14:50:21 +00:00
Peter Boyle	18b79508b8	New line better for pretty print	2025-02-12 14:49:48 +00:00
Peter Boyle	4de5ed1613	Remove vector view. The std::vector will not inform Memory manager of deletion and so a stale entry could be left. It is not and should not be used.	2025-02-12 14:48:46 +00:00
Peter Boyle	0baaddbe98	Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384 nodes. More concurrency/fine grained scheduling is possible.	2025-02-04 19:27:26 +00:00
Peter Boyle	b50fb34e71	Perf on Aurora	2025-02-01 18:39:34 +00:00
Peter Boyle	de84d730ff	Fastest run config on Aurora to date	2025-02-01 18:08:40 +00:00
Peter Boyle	c74d11e3d7	PVdagM MG	2025-02-01 11:04:13 -05:00
Christoph Lehner	84cab5e6e7	no comms and log cleanup	2025-02-01 16:37:21 +01:00
Peter Boyle	c4fc972fec	Merge branch 'feature/deprecate-uvm' into develop	2025-01-31 16:32:36 +00:00
Peter Boyle	8cf809e231	Best results on Aurora so far	2025-01-31 16:14:45 +00:00
Peter Boyle	94019a922e	Significantly better performance on Aurora without using pipeline mode	2025-01-30 16:36:46 +00:00
Peter Boyle	d6b2727f86	Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora	2025-01-29 09:22:21 +00:00
Peter Boyle	74a4f43946	Optional host buffer bounce for no CUDA aware MPI	2025-01-28 15:22:46 +00:00
Peter Boyle	1caf8b0f86	Rename	2025-01-28 15:22:37 +00:00
Peter Boyle	3f3661a86f	Heading towards PVdagM multigrid	2025-01-17 14:33:35 +00:00
Peter Boyle	8fe429346f	Dslash testing for reproduce	2024-11-11 23:11:11 +00:00
Peter Boyle	5a4f9bf2e3	Force the ROCM version	2024-10-29 18:12:31 -04:00
Peter Boyle	b91fc1b6b4	Merge branch 'feature/boosted' into feature/deprecate-uvm Fixed boosted free field test	2024-10-28 16:53:09 -04:00
Peter Boyle	eafc150034	Test fft asserts	2024-10-23 16:46:26 -04:00
Peter Boyle	2877f1a268	Verbose reduce	2024-10-23 15:14:16 -04:00
Peter Boyle	1e893af775	GPU happy	2024-10-23 14:52:15 -04:00
Peter Boyle	d9f430a575	Happy GPU	2024-10-23 14:51:16 -04:00
Peter Boyle	63abe87f36	Memory manager verbose improvements that were useful to track an error	2024-10-23 14:49:13 -04:00
Peter Boyle	368d649c8a	feature/deprecate-uvm happier -- preallocate device resident neigbour table	2024-10-23 14:47:55 -04:00
Peter Boyle	5603464f39	Fix in partial fraction import/export physical and make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class	2024-10-23 14:45:58 -04:00
Peter Boyle	655c79f39e	Suppress warning on partial override	2024-10-23 14:44:41 -04:00
Peter Boyle	565b231c03	Nvcc happy	2024-10-23 14:44:17 -04:00
Peter Boyle	62a9f180fa	NVCC happy	2024-10-23 14:44:04 -04:00
Peter Boyle	5ae77876a8	Meson field and Aslash field on GPU; some compiler warning removed	2024-10-18 19:08:06 -04:00
Peter Boyle	4ed2c2c74f	Config command	2024-10-18 13:58:33 -04:00
Peter Boyle	955da582b6	Working on NVCC	2024-10-18 13:58:03 -04:00
Peter Boyle	11b07b950d	Vanilla linux compile, assuming spack prerequisites	2024-10-18 13:57:40 -04:00
Peter Boyle	8f70cfeda9	Clean up	2024-10-18 13:56:53 -04:00
Peter Boyle	ce64271048	Remove the copying version	2024-10-18 13:56:24 -04:00
Peter Boyle	5cc4f3241d	Meson field test	2024-10-18 15:42:30 +00:00
Peter Boyle	6815e138b4	Boosted fermion attempt	2024-10-17 18:37:33 +01:00
Peter Boyle	a78a61d76f	Update configure	2024-10-15 14:38:45 +00:00
Peter Boyle	2eff3f34ed	Alternate reduction; default to grids own but make a configure flag --enable-reduction=grid\|mpi	2024-10-15 14:36:06 +00:00
Peter Boyle	03687c1d62	Final version of test, closer to original again	2024-10-15 14:35:17 +00:00
Peter Boyle	febfe4e77f	Make my own reduction a configure flag	2024-10-15 14:32:35 +00:00
Peter Boyle	4d1aa134b5	Use normal reduction, configure flag to force deterministic	2024-10-15 14:32:11 +00:00
Peter Boyle	5ec879860a	Odd rounding issue - bears looking into	2024-10-15 14:30:54 +00:00
Peter Boyle	f617468e04	Update Lattice_base.h	2024-10-11 10:39:16 -04:00
Peter Boyle	b728af903c	Fast axpy norm under CFLAG	2024-10-11 03:23:09 +00:00
Peter Boyle	54f1999030	axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues	2024-10-11 03:22:18 +00:00
Peter Boyle	fd58f0b669	Return ok	2024-10-11 03:21:21 +00:00
Peter Boyle	c5c67b706e	cl::sycl -> SYCL	2024-10-10 22:04:12 +00:00
Peter Boyle	be7a543e2c	Revert barriers -- these were not the problem	2024-10-10 22:03:29 +00:00
Peter Boyle	68f112d576	New software moves cl::sycl	2024-10-10 22:03:04 +00:00
Peter Boyle	ec1395a304	Better flight logging	2024-10-10 22:01:57 +00:00
Peter Boyle	beb0e474ee	Use deterministic own brand reduction	2024-10-10 22:01:24 +00:00
Peter Boyle	2b5fdcbbc5	New software version	2024-10-10 21:59:02 +00:00
Peter Boyle	295127d456	Deterministic homebrew reduction	2024-10-10 21:58:26 +00:00
Peter Boyle	7dcfb13694	New software stack	2024-10-10 21:57:35 +00:00
Peter Boyle	ee4046fe92	Added a dimension ordered column sum based reduction for scalar. Removes dependence on MPI_Allreduce and allows for work around on systems where this is bollox.	2024-09-27 09:26:03 -04:00
Peter Boyle	2a9cfeb9ea	New files	2024-09-26 14:23:29 -04:00
Peter Boyle	1147b8ea40	Cheby poly setup	2024-09-26 14:20:32 -04:00
Peter Boyle	3f9119b39d	Remove vectors used for the power spectrum table in paper	2024-09-26 14:19:41 -04:00
Peter Boyle	35e8225abd	Verbose control	2024-09-26 14:18:35 -04:00
Peter Boyle	bdbfbb7a14	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-09-26 14:05:45 -04:00
Peter Boyle	f7d4be8d96	Calculate bytes correctly	2024-09-26 14:04:44 -04:00
Peter Boyle	9fa8bd6438	Configure for AOT on Aurora latest software	2024-09-23 11:25:44 +00:00
Peter Boyle	02c8178f16	Almost working on Aurora	2024-09-23 09:43:50 +00:00
Peter Boyle	e637fbacae	Verbose remove	2024-09-23 09:42:43 +00:00
Peter Boyle	066544281f	Deprecate UVM	2024-09-17 13:34:27 +00:00
Peter Boyle	11be10d2c0	Aurora testing	2024-09-10 18:11:52 +00:00
Peter Boyle	160969a758	UVM tester, doesn't turn up anything	2024-09-10 18:09:42 +00:00
Peter Boyle	622f78ebea	SYCL updates -- operator = giving trouble on Aurora. SYCL reduction is failing intermittently with SVM interface - returns zero, expect non-zero. Think I need to remove ALL dependence on SVM.	2024-09-04 13:53:48 +00:00
Peter Boyle	aa67a5b095	Rename	2024-08-27 19:54:01 +00:00
Peter Boyle	af9ea0864c	Blas fix	2024-08-27 19:53:09 +00:00
Peter Boyle	4e2a6d87c4	Gemm batched fix	2024-08-27 19:24:05 +00:00
Peter Boyle	a465ecece9	Aurora	2024-08-27 19:20:43 +00:00
Peter Boyle	575eb72182	Converges on 16^3	2024-08-27 19:20:38 +00:00
Peter Boyle	3a973914d6	Compile on frontier	2024-08-27 14:55:42 -04:00
Peter Boyle	f568c07bbd	Improved the BLAS benchmark	2024-08-27 14:53:54 -04:00
Peter Boyle	2c9878fc3a	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-08-27 12:05:46 -04:00
Peter Boyle	27b1b1b005	Checkerboard available for offloading pickCheckerboard	2024-08-27 12:04:09 -04:00
Peter Boyle	130d7ab077	Verbose changes	2024-08-27 12:03:28 -04:00
Peter Boyle	29f6b8a74a	Setup	2024-08-27 12:02:49 -04:00
Peter Boyle	9779aaea33	16^3 optimise	2024-08-27 11:38:35 -04:00
Peter Boyle	ec25604a67	Fastest solver for mrhs multigrid	2024-08-27 11:32:34 -04:00
Peter Boyle	3668e81c5e	Extract slice working on checkerboard field for Block Lanczos	2024-08-27 11:31:30 -04:00
Peter Boyle	d66b2423cb	Move slice operations to GPU for BlockCG	2024-08-27 11:28:47 -04:00
Peter Boyle	15cc78f0b6	peek/poke local site on checkerboard arrays	2024-08-27 11:23:42 -04:00
Peter Boyle	06db4ddea2	Fast init on GPU	2024-08-27 11:22:33 -04:00
Peter Boyle	6cfb90e99f	Support needed for accelerator resident set/pick Checkerboard	2024-08-27 11:19:00 -04:00
Peter Boyle	d8be95a2a3	Don't early terminate power method to get more accurate top EV	2024-08-27 11:17:37 -04:00
Peter Boyle	f82702872d	Normal residual	2024-08-27 11:16:44 -04:00
Peter Boyle	3752c49ef0	Add option to record the CG polynomial	2024-08-27 11:14:35 -04:00
Peter Boyle	fe65fa4988	MulMatrix	2024-08-27 11:13:18 -04:00
Peter Boyle	1fe4c205a3	Adef	2024-08-27 11:11:47 -04:00
Peter Boyle	d4dc5e0f43	BlockCG linalg acceleratoin with BLAS	2024-08-27 11:08:33 -04:00
Peter Boyle	77944437ce	Functor initialisation	2024-08-27 11:01:02 -04:00
Peter Boyle	c164bff758	MMdag	2024-08-27 11:00:36 -04:00
Peter Boyle	aa2e3d954a	MMdag operator	2024-08-27 10:59:29 -04:00
Peter Boyle	de62b04728	Block CG linalg acceleration	2024-08-27 10:58:54 -04:00
Peter Boyle	d0bdb50f24	Analyse power spectrum	2024-08-27 10:58:19 -04:00
Peter Boyle	a8fecbc609	BlockCG linalg via BLAS	2024-08-21 16:08:16 -04:00
Peter Boyle	e29b97b3ea	Qslash term added	2023-09-14 16:14:03 -04:00
Peter Boyle	ad2b699d2b	Better macos	2023-09-14 16:12:21 -04:00
`@ -1,2 +1,2 @@`

	`mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench`	`mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL`
		`@ -0,0 +1,2 @@`

							`mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL`