Merge branch 'develop' of https://github.com/paboyle/Grid into specflow

2026-07-21 02:53:27 +01:00 · 2025-04-18 19:55:36 +00:00
parent 570b72a47b e652fc2825
commit cee4c8ce8c
149 changed files with 4714 additions and 3394 deletions
@@ -168,6 +168,7 @@ public:
  template<class vobj>
  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
+    std::cerr << "FFTW is not compiled but is called"<<std::endl;
    assert(0);
 #else
    conformable(result.Grid(),vgrid);
@@ -190,6 +191,7 @@ public:
      
    Lattice<sobj> pgbuf(&pencil_g);
    autoView(pgbuf_v , pgbuf, CpuWrite);
+    //std::cout << "CPU view" << std::endl;
    
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
@@ -213,6 +215,7 @@ public:
    else if ( sign == forward ) div = 1.0;
    else assert(0);
      
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -226,6 +229,7 @@ public:
    }
      
    // Barrel shift and collect global pencil
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Nd), gcoor(Nd);
    result = source;
    int pc = processor_coor[dim];
@@ -247,6 +251,7 @@ public:
      }
    }
      
+    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
    // Loop over orthog coords
    int NN=pencil_g.lSites();
    GridStopWatch timer;
@@ -269,6 +274,7 @@ public:
    usec += timer.useconds();
    flops+= flops_call*NN;
      
+    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
@@ -285,6 +291,7 @@ public:
    }
    result = result*div;
      
+    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
 #endif
@@ -277,6 +277,38 @@ public:
    assert(0);
  }
 };
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  RealD shift;
+public:
+  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+    out = out + shift*in;
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op     (const Field &in, Field &out){
+    _Mat.M(in,out);
+    out = out + shift * in;
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    assert(0);
+  }
+  void HermOp(const Field &in, Field &out){
+    assert(0);
+  }
+};

 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
@@ -89,9 +89,9 @@ public:
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::gpu_selector selector;
-      cl::sycl::device selectedDevice { selector };
-      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      sycl::gpu_selector selector;
+      sycl::device selectedDevice { selector };
+      sycl::property_list q_prop{sycl::property::queue::in_order()};
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
      gridblasInit=1;
@@ -208,8 +208,8 @@ public:
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);

-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@@ -367,28 +367,67 @@ public:
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
        });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@@ -414,8 +453,8 @@ public:
    RealD t2=usecond();
    int32_t batchCount = Amk.size();

-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
@@ -514,28 +553,70 @@ public:
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  else
+	    eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  } );
      } else { 
 	assert(0);
@@ -661,29 +742,41 @@ public:
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;	  
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-	  } );
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
      } else { 
 	assert(0);
      }
@@ -809,28 +902,40 @@ public:
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  else
+	    eCmn = alpha * eAmk * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn ;
 	  });
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk * eBkn.transpose() ;
 	  });
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 	thread_for (p, batchCount, {
 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  if (std::abs(beta) != 0.0)
+	    eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  else
+	    eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 	  });
      } else { 
 	assert(0);
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
-      
+      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

      if(norm < OuterLoopNormMult * stop){
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 	break;
      }
-      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
@@ -102,11 +102,11 @@ public:
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
  
-    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
+    // remove dynamic sized arrays on stack; 2d is a pain with vector
+    std::vector<RealD>  bs(nshift);
+    std::vector<RealD>  rsq(nshift);
+    std::vector<std::array<RealD,2> >  z(nshift);
+    std::vector<int>     converged(nshift);
  
    const int       primary =0;
  
@@ -123,11 +123,11 @@ public:
    assert(mresidual.size()==nshift);
  
    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  rsqf[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
+    std::vector<RealD>  bs(nshift);
+    std::vector<RealD>  rsq(nshift);
+    std::vector<RealD>  rsqf(nshift);
+    std::vector<std::array<RealD,2> >  z(nshift);
+    std::vector<int>     converged(nshift);
  
    const int       primary =0;
  
@@ -156,11 +156,11 @@ public:
    assert(mresidual.size()==nshift);
  
    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  rsqf[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
+    std::vector<RealD>  bs(nshift);
+    std::vector<RealD>  rsq(nshift);
+    std::vector<RealD>  rsqf(nshift);
+    std::vector<std::array<RealD,2> >  z(nshift);
+    std::vector<int>     converged(nshift);
  
    const int       primary =0;
  
@@ -74,7 +74,7 @@ public:

  void operator() (const Field &src, Field &psi){

-    psi=Zero();
+    //    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once

+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+
 NAMESPACE_BEGIN(Grid);

 inline RealD AggregatePowerLaw(RealD x)
@@ -95,7 +97,7 @@ public:

    RealD scale;

-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
+    ConjugateGradient<FineField> CG(1.0e-3,400,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);

@@ -108,7 +110,7 @@ public:
      
      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-      for(int i=0;i<1;i++){
+      for(int i=0;i<4;i++){

 	CG(hermop,noise,subspace[b]);

@@ -124,6 +126,53 @@ public:
    }
  }

+  virtual void CreateSubspaceGCR(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
+  {
+    RealD scale;
+
+    TrivialPrecon<FineField> simple_fine;
+    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+    FineField noise(FineGrid);
+    FineField src(FineGrid);
+    FineField guess(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){
+      
+      subspace[b] = Zero();
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
+
+      for(int i=0;i<2;i++){
+	//  void operator() (const Field &src, Field &psi){
+#if 1
+	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+	src = noise;
+	guess=Zero();
+	GCR(src,guess);
+	subspace[b] = guess;
+#else
+	std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+	src=Zero();
+	guess = noise;
+	GCR(src,guess);
+	subspace[b] = guess;
+#endif
+	noise = subspace[b];
+	scale = std::pow(norm2(noise),-0.5); 
+	noise=noise*scale;
+
+      }
+
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+      subspace[b]   = noise;
+
+    }
+  }
+
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
@@ -160,14 +209,21 @@ public:

    int b =0;
    {
+      ComplexD ip;
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
+
      hermop.Op(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      ip= innerProduct(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+      hermop.AdjOp(Mn,tmp); 
+      ip = innerProduct(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
      b++;
    }

@@ -213,8 +269,18 @@ public:
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
+
+
+	  ComplexD ip;
+
 	  hermop.Op(Mn,tmp);
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  ip= innerProduct(Mn,tmp); 
+	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+
+	  hermop.AdjOp(Mn,tmp); 
+	  ip = innerProduct(Mn,tmp); 
+	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+	  
 	  b++;
 	}

@@ -99,7 +99,7 @@ public:
  CoarseMatrix AselfInvEven;
  CoarseMatrix AselfInvOdd;

-  Vector<RealD> dag_factor;
+  deviceVector<RealD> dag_factor;

  ///////////////////////
  // Interface
@@ -124,9 +124,13 @@ public:
    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
      
-    Vector<Aview> AcceleratorViewContainer;
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
  
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
@@ -161,7 +165,7 @@ public:
      coalescedWrite(out_v[ss](b),res);
      });

-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  };

  void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -190,9 +194,14 @@ public:
    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;

-    Vector<Aview> AcceleratorViewContainer;

-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
+  
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
@@ -201,10 +210,10 @@ public:

    int osites=Grid()->oSites();

-    Vector<int> points(geom.npoint, 0);
-    for(int p=0; p<geom.npoint; p++)
-      points[p] = geom.points_dagger[p];
-
+    deviceVector<int> points(geom.npoint);
+    for(int p=0; p<geom.npoint; p++) { 
+      acceleratorPut(points[p],geom.points_dagger[p]);
+    }
    auto points_p = &points[0];

    RealD* dag_factor_p = &dag_factor[0];
@@ -236,7 +245,7 @@ public:
      coalescedWrite(out_v[ss](b),res);
      });

-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }

  void MdirComms(const CoarseVector &in)
@@ -251,8 +260,14 @@ public:
    out.Checkerboard() = in.Checkerboard();

    typedef LatticeView<Cobj> Aview;
-    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
+  
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    autoView( out_v , out, AcceleratorWrite);
@@ -285,7 +300,7 @@ public:
      }
      coalescedWrite(out_v[ss](b),res);
    });
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
@@ -469,14 +484,20 @@ public:

    // determine in what order we need the points
    int npoint = geom.npoint-1;
-    Vector<int> points(npoint, 0);
-    for(int p=0; p<npoint; p++)
-      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
-
+    deviceVector<int> points(npoint);
+    for(int p=0; p<npoint; p++) {
+      int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
+      acceleratorPut(points[p], val);
+    }
    auto points_p = &points[0];

-    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
+    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
+  
+    for(int p=0;p<geom.npoint;p++) {
+      hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
+      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+    }
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
@@ -539,7 +560,7 @@ public:
      });
    }

-    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
  }
  
  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
@@ -590,11 +611,13 @@ public:
    }

    // GPU readable prefactor
+    std::vector<RealD> h_dag_factor(nbasis*nbasis);
    thread_for(i, nbasis*nbasis, {
      int j = i/nbasis;
      int k = i%nbasis;
-      dag_factor[i] = dag_factor_eigen(j, k);
+      h_dag_factor[i] = dag_factor_eigen(j, k);
    });
+    acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
  }

  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -441,8 +441,20 @@ public:
    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
  }
 #else
+  //////////////////////////////////////////////////////////////////////
+  // Galerkin projection of matrix
+  //////////////////////////////////////////////////////////////////////
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    CoarsenOperator(linop,Subspace,Subspace);
+  }
+  //////////////////////////////////////////////////////////////////////
+  // Petrov - Galerkin projection of matrix
+  //////////////////////////////////////////////////////////////////////
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & U,
+		       Aggregation<Fobj,CComplex,nbasis> & V)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,Subspace.subspace);
+    blockOrthogonalise(InnerProd,V.subspace);
+    blockOrthogonalise(InnerProd,U.subspace);

-    //    for(int s=0;s<Subspace.subspace.size();s++){
-      //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-    //    }
    const int npoint = geom.npoint;
      
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 	tphaseBZ-=usecond();
-	phaV = phaF[p]*Subspace.subspace[i];
+	phaV = phaF[p]*V.subspace[i];
 	tphaseBZ+=usecond();

 	/////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
 	//	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

 	tproj-=usecond();
-	blockProject(coarseInner,MphaV,Subspace.subspace);
+	blockProject(coarseInner,MphaV,U.subspace);
 	coarseInner = conjugate(pha[p]) * coarseInner;

 	ComputeProj[p] = coarseInner;
@@ -69,7 +69,7 @@ public:
  }

  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
@@ -174,19 +174,10 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef ACCELERATOR_CSHIFT
-// Cshift on device
-template<class T> using cshiftAllocator = devAllocator<T>;
-#else
-// Cshift on host
-template<class T> using cshiftAllocator = std::allocator<T>;
-#endif
-
-template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
-template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector    = std::vector<T,devAllocator<T> >;
-template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
-template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
+template<class T> using hostVector          = std::vector<T,alignedAllocator<T> >;           // Needs autoview
+template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // Really want to deprecate
+template<class T> using uvmVector           = std::vector<T,uvmAllocator<T> >;               // auto migrating page
+template<class T> using deviceVector        = std::vector<T,devAllocator<T> >;               // device vector

 /*
 template<class T> class vecView
@@ -197,8 +188,9 @@ template<class T> class vecView
  ViewMode mode;
  void * cpu_ptr;
 public:
+  // Rvalue accessor
  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
-  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
+  vecView(Vector<T> &refer_to_me,ViewMode _mode)
  {
    cpu_ptr = &refer_to_me[0];
    size = refer_to_me.size();
@@ -214,22 +206,12 @@ template<class T> class vecView
  }
 };

-template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
+template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
 {
  vecView<T> ret(vec,_mode); // does the open
  return ret;                // must be closed
 }

-// Little autoscope assister
-template<class View> 
-class VectorViewCloser
-{
-  View v;  // Take a copy of view and call view close when I go out of scope automatically
- public:
-  VectorViewCloser(View &_v) : v(_v) {};
-  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
-};
-
 #define autoVecView(v_v,v,mode)					\
  auto v_v = VectorView(v,mode);				\
  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
@@ -1,16 +1,15 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM

-#warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);

 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];

-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug  << print_buffer << std::endl;
 //#define dprintf(...) 
-
+//#define mprintf(...) 

 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
    AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx",
+	  (uint64_t)AccCache.bytes,
+	  (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
  } else {
    assert(0);
  }
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
    LRUremove(AccCache);
  }

@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
-  uint64_t pagedata[npages];
+  std::vector<uint64_t> pagedata(npages);
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
-  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
+  ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
@@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////

+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(ComplexF &c)
+{
+  GlobalSumP2P(c);
+}
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumP2P(c);
+}
+#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
-{
-  GlobalSumVector((float *)c,2*N);
-}
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  GlobalSumVector((double *)&c,2);
 }
+#endif
+void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
+{
+  GlobalSumVector((float *)c,2*N);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>

+#define NVLINK_GET
+
 NAMESPACE_BEGIN(Grid);

 extern bool Stencil_force_mpi ;
@@ -136,7 +138,7 @@ public:
    for(int d=0;d<_ndimension;d++){
      column.resize(_processors[d]);
      column[0] = accum;
-      std::vector<CommsRequest_t> list;
+      std::vector<MpiCommsRequest_t> list;
      for(int p=1;p<_processors[d];p++){
 	ShiftedRanks(d,p,source,dest);
 	SendToRecvFromBegin(list,
@@ -147,7 +149,8 @@ public:
 			    sizeof(obj),d*100+p);

      }
-      CommsComplete(list);
+      if (!list.empty()) // avoid triggering assert in comms == none
+	CommsComplete(list);
      for(int p=1;p<_processors[d];p++){
 	accum = accum + column[p];
      }
@@ -166,8 +169,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 			   void *xmit,
 			   int dest,
 			   void *recv,
@@ -186,6 +189,17 @@ public:
 			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);

+  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+				      void *xmit,
+				      int xmit_to_rank,int do_xmit,
+				      void *recv,
+				      int recv_from_rank,int do_recv,
+				      int xbytes,int rbytes,int dir);
+
+  // Could do a PollHtoD and have a CommsMerge dependence
+  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
+  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
+
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;

 ////////////////////////////////////////////
@@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
    }
  }
 }
+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(float &f){
+  CartesianCommunicator::GlobalSumP2P(f);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  CartesianCommunicator::GlobalSumP2P(d);
+}
+#else
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+#endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }

-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
@@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(ierr==0);
  list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
  int nreq=list.size();

@@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int from,
 					   int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
-  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  std::vector<MpiCommsRequest_t> reqs(0);

  int myrank = _processor;
  int ierr;
@@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);

-  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -381,12 +387,25 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }

-#undef NVLINK_GET // Define to use get instead of put DMA
+
+#ifdef ACCELERATOR_AWARE_MPI
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+  return 0.0; // Do nothing -- no preparation required
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@@ -420,14 +439,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      off_node_bytes+=rbytes;
    }
 #ifdef NVLINK_GET
+    else { 
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+    }
 #endif
  }
-  
+  // This is a NVLINK PUT  
  if (dox) {
-    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -440,25 +460,338 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 #endif
+    }
+  }
+  return off_node_bytes;
+}

+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+{
+  int nreq=list.size();
+  /*finishes Get/Put*/
+  acceleratorCopySynchronise();
+
+  if (nreq==0) return;
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  assert(ierr==0);
+  list.resize(0);
+  this->StencilBarrier(); 
+}
+
+#else /* NOT     ... ACCELERATOR_AWARE_MPI */
+///////////////////////////////////////////
+// Pipeline mode through host memory
+///////////////////////////////////////////
+  /*
+   * In prepare (phase 1):
+   * PHASE 1: (prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   * - post device - device transfers
+   * PHASE 3: (Complete)
+   * - MPI_waitall
+   * - host-device transfers
+   *
+   *********************************
+   * NB could split this further:
+   *--------------------------------
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (BeginInterNode)
+   * - complete all copies 
+   * - post MPI send asynch
+   * PHASE 3: (BeginIntraNode)
+   * - post device - device transfers
+   * PHASE 4: (Complete)
+   * - MPI_waitall
+   * - host-device transfers asynch
+   * - (complete all copies) 
+   */
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+/*
+ * Bring sequence from Stencil.h down to lower level.
+ * Assume using XeLink is ok
+ */  
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+
+  void * host_recv = NULL;
+  void * host_xmit = NULL;
+
+  /*
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   */
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      host_recv = this->HostBufferMalloc(rbytes);
+      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeRecv;
+      srq.bytes      = rbytes;
+      srq.req        = rrq;
+      srq.host_buf   = host_recv;
+      srq.device_buf = recv;
+      list.push_back(srq);
+      off_node_bytes+=rbytes;
+    }
+  }
+  
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+
+      tag= dir+_processor*32;
+
+      host_xmit = this->HostBufferMalloc(xbytes);
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+      
+      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      //      assert(ierr==0);
+      //      off_node_bytes+=xbytes;
+
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      srq.tag        = tag;
+      srq.dest       = dest;
+      srq.commdir    = commdir;
+      list.push_back(srq);
    }
  }

  return off_node_bytes;
 }
+/*
+ * In the interest of better pipelining, poll for completion on each DtoH and 
+ * start MPI_ISend in the meantime
+ */
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+
+    pending = 0;
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeRecv ) {
+
+	int flag = 0;
+	MPI_Status status;
+	int ierr = MPI_Test(&list[idx].req,&flag,&status);
+	assert(ierr==0);
+
+	if ( flag ) {
+	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+	  list[idx].PacketType=InterNodeReceiveHtoD;
+	} else {
+	  pending ++;
+	}
+      }
+    }
+    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
+  } while ( pending );
+  
+}
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+
+    pending = 0;
+
+    for(int idx = 0; idx<list.size();idx++){
+
+      if ( list[idx].PacketType==InterNodeXmit ) {
+
+	if ( acceleratorEventIsComplete(list[idx].ev) ) {
+
+	  void *host_xmit = list[idx].host_buf;
+	  uint32_t xbytes = list[idx].bytes;
+	  int dest        = list[idx].dest;
+	  int tag         = list[idx].tag;
+	  int commdir     = list[idx].commdir;
+	  ///////////////////
+	  // Send packet
+	  ///////////////////
+
+	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
+	  
+	  MPI_Request xrq;
+	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+	  assert(ierr==0);
+
+	  list[idx].req        = xrq; // Update the MPI request in the list
+
+	  list[idx].PacketType=InterNodeXmitISend;
+
+	} else {
+	  // not done, so return to polling loop
+	  pending++;
+	}
+      }
+    }
+  } while (pending);
+}  
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int dest,int dox,
+							 void *recv,
+							 int from,int dor,
+							 int xbytes,int rbytes,int dir)
+{
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+
+  void * host_xmit = NULL;
+
+  ////////////////////////////////
+  // Receives already posted
+  // Copies already started
+  ////////////////////////////////
+  /*  
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   */
+#ifdef NVLINK_GET
+  if ( dor ) {
+
+    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      assert(shm!=NULL);
+
+      CommsRequest_t srq;
+
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+
+      srq.PacketType = IntraNodeRecv;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+    }
+  }  
+#else
+  if (dox) {
+
+    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+
+      CommsRequest_t srq;
+      
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+
+      srq.PacketType = IntraNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+      
+    }
+  }
+#endif
+  return off_node_bytes;
+}
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  int nreq=list.size();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D

-  acceleratorCopySynchronise();
+  std::vector<MPI_Status> status;
+  std::vector<MPI_Request> MpiRequests;
    
-  if (nreq==0) return;
+  for(int r=0;r<list.size();r++){
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+  }

-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
+  int nreq=MpiRequests.size();
+
+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    assert(ierr==0);
+  }
+  
+  //  for(int r=0;r<nreq;r++){
+  //    if ( list[r].PacketType==InterNodeRecv ) {
+  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
+  //    }
+  //  }
+  
+  
+  list.resize(0);               // Delete the list
+  this->HostBufferFreeAll();    // Clean up the buffer allocs
+#ifndef NVLINK_GET
+  this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
+#endif   
 }
+#endif
+////////////////////////////////////////////
+// END PIPELINE MODE / NO CUDA AWARE MPI
+////////////////////////////////////////////
+
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int xmit_to_rank,int dox,
+							   void *recv,
+							   int recv_from_rank,int dor,
+							   int xbytes,int rbytes, int dir)
+{
+  return 0.0;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,int dox,
@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);

 #if defined (GRID_COMMS_MPI3) 
 typedef MPI_Comm    Grid_MPI_Comm;
+typedef MPI_Request MpiCommsRequest_t;
+#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
 #else
+/*
+ * Enable state transitions as each packet flows.
+ */
+enum PacketType_t {
+  FaceGather,
+  InterNodeXmit,
+  InterNodeRecv,
+  IntraNodeXmit,
+  IntraNodeRecv,
+  InterNodeXmitISend,
+  InterNodeReceiveHtoD
+};
+/*
+ *Package arguments needed for various actions along packet flow
+ */
+typedef struct {
+  PacketType_t PacketType;
+  void *host_buf;
+  void *device_buf;
+  int dest;
+  int tag;
+  int commdir;
+  unsigned long bytes;
+  acceleratorEvent_t ev;
+  MpiCommsRequest_t req;
+} CommsRequest_t;
+#endif
+
+#else 
+typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
@@ -105,7 +137,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  //  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

 };
@@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
+#else
+#ifdef HAVE_NUMAIF_H
+  #warning " Using NUMAIF "
+#include <numaif.h>
+#endif 
 #endif 
 #include <syscall.h>
 #endif
@@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
+  // printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else 
+  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
+#if 0
+  #warning "Moving host buffers to specific NUMA domain"
+  int numa;
+  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
+  if(numa_name) {
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
+    numa = atoi(numa_name);
+    unsigned long page_count = bytes/page_size;
+    std::vector<void *> pages(page_count);
+    std::vector<int>    nodes(page_count,numa);
+    std::vector<int>    status(page_count,-1);
+    for(unsigned long p=0;p<page_count;p++){
+      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
+    }
+    int ret = move_pages(0,
+			 page_count,
+			 &pages[0],
+			 &nodes[0],
+			 &status[0],
+			 MPOL_MF_MOVE);
+    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
+    if (ret) perror(" move_pages failed for reason:");
+  }
+#endif  
+  acceleratorPin(HostCommBuf,bytes);
+#endif  
+
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
@@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
@@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  acceleratorCopyToDevice(src,dest,bytes);
-#else   
-  bcopy(src,dest,bytes);
-#endif
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+//  acceleratorCopyToDevice(src,dest,bytes);
+//#else   
+//  bcopy(src,dest,bytes);
+//#endif
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();

@@ -953,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  //SharedMemoryTest();
+  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
-    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
-    ShmBarrier();
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
-    ShmBarrier();
  }
+  ShmBarrier();
+  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
 }

 void *SharedMemory::ShmBuffer(int rank)
@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #endif 

 NAMESPACE_BEGIN(Grid);
-
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
 auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
 {
@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 extern std::vector<std::pair<int,int> > Cshift_table; 
-extern commVector<std::pair<int,int> > Cshift_table_device; 
+extern deviceVector<std::pair<int,int> > Cshift_table_device; 

 inline std::pair<int,int> *MapCshiftTable(void)
 {
  // GPU version
-#ifdef ACCELERATOR_CSHIFT    
  uint64_t sz=Cshift_table.size();
  if (Cshift_table_device.size()!=sz )    {
    Cshift_table_device.resize(sz);
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
 			  sizeof(Cshift_table[0])*sz);

  return &Cshift_table_device[0];
-#else 
-  return &Cshift_table[0];
-#endif
  // CPU version use identify map
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];

@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  {
    auto buffer_p = & buffer[0];
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for(i,ent,{
-      buffer_p[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
  }
 }

@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int n1=rhs.Grid()->_slice_stride[dimension];

  if ( cbmask ==0x3){
-#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-	int o      =   n*n1;
-	int offset = b+n*e2;
-	
-	vobj temp =rhs_v[so+o+b];
-	extract<vobj>(temp,pointers,offset);
-      });
-#endif
  } else { 
    Coordinate rdim=rhs.Grid()->_rdimensions;
    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
-#ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-
-	Coordinate coor;
-
-	int o=n*n1;
-	int oindex = o+b;
-
-       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
-
-	int ocb=1<<cb;
-	int offset = b+n*e2;
-
-	if ( ocb & cbmask ) {
-	  vobj temp =rhs_v[so+o+b];
-	  extract<vobj>(temp,pointers,offset);
-	}
-      });
-#endif
  }
 }

 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];

@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  {
    auto buffer_p = & buffer[0];
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
-#else
-    autoView( rhs_v, rhs, CpuWrite);
-    thread_for(i,ent,{
-      rhs_v[table[i].first]=buffer_p[table[i].second];
-    });
-#endif
  }
 }

@@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  if(cbmask ==0x3 ) {
    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
    int _slice_block = rhs.Grid()->_slice_block[dimension];
-#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v , rhs, AcceleratorWrite);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
@@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
      });
-#else
-    autoView( rhs_v , rhs, CpuWrite);
-    thread_for2d(n,e1,b,e2,{
-	int o      = n*_slice_stride;
-	int offset = b+n*_slice_block;
-	merge(rhs_v[so+o+b],pointers,offset);
-    });
-#endif
  } else { 

    // Case of SIMD split AND checker dim cannot currently be hit, except in 
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

  {
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    autoView(lhs_v , lhs, CpuWrite);
-    thread_for(i,ent,{
-      lhs_v[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
  }
 }

@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo

  {
    auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
    accelerator_for(i,ent,1,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
-#else
-    autoView( rhs_v, rhs, CpuRead);
-    autoView( lhs_v, lhs, CpuWrite);
-    thread_for(i,ent,{
-      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
-    });
-#endif
  }
 }

@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 NAMESPACE_BEGIN(Grid); 
-
+const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
  typedef typename vobj::vector_type vector_type;
@@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  RealD t1,t0;
  t0=usecond();
  if ( !comm_dim ) {
-    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
+    //    std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
-    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
+    //    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
-    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
+    //    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  t1=usecond();
-  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }

@@ -94,18 +94,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);

-  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
-    //std::cout << "Single pass Cshift_comms" <<std::endl;
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
  } else {
-    //std::cout << "Two pass Cshift_comms" <<std::endl;
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
 }
-#define ACCELERATOR_CSHIFT_NO_COPY
-#ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
@@ -125,8 +123,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+#ifndef ACCELERATOR_AWARE_MPI
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+#endif
  
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -158,18 +160,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
+
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      
      tcomms-=usecond();
-      //      grid->Barrier();
+      grid->Barrier();

+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+#else
+      // bouncy bouncy
+      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+      grid->SendToRecvFrom((void *)&hsend_buf[0],
+			   xmit_to_rank,
+			   (void *)&hrecv_buf[0],
+			   recv_from_rank,
+			   bytes);
+      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+
      xbytes+=bytes;
-      //      grid->Barrier();
+      grid->Barrier();
      tcomms+=usecond();

      tscatter-=usecond();
@@ -177,13 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      tscatter+=usecond();
    }
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
+  if (Cshift_verbose){
+    std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -201,9 +216,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;

-  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
-  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
-  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;

  assert(comm_dim==1);
  assert(simd_layout==2);
@@ -224,8 +239,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;

@@ -233,6 +248,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
+#ifndef ACCELERATOR_AWARE_MPI
+  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+#endif
  
  int bytes = buffer_size*sizeof(scalar_object);

@@ -281,246 +300,31 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

 	tcomms-=usecond();
-	//	grid->Barrier();
+	grid->Barrier();

 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
+#ifdef ACCELERATOR_AWARE_MPI
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
-
-	xbytes+=bytes;
-	//	grid->Barrier();
-	tcomms+=usecond();
-
-	rpointers[i] = &recv_buf_extract[i][0];
-      } else { 
-	rpointers[i] = &send_buf_extract[nbr_lane][0];
-      }
-
-    }
-    tscatter-=usecond();
-    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-    tscatter+=usecond();
-  }
-  /*
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
-}
 #else
-template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
-{
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  GridBase *grid=rhs.Grid();
-  Lattice<vobj> temp(rhs.Grid());
-
-  int fd              = rhs.Grid()->_fdimensions[dimension];
-  int rd              = rhs.Grid()->_rdimensions[dimension];
-  int pd              = rhs.Grid()->_processors[dimension];
-  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
-  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
-  assert(simd_layout==1);
-  assert(comm_dim==1);
-  assert(shift>=0);
-  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-  
-  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
-  vobj *send_buf;
-  vobj *recv_buf;
-  {
-    grid->ShmBufferFreeAll();
-    size_t bytes = buffer_size*sizeof(vobj);
-    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
-    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
-  }
-    
-  int cb= (cbmask==0x2)? Odd : Even;
-  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-
-  for(int x=0;x<rd;x++){       
-
-    int sx        =  (x+sshift)%rd;
-    int comm_proc = ((x+sshift)/rd)%pd;
-    
-    if (comm_proc==0) {
-
-      tcopy-=usecond();
-      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-      tcopy+=usecond();
-
-    } else {
-
-      int words = buffer_size;
-      if (cbmask != 0x3) words=words>>1;
-
-      int bytes = words * sizeof(vobj);
-
-      tgather-=usecond();
-      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
-      tgather+=usecond();
-
-      //      int rank           = grid->_processor;
-      int recv_from_rank;
-      int xmit_to_rank;
-      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-
-
-      tcomms-=usecond();
-      //      grid->Barrier();
-
-      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
-      grid->SendToRecvFrom((void *)&send_buf[0],
-			   xmit_to_rank,
-			   (void *)&recv_buf[0],
-			   recv_from_rank,
-			   bytes);
-      xbytes+=bytes;
-      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
-
-      //      grid->Barrier();
-      tcomms+=usecond();
-
-      tscatter-=usecond();
-      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
-      tscatter+=usecond();
-    }
-  }
-  /*
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
-}
-
-template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
-{
-  GridBase *grid=rhs.Grid();
-  const int Nsimd = grid->Nsimd();
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
-   
-  int fd = grid->_fdimensions[dimension];
-  int rd = grid->_rdimensions[dimension];
-  int ld = grid->_ldimensions[dimension];
-  int pd = grid->_processors[dimension];
-  int simd_layout     = grid->_simd_layout[dimension];
-  int comm_dim        = grid->_processors[dimension] >1 ;
-
-  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
-  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
-  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
-
-  assert(comm_dim==1);
-  assert(simd_layout==2);
-  assert(shift>=0);
-  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-
-  int permute_type=grid->PermuteType(dimension);
-
-  ///////////////////////////////////////////////
-  // Simd direction uses an extract/merge pair
-  ///////////////////////////////////////////////
-  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
-  //  int words = sizeof(vobj)/sizeof(vector_type);
-
-  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
-  scalar_object *  recv_buf_extract_mpi;
-  scalar_object *  send_buf_extract_mpi;
-  {
-    size_t bytes = sizeof(scalar_object)*buffer_size;
-    grid->ShmBufferFreeAll();
-    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
-    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
-  }
-  for(int s=0;s<Nsimd;s++){
-    send_buf_extract[s].resize(buffer_size);
-    recv_buf_extract[s].resize(buffer_size);
-  }
-
-  int bytes = buffer_size*sizeof(scalar_object);
-
-  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
-  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
-
-  ///////////////////////////////////////////
-  // Work out what to send where
-  ///////////////////////////////////////////
-  int cb    = (cbmask==0x2)? Odd : Even;
-  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-
-  // loop over outer coord planes orthog to dim
-  for(int x=0;x<rd;x++){       
-
-    // FIXME call local permute copy if none are offnode.
-    for(int i=0;i<Nsimd;i++){       
-      pointers[i] = &send_buf_extract[i][0];
-    }
-    tgather-=usecond();
-    int sx   = (x+sshift)%rd;
-    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
-    tgather+=usecond();
-
-    for(int i=0;i<Nsimd;i++){
-      
-      int inner_bit = (Nsimd>>(permute_type+1));
-      int ic= (i&inner_bit)? 1:0;
-
-      int my_coor          = rd*ic + x;
-      int nbr_coor         = my_coor+sshift;
-      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
-
-      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
-      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
-      int nbr_lane = (i&(~inner_bit));
-
-      int recv_from_rank;
-      int xmit_to_rank;
-
-      if (nbr_ic) nbr_lane|=inner_bit;
-
-      assert (sx == nbr_ox);
-
-      if(nbr_proc){
-	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-
-	tcomms-=usecond();
-	//	grid->Barrier();
-
-	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
-	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
+      // bouncy bouncy
+	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+	grid->SendToRecvFrom((void *)&hsend_buf[0],
 			     xmit_to_rank,
-			     (void *)recv_buf_extract_mpi,
+			     (void *)&hrecv_buf[0],
 			     recv_from_rank,
 			     bytes);
-	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
-	xbytes+=bytes;
+	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+#endif

-	//	grid->Barrier();
+	xbytes+=bytes;
+	grid->Barrier();
 	tcomms+=usecond();
+
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
@@ -530,17 +334,16 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
-
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
-  */
+  if(Cshift_verbose){
+    std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
 }
-#endif
+
 NAMESPACE_END(Grid); 

 #endif
@@ -1,5 +1,5 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
 std::vector<std::pair<int,int> > Cshift_table; 
-commVector<std::pair<int,int> > Cshift_table_device; 
+deviceVector<std::pair<int,int> > Cshift_table_device; 
 NAMESPACE_END(Grid);
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  });
 }

+#define FAST_AXPY_NORM
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
-    return axpy_norm_fast(ret,a,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpy_norm_fast(ret,a,x,y);
+#else
+  ret = a*x+y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
-    return axpby_norm_fast(ret,a,b,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpby_norm_fast(ret,a,b,x,y);
+#else
+  ret = a*x+b*y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }

 /// Trace product
@@ -237,9 +237,12 @@ public:
    vobj vtmp;
    vtmp = r;
 #if 0
+    deviceVector<vobj> vvtmp(1);
+    acceleratorPut(vvtmp[0],vtmp);
+    vobj *vvtmp_p = & vvtmp[0];
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
-	auto stmp=coalescedRead(vtmp);
+	auto stmp=coalescedRead(*vvtmp_p);
 	coalescedWrite(me[ss],stmp);
    });
 #else    
@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  typedef decltype(basis[0]) Field;
  typedef decltype(basis[0].View(AcceleratorRead)) View;

-  Vector<View> basis_v; basis_v.reserve(basis.size());
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+  hostVector<View>  h_basis_v(basis.size());
+  deviceVector<View> d_basis_v(basis.size());
+  typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
+
  GridBase* grid = basis[0].Grid();
      
  for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorWrite));
+    h_basis_v[k] = basis[k].View(AcceleratorWrite);
+    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }

-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
-  int max_threads = thread_max();
-  Vector < vobj > Bt(Nm * max_threads);
-  thread_region
-    {
-      vobj* B = &Bt[Nm * thread_num()];
-      thread_for_in_region(ss, grid->oSites(),{
-	  for(int j=j0; j<j1; ++j) B[j]=0.;
-      
-	  for(int j=j0; j<j1; ++j){
-	    for(int k=k0; k<k1; ++k){
-	      B[j] +=Qt(j,k) * basis_v[k][ss];
-	    }
-	  }
-	  for(int j=j0; j<j1; ++j){
-	    basis_v[j][ss] = B[j];
-	  }
-	});
-    }
-#else
-  View *basis_vp = &basis_v[0];
+  View *basis_vp = &d_basis_v[0];

  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead

-  Vector <vobj> Bt(siteBlock * nrot); 
+  deviceVector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];

  // GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
+  deviceVector<Coeff_t> Qt_jv(Nm*Nm);
  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
-      Qt_p[i]=Qt(j,k);
+      h_Qt_jv[i]=Qt(j,k);
  });
+  acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));

  // Block the loop to keep storage footprint down
  for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
-#endif

-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }

 // Extract a single rotated vector
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in

  result.Checkerboard() = basis[0].Checkerboard();

-  Vector<View> basis_v; basis_v.reserve(basis.size());
+  hostVector<View>  h_basis_v(basis.size());
+  deviceVector<View> d_basis_v(basis.size());
  for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorRead));
+    h_basis_v[k]=basis[k].View(AcceleratorRead);
+    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }
-  vobj zz=Zero();
-  Vector<double> Qt_jv(Nm);
-  double * Qt_j = & Qt_jv[0];
-  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);

-  auto basis_vp=& basis_v[0];
+  vobj zz=Zero();
+  deviceVector<double> Qt_jv(Nm);
+  double * Qt_j = & Qt_jv[0];
+  for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
+
+  auto basis_vp=& d_basis_v[0];
  autoView(result_v,result,AcceleratorWrite);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    vobj zzz=Zero();
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
    }
    coalescedWrite(result_v[ss], B);
  });
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }

 template<class Field>
@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
  //  const int Nsimd = vobj::Nsimd();
  const int nthread = GridThread::GetThreads();

-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)

  const int nthread = GridThread::GetThreads();

-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
@@ -290,8 +290,10 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();

+  bool ok;
 #ifdef GRID_SYCL
  uint64_t csum=0;
+  uint64_t csum2=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // Hack
@@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
+    ok = FlightRecorder::CsumLog(csum);
+    if ( !ok ) {
+      csum2=svm_xor(base,words);
+      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    } else {
+      //      csum2=svm_xor(base,words);
+      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    }
+    assert(ok);
  }
-  FlightRecorder::CsumLog(csum);
 #endif
+  FlightRecorder::StepLog("rank inner product");
  ComplexD nrm = rankInnerProduct(left,right);
+  //  ComplexD nrmck=nrm;
  RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm)); 
+  ok = FlightRecorder::NormLog(real(nrm));
+  if ( !ok ) {
+    ComplexD nrm2 = rankInnerProduct(left,right);
+    RealD local2 = real(nrm2);
+    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
+    assert(ok);
+  }
+  FlightRecorder::StepLog("Start global sum");
+  //  grid->GlobalSumP2P(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::StepLog("Finished global sum");
+  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }
@@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-#if 0
-  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
-  Vector<inner_t> inner_tmp(sites);
-  auto inner_tmp_v = &inner_tmp[0];
-
-  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
-      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
-      coalescedWrite(z_v[ss],tmp);
-  });
-  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
-#else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  deviceVector<inner_t> inner_tmp;
  inner_tmp.resize(sites);
@@ -365,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
-  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  bool ok;
+#ifdef GRID_SYCL
+  uint64_t csum=0;
+  uint64_t csum2=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // z_v
+    {
+      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&z_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+    // inner_v
+    {
+      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+  }
 #endif
+  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  ok = FlightRecorder::NormLog(real(nrm));
+  assert(ok);
+  RealD local = real(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm));
  return nrm; 
 }
 
@@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  conformable(left,right);

  typedef typename vobj::vector_typeD vector_type;
-  Vector<ComplexD> tmp(2);
+  std::vector<ComplexD> tmp(2);

  GridBase *grid = left.Grid();

@@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  // GPU
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  typedef decltype(innerProductD(vobj(),vobj())) norm_t;
-  Vector<inner_t> inner_tmp(sites);
-  Vector<norm_t>  norm_tmp(sites);
+  deviceVector<inner_t> inner_tmp(sites);
+  deviceVector<norm_t>  norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  {
@@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////

-template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
+					  std::vector<typename vobj::scalar_object> &result,
+					  int orthogdim)
 {
  ///////////////////////////////////////////////////////
  // FIXME precision promoted summation
@@ -460,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

-  Vector<vobj> lvSum(rd); // will locally sum vectors first
-  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
+  std::vector<vobj> lvSum(rd); // will locally sum vectors first
+  std::vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD

  result.resize(fd); // And then global sum to return the same vector to every node 
@@ -509,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
+  //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
+  
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
@@ -552,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

-  Vector<vector_type> lvSum(rd); // will locally sum vectors first
-  Vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
+  std::vector<vector_type> lvSum(rd); // will locally sum vectors first
+  std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  

  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
-#undef UVM_BLOCK_BUFFER  
-#ifndef UVM_BLOCK_BUFFER  
-  commVector<sobj> buffer(numBlocks);
+  deviceVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
-#else
-  Vector<sobj> buffer(numBlocks);
-  sobj *buffer_v = &buffer[0];
-  sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
-  accelerator_barrier();
-  result = *buffer_v;
-#endif
  return result;
 }

@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
  
  const int words = sizeof(vobj)/sizeof(vector);

-  Vector<vector> buffer(osites);
+  deviceVector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
@@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////

+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;
-  static Vector<sobj> mysum;
-  mysum.resize(1);
-  sobj *mysum_p = & mysum[0];
+
  sobj identity; zeroit(identity);
-  mysum[0] = identity;
-  sobj ret ; 
-
+  sobj ret; zeroit(ret);
  Integer nsimd= vobj::Nsimd();
-
-  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-    auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
-     cgh.parallel_for(cl::sycl::range<1>{osites},
-		      Reduction,
-		      [=] (cl::sycl::id<1> item, auto &sum) {
-      auto osite   = item[0];
-      sum +=Reduce(lat[osite]);
-     });
-   });
-  theGridAccelerator->wait();
-  ret = mysum[0];
-  //  free(mysum,*theGridAccelerator);
+  { 
+    sycl::buffer<sobj, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(sycl::range<1>{osites},
+                      Reduction,
+                      [=] (sycl::id<1> item, auto &sum) {
+                        auto osite   = item[0];
+                        sum +=Reduce(lat[osite]);
+                      });
+    });
+  }
  sobjD dret; convertType(dret,ret);
  return dret;
 }
@@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite

 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Word xorResult; xorResult = 0;
-  static Vector<Word> d_sum;
-  d_sum.resize(1);
-  Word *d_sum_p=&d_sum[0];
  Word identity;  identity=0;
-  d_sum[0] = identity;
-  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-    auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
-     cgh.parallel_for(cl::sycl::range<1>{L},
-		      Reduction,
-		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum^=vec[index];
-     });
-   });
+  Word ret = 0;
+  { 
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
+                      Reduction,
+                      [=] (sycl::id<1> index, auto &sum) {
+                        sum ^=vec[index];
+                      });
+    });
+  }
  theGridAccelerator->wait();
-  Word ret = d_sum[0];
-  //  free(d_sum,*theGridAccelerator);
  return ret;
 }

 NAMESPACE_END(Grid);

-/*
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::vector_type  vector;
-  typedef typename vobj::scalar_type  scalar;
-
-  typedef typename vobj::scalar_typeD scalarD;
-  typedef typename vobj::scalar_objectD sobjD;
-
-  sobjD ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  
-  const int nsimd = vobj::Nsimd();
-  const int words = sizeof(vobj)/sizeof(vector);
-
-  Vector<scalar> buffer(osites*nsimd);
-  scalar *buf = &buffer[0];
-  vector *dat = (vector *)lat;
-
-  for(int w=0;w<words;w++) {
-
-    accelerator_for(ss,osites,nsimd,{
-	int lane = acceleratorSIMTlane(nsimd);
-	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
-    });
-    //Precision change at this point is to late to gain precision
-    ret_p[w] = svm_reduce(buf,nsimd*osites);
-  }
-  return ret;
-}
-*/
@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);


 #if defined(GRID_CUDA) || defined(GRID_HIP)
-template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+template<class vobj>
+inline void sliceSumReduction_cub_small(const vobj *Data,
+					std::vector<vobj> &lvSum,
+					const int rd,
+					const int e1,
+					const int e2,
+					const int stride,
+					const int ostride,
+					const int Nsimd)
+{
  size_t subvol_size = e1*e2;
-  commVector<vobj> reduction_buffer(rd*subvol_size);
+  deviceVector<vobj> reduction_buffer(rd*subvol_size);
  auto rb_p = &reduction_buffer[0];
  vobj zero_init;
  zeroit(zero_init);
@@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  
  //copy offsets to device
-  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
  
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
    exit(EXIT_FAILURE);
  }
  
-  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
  accelerator_barrier();
@@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V


 #if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+template<class vobj>
+inline void sliceSumReduction_sycl_small(const vobj *Data,
+					 std::vector <vobj> &lvSum,
+					 const int  &rd,
+					 const int &e1,
+					 const int &e2,
+					 const int &stride,
+					 const int &ostride,
+					 const int &Nsimd)
 {
  size_t subvol_size = e1*e2;

@@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
    mysum[r] = vobj_zero; 
  }

-  commVector<vobj> reduction_buffer(rd*subvol_size);    
+  deviceVector<vobj> reduction_buffer(rd*subvol_size);    

  auto rb_p = &reduction_buffer[0];

@@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
  });

  for (int r = 0; r < rd; r++) {
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+      theGridAccelerator->submit([&](sycl::handler &cgh) {
+          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(sycl::range<1>{subvol_size},
          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
+          [=](sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
@@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
 }
 #endif

-template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+template<class vobj>
+inline void sliceSumReduction_large(const vobj *Data,
+				    std::vector<vobj> &lvSum,
+				    const int rd,
+				    const int e1,
+				    const int e2,
+				    const int stride,
+				    const int ostride,
+				    const int Nsimd)
+{
  typedef typename vobj::vector_type vector;
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
-  commVector<vector>buffer(osites);
+  deviceVector<vector>buffer(osites);
  vector *dat = (vector *)Data;
  vector *buf = &buffer[0];
-  Vector<vector> lvSum_small(rd);
+  std::vector<vector> lvSum_small(rd);
  vector *lvSum_ptr = (vector *)&lvSum[0];

  for (int w = 0; w < words; w++) {
@@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
    }
-
  }
-
-  
 }

-template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+template<class vobj>
+inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
+				  std::vector<vobj> &lvSum,
+				  const int rd,
+				  const int e1,
+				  const int e2,
+				  const int stride,
+				  const int ostride,
+				  const int Nsimd)
 {
  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
    if constexpr (sizeof(vobj) <= 256) { 
@@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
 }


-template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+template<class vobj>
+inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
+				  std::vector<vobj> &lvSum,
+				  const int &rd,
+				  const int &e1,
+				  const int &e2,
+				  const int &stride,
+				  const int &ostride,
+				  const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
@@ -208,16 +247,20 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
  });
 }

-template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
+template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
+						   std::vector<vobj> &lvSum,
+						   const int &rd,
+						   const int &e1,
+						   const int &e2,
+						   const int &stride,
+						   const int &ostride,
+						   const int &Nsimd) 
 {
-  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #else
+#else
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-
-  #endif
+#endif
 }


@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
 *
 */

-template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
+template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
 					      Lattice<vobj> &lat,
 					      int x,
 					      int dim,
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
  });
 }

-template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
+template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
 					     const Lattice<vobj> &lat,
 					     int x,
 					     int dim,
@@ -462,13 +462,19 @@ public:
    int rNsimd = Nsimd / simd[dimension];
    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);

-    static cshiftVector<vobj> send_buf; 
-    static cshiftVector<vobj> recv_buf;
+    static deviceVector<vobj> send_buf; 
+    static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);    
    recv_buf.resize(buffer_size*2*depth);
+#ifndef ACCELERATOR_AWARE_MPI
+    static hostVector<vobj> hsend_buf; 
+    static hostVector<vobj> hrecv_buf;
+    hsend_buf.resize(buffer_size*2*depth);    
+    hrecv_buf.resize(buffer_size*2*depth);
+#endif    

-    std::vector<CommsRequest_t> fwd_req;   
-    std::vector<CommsRequest_t> bwd_req;   
+    std::vector<MpiCommsRequest_t> fwd_req;   
+    std::vector<MpiCommsRequest_t> bwd_req;   

    int words = buffer_size;
    int bytes = words * sizeof(vobj);
@@ -495,9 +501,17 @@ public:
      t_gather+=usecond()-t;

      t=usecond();
+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
 				(void *)&send_buf[d*buffer_size], xmit_to_rank,
 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+#else
+      acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
+      grid->SendToRecvFromBegin(fwd_req,
+				(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
+				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+      acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
+#endif
      t_comms+=usecond()-t;
     }
    for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +522,17 @@ public:
      t_gather+= usecond() - t;

      t=usecond();
+#ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+#else
+      acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
+      grid->SendToRecvFromBegin(bwd_req,
+				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
+				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+      acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
+#endif      
      t_comms+=usecond()-t;
    }

@@ -132,6 +132,10 @@ public:
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
+  using Action<GaugeField>::refresh;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative
@@ -55,6 +55,11 @@ public:
  RealD alpha; // Mobius scale
  RealD k;     // EOFA normalization constant

+  // Device resident
+  deviceVector<Coeff_t> d_shift_coefficients;
+  deviceVector<Coeff_t> d_MooeeInv_shift_lc;
+  deviceVector<Coeff_t> d_MooeeInv_shift_norm;
+  
  virtual void Instantiatable(void) = 0;

  // EOFA-specific operations
@@ -92,6 +97,11 @@ public:
    this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
      ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
      ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
+    
+    d_shift_coefficients.resize(Ls);
+    d_MooeeInv_shift_lc.resize(Ls);
+    d_MooeeInv_shift_norm.resize(Ls);
+
  };
 };

@@ -90,16 +90,16 @@ public:
  void M5D(const FermionField &psi,
 	   const FermionField &phi,
 	   FermionField &chi,
-	   Vector<Coeff_t> &lower,
-	   Vector<Coeff_t> &diag,
-	   Vector<Coeff_t> &upper);
+	   std::vector<Coeff_t> &lower,
+	   std::vector<Coeff_t> &diag,
+	   std::vector<Coeff_t> &upper);

  void M5Ddag(const FermionField &psi,
 	      const FermionField &phi,
 	      FermionField &chi,
-	      Vector<Coeff_t> &lower,
-	      Vector<Coeff_t> &diag,
-	      Vector<Coeff_t> &upper);
+	      std::vector<Coeff_t> &lower,
+	      std::vector<Coeff_t> &diag,
+	      std::vector<Coeff_t> &upper);

  virtual void   Instantiatable(void)=0;

@@ -119,35 +119,51 @@ public:
  RealD mass_plus, mass_minus;

  // Save arguments to SetCoefficientsInternal
-  Vector<Coeff_t> _gamma;
+  std::vector<Coeff_t> _gamma;
  RealD                _zolo_hi;
  RealD                _b;
  RealD                _c;

+  // possible boost
+  std::vector<ComplexD> qmu;
+  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+  
  // Cayley form Moebius (tanh and zolotarev)
-  Vector<Coeff_t> omega;
-  Vector<Coeff_t> bs;    // S dependent coeffs
-  Vector<Coeff_t> cs;
-  Vector<Coeff_t> as;
+  std::vector<Coeff_t> omega;
+  std::vector<Coeff_t> bs;    // S dependent coeffs
+  std::vector<Coeff_t> cs;
+  std::vector<Coeff_t> as;
  // For preconditioning Cayley form
-  Vector<Coeff_t> bee;
-  Vector<Coeff_t> cee;
-  Vector<Coeff_t> aee;
-  Vector<Coeff_t> beo;
-  Vector<Coeff_t> ceo;
-  Vector<Coeff_t> aeo;
+  std::vector<Coeff_t> bee;
+  std::vector<Coeff_t> cee;
+  std::vector<Coeff_t> aee;
+  std::vector<Coeff_t> beo;
+  std::vector<Coeff_t> ceo;
+  std::vector<Coeff_t> aeo;
  // LDU factorisation of the eeoo matrix
-  Vector<Coeff_t> lee;
-  Vector<Coeff_t> leem;
-  Vector<Coeff_t> uee;
-  Vector<Coeff_t> ueem;
-  Vector<Coeff_t> dee;
+  std::vector<Coeff_t> lee;
+  std::vector<Coeff_t> leem;
+  std::vector<Coeff_t> uee;
+  std::vector<Coeff_t> ueem;
+  std::vector<Coeff_t> dee;
+
+  // Device memory
+  deviceVector<Coeff_t> d_diag;
+  deviceVector<Coeff_t> d_upper;
+  deviceVector<Coeff_t> d_lower;
+
+  deviceVector<Coeff_t> d_lee;
+  deviceVector<Coeff_t> d_dee;
+  deviceVector<Coeff_t> d_uee;
+  deviceVector<Coeff_t> d_leem;
+  deviceVector<Coeff_t> d_ueem;

  // Matrices of 5d ee inverse params
-  Vector<iSinglet<Simd> >  MatpInv;
-  Vector<iSinglet<Simd> >  MatmInv;
-  Vector<iSinglet<Simd> >  MatpInvDag;
-  Vector<iSinglet<Simd> >  MatmInvDag;
+  //  std::vector<iSinglet<Simd> >  MatpInv;
+  //  std::vector<iSinglet<Simd> >  MatmInv;
+  //  std::vector<iSinglet<Simd> >  MatpInvDag;
+  //  std::vector<iSinglet<Simd> >  MatmInvDag;

  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
@@ -187,7 +203,7 @@ public:
 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
-  virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
+  virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 };

 NAMESPACE_END(Grid);
@@ -60,6 +60,50 @@ public:
  //      virtual void   Instantiatable(void)=0;
  virtual void   Instantiatable(void) =0;

+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// Shift coordinate lattice index by 1 to account for 5th dimension.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
+    std::vector<Complex> boundary;
+    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+    FreePropagator(in,out,mass,boundary,twist);
+  };
+
+  
  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);
@@ -90,12 +134,12 @@ protected:
  RealD mass;
  RealD R;
  RealD ZoloHiInv;
-  Vector<double> Beta;
-  Vector<double> cc;;
-  Vector<double> cc_d;;
-  Vector<double> sqrt_cc;
-  Vector<double> See;
-  Vector<double> Aee;
+  std::vector<double> Beta;
+  std::vector<double> cc;;
+  std::vector<double> cc_d;;
+  std::vector<double> sqrt_cc;
+  std::vector<double> See;
+  std::vector<double> Aee;

 };

@@ -69,10 +69,10 @@ public:
  // Instantiate different versions depending on Impl
  /////////////////////////////////////////////////////
  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  virtual void RefreshShiftCoefficients(RealD new_shift);

@@ -83,7 +83,7 @@ public:
 			RealD _M5, const ImplParams& p=ImplParams());

 protected:
-  void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
+  void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
 };

 NAMESPACE_END(Grid);
@@ -102,11 +102,11 @@ public:
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);

-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
                    const FermionField &in, FermionField &out, int dag);

  //////////////////////////////////////////////////////////////////////////
@@ -164,8 +164,6 @@ public:
  DoubledGaugeField UUUmuEven;
  DoubledGaugeField UUUmuOdd;

-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
  
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
@@ -100,7 +100,6 @@ public:
 		     int dag);
    
  void DhopInternal(StencilImpl & st,
-		    LebesgueOrder &lo,
 		    DoubledGaugeField &U,
 		    DoubledGaugeField &UUU,
 		    const FermionField &in, 
@@ -108,7 +107,6 @@ public:
 		    int dag);
    
    void DhopInternalOverlappedComms(StencilImpl & st,
-		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      DoubledGaugeField &UUU,
 		      const FermionField &in, 
@@ -116,7 +114,6 @@ public:
 		      int dag);

    void DhopInternalSerialComms(StencilImpl & st,
-		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      DoubledGaugeField &UUU,
 		      const FermionField &in, 
@@ -192,8 +189,6 @@ public:
  DoubledGaugeField UUUmuEven;
  DoubledGaugeField UUUmuOdd;
    
-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
    
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
@@ -42,11 +42,11 @@ public:

 public:
  // Shift operator coefficients for red-black preconditioned Mobius EOFA
-  Vector<Coeff_t> Mooee_shift;
-  Vector<Coeff_t> MooeeInv_shift_lc;
-  Vector<Coeff_t> MooeeInv_shift_norm;
-  Vector<Coeff_t> MooeeInvDag_shift_lc;
-  Vector<Coeff_t> MooeeInvDag_shift_norm;
+  std::vector<Coeff_t> Mooee_shift;
+  std::vector<Coeff_t> MooeeInv_shift_lc;
+  std::vector<Coeff_t> MooeeInv_shift_norm;
+  std::vector<Coeff_t> MooeeInvDag_shift_lc;
+  std::vector<Coeff_t> MooeeInvDag_shift_norm;

  virtual void Instantiatable(void) {};

@@ -74,18 +74,18 @@ public:
  // Instantiate different versions depending on Impl
  /////////////////////////////////////////////////////
  void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-		 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-		 Vector<Coeff_t>& shift_coeffs);
+		 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+		 std::vector<Coeff_t>& shift_coeffs);

  void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
+	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

  void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-		    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
-		    Vector<Coeff_t>& shift_coeffs);
+		    std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
+		    std::vector<Coeff_t>& shift_coeffs);

  virtual void RefreshShiftCoefficients(RealD new_shift);

@@ -102,11 +102,11 @@ public:
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);

-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
 			       const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
 				   const FermionField &in, FermionField &out, int dag);

  //////////////////////////////////////////////////////////////////////////
@@ -152,9 +152,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;

-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
-  
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
@@ -42,7 +42,7 @@ public:

     void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
       this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
+     };

  // Constructors
  OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
@@ -41,6 +41,10 @@ public:
 public:

  // Constructors
+  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };

  OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
 				      GridCartesian         &FiveDimGrid,
@@ -41,6 +41,9 @@ public:
 public:

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
 				   GridCartesian         &FiveDimGrid,
@@ -40,6 +40,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
 					GridCartesian         &FiveDimGrid,
@@ -41,6 +41,9 @@ public:
 public:

  virtual void   Instantiatable(void){};
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
  // Constructors
  OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
 					  GridCartesian         &FiveDimGrid,
@@ -40,6 +40,11 @@ public:
  INHERIT_IMPL_TYPES(Impl);

  virtual void   Instantiatable(void){};
+
+  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+    this->MomentumSpacePropagatorHw(out,in,_m,twist);
+  };
+
  // Constructors
  OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
 					       GridCartesian         &FiveDimGrid,
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
  INHERIT_IMPL_TYPES(Impl);

-  const int part_frac_chroma_convention=1;
+  const int part_frac_chroma_convention=0;

  void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
  void   Mooee_internal(const FermionField &in, FermionField &out,int dag);
@@ -83,19 +83,78 @@ public:
 			   GridRedBlackCartesian &FourDimRedBlackGrid,
 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams());

+  PartialFractionFermion5D(GaugeField &_Umu,
+			   GridCartesian         &FiveDimGrid,
+			   GridRedBlackCartesian &FiveDimRedBlackGrid,
+			   GridCartesian         &FourDimGrid,
+			   GridRedBlackCartesian &FourDimRedBlackGrid,
+			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
+
+  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
+  {
+    std::cout << "Free Propagator for PartialFraction"<<std::endl;
+    FermionField in_k(in.Grid());
+    FermionField prop_k(in.Grid());
+    
+    FFT theFFT((GridCartesian *) in.Grid());
+
+    //phase for boundary condition
+    ComplexField coor(in.Grid());
+    ComplexField ph(in.Grid());  ph = Zero();
+    FermionField in_buf(in.Grid()); in_buf = Zero();
+    typedef typename Simd::scalar_type Scalar;
+    Scalar ci(0.0,1.0);
+    assert(twist.size() == Nd);//check that twist is Nd
+    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    int shift = 0;
+    for(unsigned int nu = 0; nu < Nd; nu++)
+      {
+	// Shift coordinate lattice index by 1 to account for 5th dimension.
+	LatticeCoordinate(coor, nu + shift);
+	double boundary_phase = ::acos(real(boundary[nu]));
+	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
+	//momenta for propagator shifted by twist+boundary
+	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
+      }
+    in_buf = exp(ci*ph*(-1.0))*in;
+
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    if ( this->qmu.size() ){
+      this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
+    } else {
+      this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
+    }
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+    
+    //phase for boundary condition
+    out = out * exp(ci*ph);
+  };
+
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
+    std::vector<Complex> boundary;
+    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
+    FreePropagator(in,out,mass,boundary,twist);
+  };
+
+  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void addQmu(const FermionField &in, FermionField &out, int dag);
+
 protected:

  virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
  virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);

+  std::vector<RealD> qmu;
+
  // Part frac
  RealD mass;
  RealD dw_diag;
  RealD R;
  RealD amax;
  RealD scale;
-  Vector<double> p; 
-  Vector<double> q;
+  std::vector<double> p; 
+  std::vector<double> q;

 };

@@ -35,7 +35,7 @@ template<class Matrix, class Field>
 class KappaSimilarityTransform {
 public:
  INHERIT_IMPL_TYPES(Matrix);
-  Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+  std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;

  KappaSimilarityTransform (Matrix &zmob) {
    for (int i=0;i<(int)zmob.bs.size();i++) {
@@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   
 public:

-  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
+  void DhopImproved(StencilImpl &st,
 		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
-  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+  void DhopNaive(StencilImpl &st,
 		 DoubledGaugeField &U,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  
@@ -47,7 +47,7 @@ public:
  static int PartialCompressionFactor(GridBase *grid) { return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 				   const Lattice<vobj> &rhs,
 				   cobj *buffer,
 				   compressor &compress,
@@ -109,7 +109,7 @@ public:
  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@@ -197,7 +197,7 @@ public:
 #endif
  
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 					 const Lattice<vobj> &rhs,
 					 cobj *buffer,
 					 compressor &compress,
@@ -208,7 +208,7 @@ public:
    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
  }
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@@ -402,7 +402,6 @@ public:

  typedef CartesianStencil<vobj,cobj,Parameters> Base;
  typedef typename Base::View_type View_type;
-  typedef typename Base::StencilVector StencilVector;

  //  Vector<int> surface_list;
  WilsonStencil(GridBase *grid,
@@ -416,29 +415,6 @@ public:
    this->same_node.resize(npoints);
  };
  
-  /*
-  void BuildSurfaceList(int Ls,int vol4){
-
-    // find same node for SHM
-    // Here we know the distance is 1 for WilsonStencil
-    for(int point=0;point<this->_npoints;point++){
-      this->same_node[point] = this->SameNode(point);
-    }
-    
-    for(int site = 0 ;site< vol4;site++){
-      int local = 1;
-      for(int point=0;point<this->_npoints;point++){
-	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ 
-	  local = 0;
-	}
-      }
-      if(local == 0) { 
-	surface_list.push_back(site);
-      }
-    }
-  }
-  */
-  
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
  {
@@ -508,6 +484,11 @@ public:
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    accelerator_barrier();
+#ifdef NVLINK_GET
+    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
+    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
+    // Or issue barrier AFTER the DMA is running
+#endif    
  }

 };
@@ -126,14 +126,17 @@ public:
  void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
                     const FermionField &A, const FermionField &B, int dag);

-  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st,
+		    DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);

-  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerial(StencilImpl &st,
+			  DoubledGaugeField &U,
+			  const FermionField &in, FermionField &out, int dag);

-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st,
+				   DoubledGaugeField &U,
+				   const FermionField &in, FermionField &out, int dag);

  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
@@ -168,9 +171,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;

-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
-
  WilsonAnisotropyCoefficients anisotropyCoeff;

  ///////////////////////////////////////////////////////////////
@@ -109,6 +109,8 @@ public:
  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
+				  std::vector<double> qmu) ;

  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@@ -117,6 +119,9 @@ public:
  void DhopOE(const FermionField &in, FermionField &out,int dag);
  void DhopEO(const FermionField &in, FermionField &out,int dag);

+  void DhopComms  (const FermionField &in, FermionField &out);
+  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
+  
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
@@ -135,21 +140,18 @@ public:
 		     int dag);
    
  void DhopInternal(StencilImpl & st,
-		    LebesgueOrder &lo,
 		    DoubledGaugeField &U,
 		    const FermionField &in, 
 		    FermionField &out,
 		    int dag);

  void DhopInternalOverlappedComms(StencilImpl & st,
-				   LebesgueOrder &lo,
 				   DoubledGaugeField &U,
 				   const FermionField &in, 
 				   FermionField &out,
 				   int dag);

  void DhopInternalSerialComms(StencilImpl & st,
-			       LebesgueOrder &lo,
 			       DoubledGaugeField &U,
 			       const FermionField &in, 
 			       FermionField &out,
@@ -203,9 +205,6 @@ public:
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
    
-  LebesgueOrder Lebesgue;
-  LebesgueOrder LebesgueEvenOdd;
-    
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;

@@ -57,6 +57,10 @@ public:
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;

+  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 uint64_t *ids);
+  
  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;
@@ -58,7 +58,7 @@ public:
  {
    //    RealD eps = 1.0;
    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
-    Vector<Coeff_t> zgamma(this->Ls);
+    std::vector<Coeff_t> zgamma(this->Ls);
    for(int s=0;s<this->Ls;s++){
      zgamma[s] = gamma[s];
    }
@@ -1,3 +1,5 @@
+#if 0
+
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
 }

 NAMESPACE_END(Grid);
+
+#endif
@@ -1,3 +1,4 @@
+#if 0
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void)
 }
 NAMESPACE_END(Grid);

+#endif
@@ -72,7 +72,7 @@ public:
  void ThreadInterleave(void);

 private:
-  Vector<IndexInteger> _LebesgueReorder;
+  deviceVector<IndexInteger> _LebesgueReorder;

 };    

@@ -49,6 +49,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FourDimRedBlackGrid,_M5,p),
  mass_plus(_mass), mass_minus(_mass)
 {
+  // qmu defaults to zero size;
 }

 ///////////////////////////////////////////////////////////////
@@ -156,18 +157,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  std::vector<Coeff_t> diag (Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bs;
-  Vector<Coeff_t> upper= cs;
-  Vector<Coeff_t> lower= cs; 
+  std::vector<Coeff_t> diag = bs;
+  std::vector<Coeff_t> upper= cs;
+  std::vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass_minus*upper[Ls-1];
  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@@ -176,9 +177,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = beo;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = beo;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@@ -191,9 +192,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@@ -206,9 +207,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag = bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@@ -236,9 +237,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);
-  Vector<Coeff_t> lower(Ls,-1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass_plus*upper[Ls-1];
  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@@ -248,9 +249,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  Vector<Coeff_t> diag =bs;
-  Vector<Coeff_t> upper=cs;
-  Vector<Coeff_t> lower=cs; 
+  std::vector<Coeff_t> diag =bs;
+  std::vector<Coeff_t> upper=cs;
+  std::vector<Coeff_t> lower=cs; 

  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
@@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }

+template<class Impl>
+void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
+{
+  if ( qmu.size() ) {
+
+    Gamma::Algebra Gmu [] = {
+      Gamma::Algebra::GammaX,
+      Gamma::Algebra::GammaY,
+      Gamma::Algebra::GammaZ,
+      Gamma::Algebra::GammaT
+    };
+    std::vector<ComplexD> coeff(Nd);
+    ComplexD ci(0,1);
+
+    assert(qmu.size()==Nd);
+
+    for(int mu=0;mu<Nd;mu++){
+       coeff[mu] = ci*qmu[mu];
+       if ( dag ) coeff[mu] = conjugate(coeff[mu]);
+    }
+
+    chi = chi + Gamma(Gmu[0])*psi*coeff[0];
+    for(int mu=1;mu<Nd;mu++){
+      chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
+    }
+  }
+}
+
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
@@ -279,6 +308,10 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  Meooe5D(psi,Din);

  this->DW(Din,chi,DaggerNo);
+
+  // add i q_mu gamma_mu here
+  addQmu(Din,chi,DaggerNo);
+  
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  
@@ -296,6 +329,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 

+  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
+  addQmu(psi,Din,DaggerYes);
+  
  MeooeDag5D(Din,chi);
  
  M5Ddag(psi,chi);
@@ -394,7 +430,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  Vector<Coeff_t> gamma(this->Ls);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
@@ -402,13 +438,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  Vector<Coeff_t> gamma(this->Ls);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;

@@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
    dee[Ls-1] += delta_d;
  }  

+  //////////////////////////////////////////
+  // Device buffers
+  //////////////////////////////////////////
+  d_diag.resize(Ls);
+  d_upper.resize(Ls);
+  d_lower.resize(Ls);
+
+  d_dee.resize(Ls);
+  d_lee.resize(Ls);
+  d_uee.resize(Ls);
+  d_leem.resize(Ls);
+  d_ueem.resize(Ls);
  //  int inv=1;
  //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
@@ -43,9 +43,9 @@ void
 CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
 			       const FermionField &phi_i, 
 			       FermionField &chi_i,
-			       Vector<Coeff_t> &lower,
-			       Vector<Coeff_t> &diag,
-			       Vector<Coeff_t> &upper)
+			       std::vector<Coeff_t> &lower,
+			       std::vector<Coeff_t> &diag,
+			       std::vector<Coeff_t> &upper)
 {
  
  chi_i.Checkerboard()=psi_i.Checkerboard();
@@ -55,12 +55,16 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-
  int Ls =this->Ls;

+  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
+  
+  auto pdiag = &d_diag[0];
+  auto pupper = &d_upper[0];
+  auto plower = &d_lower[0];
+
  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
  uint64_t nloop = grid->oSites();
@@ -82,9 +86,9 @@ void
 CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
 			      const FermionField &phi_i, 
 			      FermionField &chi_i,
-			      Vector<Coeff_t> &lower,
-			      Vector<Coeff_t> &diag,
-			      Vector<Coeff_t> &upper)
+			      std::vector<Coeff_t> &lower,
+			      std::vector<Coeff_t> &diag,
+			      std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
@@ -93,12 +97,16 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-
  int Ls=this->Ls;

+  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
+  
+  auto pdiag = &d_diag[0];
+  auto pupper = &d_upper[0];
+  auto plower = &d_lower[0];
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -126,11 +134,17 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi

  int Ls=this->Ls;

-  auto plee  = & lee [0];
-  auto pdee  = & dee [0];
-  auto puee  = & uee [0];
-  auto pleem = & leem[0];
-  auto pueem = & ueem[0];
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+
+  auto plee  = & d_lee [0];
+  auto pdee  = & d_dee [0];
+  auto puee  = & d_uee [0];
+  auto pleem = & d_leem[0];
+  auto pueem = & d_ueem[0];

  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -182,11 +196,17 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  autoView(psi , psi_i,AcceleratorRead);
  autoView(chi , chi_i,AcceleratorWrite);

-  auto plee  = & lee [0];
-  auto pdee  = & dee [0];
-  auto puee  = & uee [0];
-  auto pleem = & leem[0];
-  auto pueem = & ueem[0];
+  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
+
+  auto plee  = & d_lee [0];
+  auto pdee  = & d_dee [0];
+  auto puee  = & d_uee [0];
+  auto pleem = & d_leem[0];
+  auto pueem = & d_ueem[0];

  assert(psi.Checkerboard() == psi.Checkerboard());

@@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
  // How to check Ls matches??
-  //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
+  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
  int Ls = this->Ls;
+  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
  assert(zdata->db==Ls);// Beta has Ls coeffs

  R=(1+this->mass)/(1-this->mass);
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
 // Pplus  backwards..
 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, 
-				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+				      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  int Ls = this->Ls;
@@ -50,9 +50,15 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  autoView( psi , psi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  
  auto nloop=grid->oSites()/Ls;
@@ -73,7 +79,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi

 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, 
-					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
+					 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase* grid = psi_i.Grid();
@@ -83,9 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  autoView( phi , phi_i, AcceleratorRead);
  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+  
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol

@@ -114,12 +125,17 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  autoView( chi, chi_i, AcceleratorWrite);
  int Ls = this->Ls;

-  auto plee  = & this->lee[0];
-  auto pdee  = & this->dee[0];
-  auto puee  = & this->uee[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
  
-  auto pleem = & this->leem[0];
-  auto pueem = & this->ueem[0];
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
    else{ shiftm = -shift*(mq3-mq2); }
  }

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;

 #if(0)
  std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
@@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
    else{ shiftm = -shift*(mq3-mq2); }
  }

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;

  this->M5Ddag(psi, chi, chi, lower, diag, upper);
 }
@@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
@@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
@@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField

 //Zolo
 template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
 {
  int   Ls    = this->Ls;
  int   pm    = this->pm;
@@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
  UUUmu(&FourDimGrid),
  UUUmuEven(&FourDimRedBlackGrid),
  UUUmuOdd(&FourDimRedBlackGrid),
-  Lebesgue(&FourDimGrid),
-  LebesgueEvenOdd(&FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid)
 {

@@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,

 /*CHANGE */
 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, 
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,U,UUU,in,out,dag);
 }

 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, 
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
@@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }

  st.CommsMerge(compressor);
@@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }

 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
@@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }
 /*CHANGE END*/
@@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
  assert(in.Checkerboard()==Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
+  DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
  assert(in.Checkerboard()==Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
+  DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
+  DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
 }

 /////////////////////////////////////////////////////////////////////////
@@ -48,8 +48,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
@@ -339,7 +337,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
+  DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
 }

 template <class Impl>
@@ -351,7 +349,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -363,7 +361,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
 }

 template <class Impl>
@@ -394,19 +392,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel


 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, 
 						  DoubledGaugeField &U,
 						  DoubledGaugeField &UUU,
 						  const FermionField &in,
 						  FermionField &out, int dag) 
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,U,UUU,in,out,dag);
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, 
 								 DoubledGaugeField &U,
 								 DoubledGaugeField &UUU,
 								 const FermionField &in,
@@ -429,7 +427,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }

  st.CommunicateComplete(requests);
@@ -440,13 +438,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 }


 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, 
 							     DoubledGaugeField &U,
 							     DoubledGaugeField &UUU,
 							     const FermionField &in,
@@ -460,7 +458,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
  }
 };

@@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
 
 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+				  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -50,9 +50,13 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
  
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@@ -74,8 +78,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					Vector<Coeff_t> &shift_coeffs)
+					std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
+					std::vector<Coeff_t> &shift_coeffs)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -89,10 +93,15 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  
  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-  auto pshift_coeffs = &shift_coeffs[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@@ -119,7 +128,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
+				     std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -130,9 +139,13 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie

  assert(phi.Checkerboard() == psi.Checkerboard());
  
-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  int nloop = grid->oSites()/Ls;
@@ -154,8 +167,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
-					   Vector<Coeff_t> &shift_coeffs)
+					   std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
+					   std::vector<Coeff_t> &shift_coeffs)
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
@@ -167,10 +180,15 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm

  assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &diag[0];
-  auto pupper = &upper[0];
-  auto plower = &lower[0];
-  auto pshift_coeffs = &shift_coeffs[0];
+  auto pdiag  = &this->d_diag[0];
+  auto pupper = &this->d_upper[0];
+  auto plower = &this->d_lower[0];
+  auto pshift_coeffs = &this->d_shift_coefficients[0];
+
+  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
  
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  auto pm = this->pm;
@@ -212,11 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

@@ -268,14 +292,23 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

+  // Move into object and constructor
  auto pm = this->pm;
-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
-  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
-  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+  auto pMooeeInv_shift_lc   = &this->d_MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -333,11 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  autoView(psi , psi_i, AcceleratorRead);
  autoView(chi , chi_i, AcceleratorWrite);

-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
+  auto plee  = &this->d_lee [0];
+  auto pdee  = &this->d_dee [0];
+  auto puee  = &this->d_uee [0];
+  auto pleem = &this->d_leem[0];
+  auto pueem = &this->d_ueem[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -387,13 +426,25 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  int Ls = this->Ls;

  auto pm = this->pm;
-  auto plee = & this->lee [0];
-  auto pdee = & this->dee [0];
-  auto puee = & this->uee [0];
-  auto pleem= & this->leem[0];
-  auto pueem= & this->ueem[0];
-  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
-  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
+  auto plee  = & this->d_lee [0];
+  auto pdee  = & this->d_dee [0];
+  auto puee  = & this->d_uee [0];
+  auto pleem = & this->d_leem[0];
+  auto pueem = & this->d_ueem[0];
+
+  auto pMooeeInvDag_shift_lc   = &this->d_MooeeInv_shift_lc[0];
+  auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
+
+  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
+  acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
+
+  //  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
+  //  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

  // no shift term
  if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
@@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 {
  int Ls = this->Ls;

-  Vector<Coeff_t> diag(Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-  Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+  std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

  // no shift term
  if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
@@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
  int Ls = this->Ls;

  // coefficients of Mooee
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
    lower[s] = -this->cee[s];
@@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
  int Ls = this->Ls;

  // coefficients of MooeeDag
-  Vector<Coeff_t> diag = this->bee;
-  Vector<Coeff_t> upper(Ls);
-  Vector<Coeff_t> lower(Ls);
+  std::vector<Coeff_t> diag = this->bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    if(s==0) {
      upper[s] = -this->cee[s+1];
@@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
  // Tridiagonal solve for MooeeInvDag_shift_lc
  {
    Coeff_t m(0.0);
-    Vector<Coeff_t> d = Mooee_shift;
-    Vector<Coeff_t> u(Ls,0.0);
-    Vector<Coeff_t> y(Ls,0.0);
-    Vector<Coeff_t> q(Ls,0.0);
+    std::vector<Coeff_t> d = Mooee_shift;
+    std::vector<Coeff_t> u(Ls,0.0);
+    std::vector<Coeff_t> y(Ls,0.0);
+    std::vector<Coeff_t> q(Ls,0.0);
    if(pm == 1){ u[0] = 1.0; }
    else{ u[Ls-1] = 1.0; }

@@ -48,8 +48,6 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
@@ -268,7 +266,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+  DhopInternal(Stencil, Umu, in, out, dag);
 }

 template <class Impl>
@@ -280,7 +278,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -292,7 +290,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, in, out, dag);
 }

 template <class Impl>
@@ -323,18 +321,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &


 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
 					       DoubledGaugeField &U,
 					       const FermionField &in,
 					       FermionField &out, int dag) 
 {
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else
-    DhopInternalSerialComms(st,lo,U,in,out,dag);
+    DhopInternalSerialComms(st,U,in,out,dag);
 }
 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
 							      DoubledGaugeField &U,
 							      const FermionField &in,
 							      FermionField &out, int dag) 
@@ -356,7 +354,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  {
    int interior=1;
    int exterior=0;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }

  st.CommunicateComplete(requests);
@@ -367,12 +365,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  {
    int interior=0;
    int exterior=1;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
 }

 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
 							  DoubledGaugeField &U,
 							  const FermionField &in,
 							  FermionField &out, int dag) 
@@ -385,7 +383,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
  {
    int interior=1;
    int exterior=1;
-    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
  }
 };

@@ -239,6 +239,31 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi

  this->DW(psi,D,DaggerNo);

+  // DW - DW+iqslash
+  //  (g5 Dw)^dag = g5 Dw
+  //  (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
+  if ( qmu.size() ) {
+
+    std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
+    assert(qmu.size()==Nd);
+
+    FermionField qslash_psi(psi.Grid());
+
+    Gamma::Algebra Gmu [] = {
+			     Gamma::Algebra::GammaX,
+			     Gamma::Algebra::GammaY,
+			     Gamma::Algebra::GammaZ,
+			     Gamma::Algebra::GammaT
+    };
+    qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
+    for(int mu=1;mu<Nd;mu++){
+      qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
+    }
+    ComplexD ci(0.0,1.0);
+    qslash_psi = ci*qslash_psi ; // i qslash
+    D = D + qslash_psi;
+  }
+
  int nblock=(Ls-1)/2;
  for(int b=0;b<nblock;b++){
 	
@@ -255,8 +280,47 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
  }
 	
  {
+    // The 'conventional' Cayley overlap operator is
+    //
+    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
+    //
+    //
+    // With massless limit 1/2(1+g5 sgnHw)
+    //
+    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
+    //
+    // However, the conventional normalisation has both a leading order factor of 2 in Zq
+    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
+    //
+    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
+    //
+    // num = -i sin kmu gmu
+    //
+    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
+    //    b_k = sk2 - M5;
+    //     
+    //    w_k = sqrt(sk + b_k*b_k);
+    //
+    //    denom= ( w_k + b_k + mass*mass) ;
+    //
+    //    denom= one/denom;
+    //    out = num*denom;
+    //
+    // Chroma, and Grid define partial fraction via 4d operator
+    //
+    //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
+    //
+    // Now since:
+    //
+    //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
+    //
+    // This corresponds to a modified mass parameter
+    //
+    // It has an annoying 
+    //
+    // 
    double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] H
+    //R g5 psi[Ls] + p[0] Hw
    ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
    
    for(int b=0;b<nblock;b++){
@@ -264,6 +328,7 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
      double pp = p[nblock-1-b];
      axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
    }
+   
  }

 }
@@ -411,17 +476,18 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
      int Ls = this->Ls;
      conformable(solution5d.Grid(),this->FermionGrid());
      conformable(exported4d.Grid(),this->GaugeGrid());
-      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+      ExtractSlice(exported4d, solution5d, Ls-1, 0);
    }
    template<class Impl>
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
    {
+      //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
      int Ls = this->Ls;
      conformable(imported5d.Grid(),this->FermionGrid());
      conformable(input4d.Grid()   ,this->GaugeGrid());
      FermionField tmp(this->FermionGrid());
      tmp=Zero();
-      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      InsertSlice(input4d, tmp, Ls-1, 0);
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
      this->Dminus(tmp,imported5d);
    }
@@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
  int Ls = this->Ls;
-
+  qmu.resize(0);
  assert((Ls&0x1)==1); // Odd Ls required
  int nrational=Ls-1;

@@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
  Approx::zolotarev_free(zdata);

 }
+template<class Impl>
+PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
+							 GridCartesian         &FiveDimGrid,
+							 GridRedBlackCartesian &FiveDimRedBlackGrid,
+							 GridCartesian         &FourDimGrid,
+							 GridRedBlackCartesian &FourDimRedBlackGrid,
+							 RealD _mass,RealD M5,
+							 std::vector<RealD> &_qmu,
+							 const ImplParams &p)
+  : PartialFractionFermion5D<Impl>(_Umu,
+			     FiveDimGrid,FiveDimRedBlackGrid,
+			     FourDimGrid,FourDimRedBlackGrid,
+			     _mass,M5,p)
+{
+  qmu=_qmu;
+}

 NAMESPACE_END(Grid);

@@ -375,23 +375,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  }
 }

-/*
-#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
-  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-									\
-  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-									\
-  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, int sU, \
-						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-*/
 #undef LOAD_CHI
 #undef HAND_DECLARATIONS

@@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
  });

 template <class Impl> 
-void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
@@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  assert(0 && " Kernel optimisation case not covered ");
 }
 template <class Impl> 
-void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
@@ -58,15 +58,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
-  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimRedBlackGrid),
  _tmp(&FiveDimRedBlackGrid),
  Dirichlet(0)
 {
-  Stencil.lo     = &Lebesgue;
-  StencilEven.lo = &LebesgueEvenOdd;
-  StencilOdd.lo  = &LebesgueEvenOdd;
-  
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
@@ -305,19 +299,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 }

 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
 {
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else 
-    DhopInternalSerialComms(st,lo,U,in,out,dag);
+    DhopInternalSerialComms(st,U,in,out,dag);
 }


 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
@@ -331,21 +325,21 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
  {
+    //    std::cout << " WilsonFermion5D gather " <<std::endl;
    GRID_TRACE("Gather");
    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }
  
+  //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
  std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
-  st.CommunicateBegin(requests);

+#if 1
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  {
-    GRID_TRACE("MergeSHM");
-    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  }
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
+#endif

  /////////////////////////////
  // do the compute interior
@@ -359,21 +353,34 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
  
+  //ifdef GRID_ACCELERATED
+#if 0
+  /////////////////////////////
+  // Overlap with comms -- on GPU the interior kernel call is nonblocking
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+#endif
+  
+  
  /////////////////////////////
  // Complete comms
  /////////////////////////////
+  //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
  st.CommunicateComplete(requests);
-  traceStop(id);
+  //  traceStop(id);

  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  {
+    //    std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
  }
  

+  //  std::cout << " WilsonFermion5D Exterior " <<std::endl;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
@@ -381,11 +388,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
+  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 }


 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
 						    DoubledGaugeField & U,
 						    const FermionField &in, 
 						    FermionField &out,int dag)
@@ -395,11 +403,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr

  int LLs = in.Grid()->_rdimensions[0];

+  //  std::cout << " WilsonFermion5D Halo exch " <<std::endl;
  {
    GRID_TRACE("HaloExchange");
    st.HaloExchangeOpt(in,compressor);
  }
  
+  //  std::cout << " WilsonFermion5D Dhop " <<std::endl;
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
@@ -408,6 +418,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
+  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 }


@@ -420,7 +431,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
  assert(in.Checkerboard()==Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
+  DhopInternal(StencilEven,UmuOdd,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@@ -431,8 +442,31 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
  assert(in.Checkerboard()==Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
+  DhopInternal(StencilOdd,UmuEven,in,out,dag);
 }
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
+{
+  int dag =0 ;
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+  out.Checkerboard() = in.Checkerboard();
+  Compressor compressor(dag);
+  Stencil.HaloExchangeOpt(in,compressor);
+}
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  int LLs = in.Grid()->_rdimensions[0];
+  int Opt = WilsonKernelsStatic::Opt;
+  Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
@@ -441,7 +475,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
+  DhopInternal(Stencil,Umu,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
@@ -735,6 +769,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe

 template<class Impl>
 void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
+{
+  std::vector<double> empty_q(Nd,0.0);
+  MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q);
+}
+template<class Impl>
+void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
+						       RealD mass,
+						       std::vector<double> twist,
+						       std::vector<double> qmu)
 {
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
@@ -750,6 +793,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    typedef typename FermionField::scalar_type ScalComplex;

    typedef Lattice<iSinglet<vector_type> > LatComplex;
+    typedef iSpinMatrix<ScalComplex> SpinMat;


    Coordinate latt_size   = _grid->_fdimensions;
@@ -767,6 +811,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    LatComplex kmu(_grid); 
    ScalComplex ci(0.0,1.0);

+    std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
+    
    for(int mu=0;mu<Nd;mu++) {
      
      LatticeCoordinate(kmu,mu);
@@ -777,9 +823,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
      kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions

      sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
-      sk  = sk  + sin(kmu)*sin(kmu); 

-      num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
+      sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]); 
+
+      // Terms for boosted Fermion
+      // 1/2 [ -i gamma.(sin p + q )     ]
+      //     [ --------------------- + 1 ]
+      //     [         wq + b            ]
+      //
+      // wq = sqrt( (sinp+q)^2 + b^2 )
+      //
+      
+      num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);

    }
    num = num + mass * in ;
@@ -52,17 +52,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
    mass(_mass),
-    Lebesgue(_grid),
-    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
-  Stencil.lo     = &Lebesgue;
-  StencilEven.lo = &LebesgueEvenOdd;
-  StencilOdd.lo  = &LebesgueEvenOdd;
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  if  (anisotropyCoeff.isAnisotropic){
@@ -314,7 +309,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da

  out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
+  DhopInternal(Stencil, Umu, in, out, dag);
 }

 template <class Impl>
@@ -326,7 +321,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, UmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -338,7 +333,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, UmuEven, in, out, dag);
 }

 template <class Impl>
@@ -391,21 +386,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
 };

 template <class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, 
                                       DoubledGaugeField &U,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+    DhopInternalOverlappedComms(st,U,in,out,dag);
  else
 #endif
-    DhopInternalSerial(st,lo,U,in,out,dag);
+    DhopInternalSerial(st,U,in,out,dag);
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, 
 						      DoubledGaugeField &U,
 						      const FermionField &in,
 						      FermionField &out, int dag)
@@ -474,10 +469,10 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO


 template <class Impl>
-void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
-                                       DoubledGaugeField &U,
-                                       const FermionField &in,
-                                       FermionField &out, int dag)
+void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, 
+					     DoubledGaugeField &U,
+					     const FermionField &in,
+					     FermionField &out, int dag)
 {
  GRID_TRACE("DhopSerial");
  assert((dag == DaggerNo) || (dag == DaggerYes));
@@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /// Switch off the 5d vectorised code optimisations
 #undef DWFVEC5D

-static Vector<vComplexF> signsF;
+static std::vector<vComplexF> signsF;

  template<typename vtype>    
-  int setupSigns(Vector<vtype>& signs ){
-    Vector<vtype> bother(2);
+  int setupSigns(std::vector<vtype>& signs ){
+    std::vector<vtype> bother(2);
    signs = bother;
    vrsign(signs[0]);
    visign(signs[1]);
@@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled

 #include <simd/Intel512double.h>
    
-static Vector<vComplexD> signsD;
+static std::vector<vComplexD> signsD;
 static int signInitD = setupSigns(signsD);
    
 #define MAYBEPERM(A,perm) if (perm) { A ; }
@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  acceleratorSynchronise();						\
+  acceleratorSynchronise();					\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);

@@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

+#ifdef GRID_SYCL
+extern "C" {
+    ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
+    void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
+}
+#ifdef GRID_SIMT
+#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id())
+#else
+#define MAKE_ID(A) (0)
+#endif
+
+#else
+
+#define MAKE_ID(A) (0)
+
+#endif
+
+
+#define KERNEL_CALL_ID(A)						\
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+      const int Nsimd = SiteHalfSpinor::Nsimd();			\
+      const int lane=acceleratorSIMTlane(Nsimd);                        \
+      int idx=sF*Nsimd+lane;						\
+      uint64_t id = MAKE_ID();						\
+      ids[idx]=id;							\
+    });									\
+  accelerator_barrier();

 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
      int sF = ss;							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-  });
+    });

 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

@@ -434,7 +474,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S

 #define ASM_CALL(A)							\
  thread_for( sss, Nsite, {						\
-    int ss = st.lo->Reorder(sss);					\
+    int ss = sss; /*st.lo->Reorder(sss);*/			\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
@@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
    });}

+
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
-     // dependent on result of merge
+     //     // dependent on result of merge
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
@@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				     int Ls, int Nsite, const FermionField &in, FermionField &out,
+				     uint64_t *ids)
+{
+    autoView(U_v  ,  U,AcceleratorRead);
+    autoView(in_v , in,AcceleratorRead);
+    autoView(out_v,out,AcceleratorWrite);
+    autoView(st_v , st,AcceleratorRead);
+    KERNEL_CALL_ID(GenericDhopSite);
+}
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 					  int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -40,6 +40,11 @@ public:

  INHERIT_GIMPL_TYPES(Gimpl);

+  using Action<GaugeField>::S;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+  using Action<GaugeField>::refresh;
+
 private:
  RealD c_plaq;
  RealD c_rect;
@@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 public:  
  INHERIT_GIMPL_TYPES(Gimpl);

+  using Action<GaugeField>::S;
+  using Action<GaugeField>::Sinitial;
+  using Action<GaugeField>::deriv;
+  using Action<GaugeField>::refresh;
+  
  /////////////////////////// constructors
  explicit WilsonGaugeAction(RealD beta_):beta(beta_){};

@@ -40,7 +40,7 @@ public:
    U = Zero();
    LatticeColourMatrix tmp(Uin.Grid());

-    Vector<typename SU<ncolour>::Matrix> ta(Dimension);
+    std::vector<typename SU<ncolour>::Matrix> ta(Dimension);

    // Debug lines
    // LatticeMatrix uno(Uin.Grid());
@@ -43,7 +43,7 @@ public:
    U = Zero();
    LatticeColourMatrix tmp(Uin.Grid());

-    Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
+    std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);

    for (int a = 0; a < Dimension; a++)
      GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
@@ -971,7 +971,9 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
  autoView( vq_ti , q_ti     , AcceleratorRead);
  autoView( vq_tf , q_tf     , AcceleratorRead);

-  Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
+  deviceVector<mobj> my_Dq_spec(2);
+  acceleratorPut(my_Dq_spec[0],Dq_spec1);
+  acceleratorPut(my_Dq_spec[1],Dq_spec2);
  mobj * Dq_spec_p = &my_Dq_spec[0];

  if (group == 1) {
@@ -1300,7 +1302,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
  autoView( vd_tf   , qd_tf    , AcceleratorRead);
  autoView( vs_ti   , qs_ti    , AcceleratorRead);

-  Vector<mobj> my_Dq_spec{Du_spec};
+  deviceVector<mobj> my_Dq_spec(1);
+  acceleratorPut(my_Dq_spec[0],Du_spec);
  mobj * Dq_spec_p = &my_Dq_spec[0];

  if(op == "Q1"){
@@ -1353,7 +1356,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
  autoView( vd_tf , qd_tf    , AcceleratorRead  );
  autoView( vs_ti , qs_ti    , AcceleratorRead  );
  
-  Vector<mobj> my_Dq_spec{Du_spec};
+  deviceVector<mobj> my_Dq_spec(1);
+  acceleratorPut(my_Dq_spec[0],Du_spec);
  mobj * Dq_spec_p = &my_Dq_spec[0];

  if(op == "Q1"){
@@ -1544,7 +1548,9 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
  autoView( vd_tf   , qd_tf    , AcceleratorRead);
  autoView( vs_ti   , qs_ti    , AcceleratorRead);

-  Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec};
+  deviceVector<mobj> my_Dq_spec(2);
+  acceleratorPut(my_Dq_spec[0],Dd_spec);
+  acceleratorPut(my_Dq_spec[0],Ds_spec);
  mobj * Dq_spec_p = &my_Dq_spec[0];

  if(op == "Q1"){
@@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
 ////////////////////////////////////////////////////////////////////////
 // Map a su2 subgroup number to the pair of rows that are non zero
 ////////////////////////////////////////////////////////////////////////
-static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
+static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
  assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));

  int spare = su2_index;
@@ -62,7 +62,7 @@ public:
    // returns i(T_Adj)^index necessary for the projectors
    // see definitions above
    iAdjTa = Zero();
-    Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
+    iSUnMatrix<cplx> ta[ncolour * ncolour - 1];
    iSUnMatrix<cplx> tmp;

    // FIXME not very efficient to get all the generators everytime
@@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
 // Map a su2 subgroup number to the pair of rows that are non zero
 ////////////////////////////////////////////////////////////////////////
 template <ONLY_IF_Sp>
-static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
+static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
  const int nsp=ncolour/2;
  assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));

@@ -72,7 +72,7 @@ public:
  }

  // Resident in managed memory
-  Vector<GeneralStencilEntry>  _entries; 
+  deviceVector<GeneralStencilEntry>  _entries; 

  GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
  {
@@ -141,7 +141,7 @@ public:
 	  ////////////////////////////////////////////////
 	  // Store in look up table
 	  ////////////////////////////////////////////////
-	  this->_entries[lex] = SE;
+	  acceleratorPut(this->_entries[lex],SE);
 	}
      });
  }
@@ -19,7 +19,7 @@ public:
  static int PartialCompressionFactor(GridBase *grid) {return 1;};
  // Decompress is after merge so ok
  template<class vobj,class cobj,class compressor> 
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
 				   const Lattice<vobj> &rhs,
 				   cobj *buffer,
 				   compressor &compress,
@@ -35,7 +35,7 @@ public:
    rhs_v.ViewClose();
  }
  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
@@ -83,25 +83,6 @@ public:
 // Wilson compressor will add alternate policies for Dirichlet
 // and possibly partial Dirichlet for DWF
 ////////////////////////////////////
-/*
-class FaceGatherDirichlet
-{
-  // If it's dirichlet we don't assemble comms buffers
-  //
-  // Rely on zeroes in gauge field to drive the correct result
-  // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){};
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				   Vector<cobj *> pointers,int dimension,int plane,int cbmask,
-				   compressor &compress,int type) {}
-  template<class decompressor,class Merger>
-  static void Merge(decompressor decompress,Merge &mm)  {  }
-  template<class decompressor,class Decompression>
-  static void Decompress(decompressor decompress,Decompression &dd) {}
-};
-*/

 template<class vobj,class FaceGather>
 class SimpleCompressorGather : public FaceGather {
@@ -31,7 +31,6 @@
 #define STENCIL_MAX (16)

 #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate
-#include <Grid/stencil/Lebesgue.h>   // subdir aggregate
 #include <Grid/stencil/GeneralLocalStencil.h>

 //////////////////////////////////////////////////////////////////////////////////////////
@@ -122,17 +121,22 @@ class CartesianStencilAccelerator {
  StencilVector same_node;
  Coordinate    _simd_layout;
  Parameters    parameters;
+  ViewMode mode;
  StencilEntry*  _entries_p;
+  StencilEntry*  _entries_host_p;
  cobj* u_recv_buf_p;
  cobj* u_send_buf_p;

  accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }

-  accelerator_inline int GetNodeLocal(int osite,int point) const {
-    return this->_entries_p[point+this->_npoints*osite]._is_local;
+  // Not a device function
+  inline int GetNodeLocal(int osite,int point) const {
+    StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
+    return SE._is_local;
  }
  accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
-    ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
+    ptype = this->_permute_type[point];
+    return & this->_entries_p[point+this->_npoints*osite];
  }

  accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
@@ -165,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
 {
 public:
  int *closed;
-  StencilEntry *cpu_ptr;
-  ViewMode      mode;
+  //  StencilEntry *cpu_ptr;
 public:
  // default copy constructor
  CartesianStencilView (const CartesianStencilView &refer_to_me) = default;

  CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
-    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
-    cpu_ptr(this->_entries_p),
-    mode(_mode)
+    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
  {
-    this->_entries_p =(StencilEntry *)
-      MemoryManager::ViewOpen(this->_entries_p,
-			      this->_npoints*this->_osites*sizeof(StencilEntry),
-			      mode,
-			      AdviseDefault);
+    this->ViewOpen(_mode);
+  }
+  void ViewOpen(ViewMode _mode)
+  {
+    this->mode = _mode;
  }

-  void ViewClose(void)
-  {
-    MemoryManager::ViewClose(this->cpu_ptr,this->mode);
-  }
+  void ViewClose(void)  {  }

 };

@@ -256,7 +254,6 @@ protected:
  GridBase *                        _grid;
 public:
  GridBase *Grid(void) const { return _grid; }
-  LebesgueOrder *lo;

  ////////////////////////////////////////////////////////////////////////
  // Needed to conveniently communicate gparity parameters into GPU memory
@@ -273,11 +270,11 @@ public:
  int face_table_computed;
  int partialDirichlet;
  int fullDirichlet;
-  std::vector<commVector<std::pair<int,int> > > face_table ;
-  Vector<int> surface_list;
+  std::vector<deviceVector<std::pair<int,int> > > face_table ;
+  deviceVector<int> surface_list;

-  stencilVector<StencilEntry>  _entries; // Resident in managed memory
-  commVector<StencilEntry>     _entries_device; // Resident in device memory
+  std::vector<StencilEntry>   _entries; // Resident in host memory
+  deviceVector<StencilEntry>  _entries_device; // Resident in device memory
  std::vector<Packet> Packets;
  std::vector<Merge> Mergers;
  std::vector<Merge> MergersSHM;
@@ -366,12 +363,32 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    //    std::cout << "Communicate Begin "<<std::endl;
+    //    _grid->Barrier();
+    FlightRecorder::StepLog("Communicate begin");
    // All GPU kernel tasks must complete
    //    accelerator_barrier();     // All kernels should ALREADY be complete
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
-#ifdef ACCELERATOR_AWARE_MPI
    for(int i=0;i<Packets.size();i++){
+      //      std::cout << "Communicate prepare "<<i<<std::endl;
+      //      _grid->Barrier();
+      _grid->StencilSendToRecvFromPrepare(MpiReqs,
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,Packets[i].do_send,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,Packets[i].do_recv,
+					  Packets[i].xbytes,Packets[i].rbytes,i);
+    }
+    //    std::cout << "Communicate PollDtoH "<<std::endl;
+    //    _grid->Barrier();
+    _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
+    //    std::cout << "Communicate CopySynch "<<std::endl;
+    //    _grid->Barrier();
+    acceleratorCopySynchronise();
+    // Starts intranode
+    for(int i=0;i<Packets.size();i++){
+      //      std::cout << "Communicate Begin "<<i<<std::endl;
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
@@ -379,23 +396,6 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
-#else
-#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
-    for(int i=0;i<Packets.size();i++){
-      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
-      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
-      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
-      if ( Packets[i].do_send ) {
-	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
-      }
-      _grid->StencilSendToRecvFromBegin(MpiReqs,
-					Packets[i].host_send_buf,
-					Packets[i].to_rank,Packets[i].do_send,
-					Packets[i].host_recv_buf,
-					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes,Packets[i].rbytes,i);
-    }
-#endif
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
@@ -406,27 +406,25 @@ public:

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    //    std::cout << "Communicate Complete "<<std::endl;
+    //    _grid->Barrier();
+    FlightRecorder::StepLog("Start communicate complete");
+    //    std::cout << "Communicate Complete PollIRecv "<<std::endl;
+    //    _grid->Barrier();
+    _grid->StencilSendToRecvFromPollIRecv(MpiReqs);
+    //    std::cout << "Communicate Complete Complete "<<std::endl;
+    //    _grid->Barrier();
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
-    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
+    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
    //    accelerator_barrier(); 
-    _grid->StencilBarrier(); 
-#ifndef ACCELERATOR_AWARE_MPI
-#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
-    for(int i=0;i<Packets.size();i++){
-      if ( Packets[i].do_recv ) {
-	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
-      }
-    }
-    _grid->HostBufferFreeAll();
-#endif
-    // run any checksums
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv )
 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
    }
+    FlightRecorder::StepLog("Finish communicate complete");
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@@ -448,6 +446,7 @@ public:
    Communicate();
    CommsMergeSHM(compress);
    CommsMerge(compress);
+    accelerator_barrier();
  }

  template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -503,6 +502,9 @@ public:
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    //    accelerator_barrier();
+    //////////////////////////////////
+    // I will overwrite my send buffers
+    //////////////////////////////////
    _grid->StencilBarrier();// Synch shared memory on a single nodes

    assert(source.Grid()==_grid);
@@ -516,6 +518,11 @@ public:
      HaloGatherDir(source,compress,point,face_idx);
    }
    accelerator_barrier(); // All my local gathers are complete
+#ifdef NVLINK_GET
+    _grid->StencilBarrier(); // He can now get mu local gather, I can get his
+    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
+    // Or issue barrier AFTER the DMA is running
+#endif    
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
  }
@@ -554,6 +561,7 @@ public:
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
      acceleratorFenceComputeStream();
+      // Also fenced in WilsonKernels
    }
  }
  
@@ -651,10 +659,10 @@ public:
  ////////////////////////////////////////
  void PrecomputeByteOffsets(void){
    for(int i=0;i<_entries.size();i++){
-      if( _entries[i]._is_local ) {
-	_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
+      if( this->_entries[i]._is_local ) {
+	this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
      } else {
-	_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
+	this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
      }
    }
  };
@@ -668,7 +676,7 @@ public:
    for(int point=0;point<this->_npoints;point++){
      this->same_node[point] = this->SameNode(point);
    }
-
+    int32_t surface_list_size=0;
    for(int site = 0 ;site< vol4;site++){
      int local = 1;
      for(int point=0;point<this->_npoints;point++){
@@ -678,11 +686,31 @@ public:
      }
      if(local == 0) {
 	for(int s=0;s<Ls;s++){
-	  surface_list.push_back(site*Ls+s);
+	  surface_list_size++;
 	}
      }
    }
-    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    //    std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
+    surface_list.resize(surface_list_size);
+    std::vector<int> surface_list_host(surface_list_size);
+    int32_t ss=0;
+    for(int site = 0 ;site< vol4;site++){
+      int local = 1;
+      for(int point=0;point<this->_npoints;point++){
+	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
+	  local = 0;
+	}
+      }
+      if(local == 0) {
+	for(int s=0;s<Ls;s++){
+	  int idx=site*Ls+s;
+	  surface_list_host[ss]= idx;
+	  ss++;
+	}
+      }
+    }
+    acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
+    //    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -770,7 +798,13 @@ public:
    this->_osites  = _grid->oSites();

    _entries.resize(this->_npoints* this->_osites);
-    this->_entries_p = &_entries[0];
+    _entries_device.resize(this->_npoints* this->_osites);
+    this->_entries_host_p = &_entries[0];
+    this->_entries_p = &_entries_device[0];
+
+    //    std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
+    //	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
+    
    for(int ii=0;ii<npoints;ii++){

      int i = ii; // reverse direction to get SIMD comms done first
@@ -847,6 +881,7 @@ public:
      u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
    }
    PrecomputeByteOffsets();
+    acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
  }

  void Local     (int point, int dimension,int shiftpm,int cbmask)
@@ -1002,10 +1037,10 @@ public:
      for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 	for(int b=0;b<_grid->_slice_block[dimension];b++){
 	  int idx=point+(lo+o+b)*this->_npoints;
-	  _entries[idx]._offset  =ro+o+b;
-	  _entries[idx]._permute=permute;
-	  _entries[idx]._is_local=1;
-	  _entries[idx]._around_the_world=wrap;
+	  this->_entries[idx]._offset  =ro+o+b;
+	  this->_entries[idx]._permute=permute;
+	  this->_entries[idx]._is_local=1;
+	  this->_entries[idx]._around_the_world=wrap;
 	}
 	o +=_grid->_slice_stride[dimension];
      }
@@ -1023,10 +1058,10 @@ public:

 	  if ( ocb&cbmask ) {
 	    int idx = point+(lo+o+b)*this->_npoints;
-	    _entries[idx]._offset =ro+o+b;
-	    _entries[idx]._is_local=1;
-	    _entries[idx]._permute=permute;
-	    _entries[idx]._around_the_world=wrap;
+	    this->_entries[idx]._offset =ro+o+b;
+	    this->_entries[idx]._is_local=1;
+	    this->_entries[idx]._permute=permute;
+	    this->_entries[idx]._around_the_world=wrap;
 	  }

 	}
@@ -1050,10 +1085,10 @@ public:
      for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 	for(int b=0;b<_grid->_slice_block[dimension];b++){
 	  int idx=point+(so+o+b)*this->_npoints;
-	  _entries[idx]._offset  =offset+(bo++);
-	  _entries[idx]._is_local=0;
-	  _entries[idx]._permute=0;
-	  _entries[idx]._around_the_world=wrap;
+	  this->_entries[idx]._offset  =offset+(bo++);
+	  this->_entries[idx]._is_local=0;
+	  this->_entries[idx]._permute=0;
+	  this->_entries[idx]._around_the_world=wrap;
 	}
 	o +=_grid->_slice_stride[dimension];
      }
@@ -1070,10 +1105,10 @@ public:
 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	  if ( ocb & cbmask ) {
 	    int idx = point+(so+o+b)*this->_npoints;
-	    _entries[idx]._offset  =offset+(bo++);
-	    _entries[idx]._is_local=0;
-	    _entries[idx]._permute =0;
-	    _entries[idx]._around_the_world=wrap;
+	    this->_entries[idx]._offset  =offset+(bo++);
+	    this->_entries[idx]._is_local=0;
+	    this->_entries[idx]._permute =0;
+	    this->_entries[idx]._around_the_world=wrap;
 	  }
 	}
 	o +=_grid->_slice_stride[dimension];
@@ -202,15 +202,15 @@ void acceleratorInit(void)

 #ifdef GRID_SYCL

-cl::sycl::queue *theGridAccelerator;
-cl::sycl::queue *theCopyAccelerator;
+sycl::queue *theGridAccelerator;
+sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
  int nDevices = 1;
-  cl::sycl::gpu_selector selector;
-  cl::sycl::device selectedDevice { selector };
-  theGridAccelerator = new sycl::queue (selectedDevice);
-  theCopyAccelerator = new sycl::queue (selectedDevice);
+  //  sycl::gpu_selector selector;
+  //  sycl::device selectedDevice { selector };
+  theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
+  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.

 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
@@ -242,14 +242,14 @@ void acceleratorInit(void)
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);

-  auto devices = cl::sycl::device::get_devices();
+  auto devices = sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){

 #define GPU_PROP_STR(prop) \
-    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
+    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());

 #define GPU_PROP_FMT(prop,FMT) \
-    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
+    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());

 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
    if ( world_rank == 0) {
@@ -132,27 +132,17 @@ inline void cuda_mem(void)

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
-    int nt=acceleratorThreads();					\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator					\
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
-  }
-#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  {									\
-    int nt=acceleratorThreads();					\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator					\
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+    if ( num1*num2 ) {							\
+      int nt=acceleratorThreads();					\
+      typedef uint64_t Iterator;					\
+      auto lambda = [=] accelerator					\
+	(Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
+		      __VA_ARGS__;					\
+		    };							\
+      dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
+      dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+      LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+    }									\
  }

 #define accelerator_for6dNB(iter1, num1,				\
@@ -175,19 +165,6 @@ inline void cuda_mem(void)
  }


-#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  {									\
-    int nt=acceleratorThreads();					\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator					\
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
-  }
-
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
@@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
    Lambda(x,y,z);
  }
 }
-template<typename lambda>  __global__
-void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
-{
-  // Weird permute is to make lane coalesce for large blocks
-  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
-  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
-  uint64_t z = threadIdx.x;
-  if ( (x < num1) && (y<num2) && (z<num3) ) {
-    Lambda(x,y,z);
-  }
-}

 template<typename lambda>  __global__
 void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@@ -243,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
    }									\
  }

+inline void *acceleratorAllocHost(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMallocHost((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
+    assert(0);
+  }
+  return ptr;
+}
 inline void *acceleratorAllocShared(size_t bytes)
 {
  void *ptr=NULL;
@@ -264,18 +241,34 @@ inline void *acceleratorAllocDevice(size_t bytes)
  }
  return ptr;
 };
+
+typedef int acceleratorEvent_t;
+
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
+inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //auto discard=cudaStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}


 inline int  acceleratorIsCommunicable(void *ptr)
@@ -302,7 +295,7 @@ NAMESPACE_END(Grid);

 // Force deterministic reductions
 #define SYCL_REDUCTION_DETERMINISTIC
-#include <sycl/CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
@@ -314,8 +307,8 @@ inline void acceleratorMem(void)
  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
 }

-extern cl::sycl::queue *theGridAccelerator;
-extern cl::sycl::queue *theCopyAccelerator;
+extern sycl::queue *theGridAccelerator;
+extern sycl::queue *theCopyAccelerator;

 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@@ -326,24 +319,24 @@ extern cl::sycl::queue *theCopyAccelerator;

 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
- return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; 
+ return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
 #else
 return 0;
 #endif
 } // SYCL specific

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
+  theGridAccelerator->submit([&](sycl::handler &cgh) {		\
    unsigned long nt=acceleratorThreads();				\
    if(nt < 8)nt=8;							\
    unsigned long unum1 = num1;						\
    unsigned long unum2 = num2;						\
    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
-    cl::sycl::range<3> local {nt,1,nsimd};				\
-    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+    sycl::range<3> local {nt,1,nsimd};				\
+    sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
    cgh.parallel_for(							\
-		     cl::sycl::nd_range<3>(global,local),		\
-		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		     sycl::nd_range<3>(global,local),			\
+		     [=] (sycl::nd_item<3> item) /*mutable*/		\
 		     [[intel::reqd_sub_group_size(16)]]			\
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
@@ -356,26 +349,50 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
+inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
+inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};

 inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+
+
+///////
+// Asynch event interface
+///////
+typedef sycl::event acceleratorEvent_t;
+
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  ev.wait();
+}
+
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
+{
+  return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
+}
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
+
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int  acceleratorIsCommunicable(void *ptr)
 {
 #if 0
-  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
-  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
+  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
+  if ( uvm = sycl::usm::alloc::shared ) return 1;
  else return 0;
 #endif
  return 1;
+
 }

+
 #endif

 //////////////////////////////////////////////
@@ -472,6 +489,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
    }								\
  }

+inline void *acceleratorAllocHost(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipHostMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
+  }
+  return ptr;
+};
 inline void *acceleratorAllocShared(size_t bytes)
 {
  void *ptr=NULL;
@@ -495,37 +522,53 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

+inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-//inline void acceleratorCopySynchronise(void) {  }
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+typedef int acceleratorEvent_t;
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  return 0;
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes);
+  return 0;
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //  auto discard=hipStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
+
+
 #endif

+inline void acceleratorPin(void *ptr,unsigned long bytes)
+{
+#ifdef GRID_SYCL
+  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
+#endif
+}
+
 //////////////////////////////////////////////
 // Common on all GPU targets
 //////////////////////////////////////////////
 #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
 #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );  
-#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
-  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
-  accelerator_barrier(dummy);

 #define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
@@ -547,6 +590,8 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize

 #undef GRID_SIMT

+typedef int acceleratorEvent_t;
+
 inline void acceleratorMem(void)
 {
  /*
@@ -566,16 +611,21 @@ inline void acceleratorMem(void)

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { acceleratorCopyToDevice(from,to,bytes); return 0; }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { acceleratorCopyFromDevice(from,to,bytes); return 0; }
+inline void acceleratorEventWait(acceleratorEvent_t ev){}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); return 0;}
+
 inline void acceleratorCopySynchronise(void) {};

 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
+inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 #else
@@ -655,9 +705,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
  acceleratorCopySynchronise();
 }

-template<class T> void acceleratorPut(T& dev,T&host)
+template<class T> void acceleratorPut(T& dev,const T&host)
 {
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical                                     DO_PRAGMA(omp critical)

 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
  uint64_t *uto   = (uint64_t *)to;
  assert(bytes%8==0);
  uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
  });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
  bcopy(from,to,bytes);
 }
@@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
 int FlightRecorder::LoggingMode;
 int FlightRecorder::ChecksumComms;
 int FlightRecorder::ChecksumCommsSend;
+const char *   FlightRecorder::StepName;
+int32_t  FlightRecorder::StepLoggingCounter;
 int32_t  FlightRecorder::XmitLoggingCounter;
 int32_t  FlightRecorder::RecvLoggingCounter;
 int32_t  FlightRecorder::CsumLoggingCounter;
@@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
  CsumLoggingCounter=0;
  NormLoggingCounter=0;
  ReductionLoggingCounter=0;
+  StepName = "No steps started";
+  StepLoggingCounter=0;
 }
 void FlightRecorder::Truncate(void)
 {
@@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
    assert(0);
  }
 }
+bool FlightRecorder::StepLog(const char *name)
+{
+  StepName = name;
+  StepLoggingCounter ++;
+  return true;
+}

 void FlightRecorder::SetLoggingModePrint(void)
 {
@@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
 {
  return ErrorCounter;
 }
-void FlightRecorder::NormLog(double value)
+bool FlightRecorder::NormLog(double value)
 {
  uint64_t hex = * ( (uint64_t *)&value );
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLoggingCounter++;
+    return true;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLogVector.push_back(value);
    NormLoggingCounter++;
+    return true;
  }
  if(LoggingMode == LoggingModeVerify) {

@@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)

      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {

+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
 		NormLoggingCounter,NormLogVector.size(),
 		value, NormLogVector[NormLoggingCounter]); fflush(stderr);

-	if(!ContinueOnFail)assert(0); // Force takedown of job
+	BACKTRACEFP(stderr);
+
+	if(!ContinueOnFail) return false;
 	  
 	ErrorCounter++;
      } else {
@@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
    }
    NormLoggingCounter++;
  }
+  return true;
 }
-void FlightRecorder::CsumLog(uint64_t hex)
+bool FlightRecorder::CsumLog(uint64_t hex)
 {
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLoggingCounter++;
+    return true;
  }

  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLogVector.push_back(hex);
    CsumLoggingCounter++;
+    return true;
  }

  if(LoggingMode == LoggingModeVerify) {
@@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)

      if ( hex != hexref ) {

+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;

@@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		CsumLoggingCounter,hex, hexref);
+	BACKTRACEFP(stderr);
 	fflush(stderr);

-	if(!ContinueOnFail) assert(0); // Force takedown of job
+	if(!ContinueOnFail) return false;
 	  
 	ErrorCounter++;

@@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
    }
    CsumLoggingCounter++;
  }
+  return true;
 }
+
 void FlightRecorder::ReductionLog(double local,double global)
 {
  uint64_t hex_l = * ( (uint64_t *)&local );
@@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
  if(LoggingMode == LoggingModeVerify) {
    if(ReductionLoggingCounter < ReductionLogVector.size()){
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		ReductionLoggingCounter,ReductionLogVector.size(),
 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@@ -250,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
  if(LoggingMode == LoggingModeNone) return;

  if ( ChecksumCommsSend ){
-  uint64_t *ubuf = (uint64_t *)buf;
-  if(LoggingMode == LoggingModeNone) return;
+
+    if(LoggingMode == LoggingModeNone) return;
  
 #ifdef GRID_SYCL
+  uint64_t *ubuf = (uint64_t *)buf;
  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@@ -267,11 +298,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
  if(LoggingMode == LoggingModeVerify) {
    if(XmitLoggingCounter < XmitLogVector.size()){
      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		XmitLoggingCounter,XmitLogVector.size(),
 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@@ -293,9 +328,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
 {
  if ( ChecksumComms ){
-  uint64_t *ubuf = (uint64_t *)buf;
  if(LoggingMode == LoggingModeNone) return;
 #ifdef GRID_SYCL
+  uint64_t *ubuf = (uint64_t *)buf;
  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@@ -309,11 +344,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
  if(LoggingMode == LoggingModeVerify) {
    if(RecvLoggingCounter < RecvLogVector.size()){
      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		RecvLoggingCounter,RecvLogVector.size(),
 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@@ -12,6 +12,8 @@ class FlightRecorder {
  
  static int                   LoggingMode;
  static uint64_t              ErrorCounter;
+  static const char *                StepName;
+  static int32_t               StepLoggingCounter;
  static int32_t               XmitLoggingCounter;
  static int32_t               RecvLoggingCounter;
  static int32_t               CsumLoggingCounter;
@@ -30,8 +32,9 @@ class FlightRecorder {
  static void SetLoggingModeRecord(void);
  static void SetLoggingModeVerify(void);
  static void SetLoggingMode(LoggingMode_t mode);
-  static void NormLog(double value);
-  static void CsumLog(uint64_t csum);
+  static bool StepLog(const char *name);
+  static bool NormLog(double value);
+  static bool CsumLog(uint64_t csum);
  static void ReductionLog(double lcl, double glbl);
  static void Truncate(void);
  static void ResetCounters(void);
@@ -464,16 +464,12 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
-    std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<"  --lebesgue      : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;    
-    std::cout<<GridLogMessage<<"  --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    exit(EXIT_SUCCESS);
  }
@@ -501,28 +497,8 @@ void Grid_init(int *argc,char ***argv)
    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
    StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-  }

-  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
-    LebesgueOrder::UseLebesgueOrder=1;
-  }
  CartesianCommunicator::nCommThreads = 1;
-#ifdef GRID_COMMS_THREADS  
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
-    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
-    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
-    assert(CartesianCommunicator::nCommThreads > 0);
-  }
-#endif  
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
-    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
-    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
-  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
    GridLogTimestamp(0);
  } else {
@@ -533,6 +509,13 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    FlightRecorder::PrintEntireLog = 1;
+    FlightRecorder::ChecksumComms  = 1;
+    FlightRecorder::ChecksumCommsSend=1;
+  }
  
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
@@ -573,8 +556,34 @@ void GridLogLayout() {

 void * Grid_backtrace_buffer[_NBACKTRACE];

+void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
+{
+  fprintf(stderr,"Signal handler on host %s\n",hostname);
+  fprintf(stderr,"FlightRecorder step %d stage %s \n",
+	  FlightRecorder::StepLoggingCounter,
+	  FlightRecorder::StepName);
+  fprintf(stderr,"Caught signal %d\n",si->si_signo);
+  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
+  fprintf(stderr,"         code %d\n",si->si_code);
+  // x86 64bit
+#ifdef __linux__
+#ifdef __x86_64__
+  ucontext_t * uc= (ucontext_t *)ptr;
+  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
+  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
+#endif
+#endif
+  fflush(stderr);
+  BACKTRACEFP(stderr);
+  fprintf(stderr,"Called backtrace\n");
+  fflush(stdout);
+  fflush(stderr);
+  return;
+}
+
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
+  fprintf(stderr,"Signal handler on host %s\n",hostname);
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
  fprintf(stderr,"         code %d\n",si->si_code);
@@ -585,7 +594,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  ucontext_t * uc= (ucontext_t *)ptr;
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
-#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
+#define REG(A)  fprintf(stderr,"  %s %lx\n",#A,sc-> A);
  REG(rdi);
  REG(rsi);
  REG(rbp);
@@ -618,8 +627,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)

 void Grid_exit_handler(void)
 {
-  BACKTRACEFP(stdout);
-  fflush(stdout);
+  //  BACKTRACEFP(stdout);
+  //  fflush(stdout);
 }
 void Grid_debug_handler_init(void)
 {
@@ -627,10 +636,10 @@ void Grid_debug_handler_init(void)
  sigemptyset (&sa.sa_mask);
  sa.sa_sigaction= Grid_sa_signal_handler;
  sa.sa_flags    = SA_SIGINFO;
-  sigaction(SIGSEGV,&sa,NULL);
+  //  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);
-  sigaction(SIGUSR2,&sa,NULL);
+  //  sigaction(SIGUSR2,&sa,NULL);

  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);

@@ -638,7 +647,15 @@ void Grid_debug_handler_init(void)
  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);

-  atexit(Grid_exit_handler);
+  // Non terminating SIGUSR1/2 handler
+  struct sigaction sa_ping;
+  sigemptyset (&sa_ping.sa_mask);
+  sa_ping.sa_sigaction= Grid_usr_signal_handler;
+  sa_ping.sa_flags    = SA_SIGINFO;
+  sigaction(SIGHUP,&sa_ping,NULL);
+
+  //  atexit(Grid_exit_handler);
 }

 NAMESPACE_END(Grid);
+
@@ -50,7 +50,7 @@ namespace Grid{
      int64_t index64;
      IndexFromCoorReversed(coor,index64,dims);
      if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	//	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
      }
      assert(index64<2*1024*1024*1024LL);
      index = (int) index64;
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = Grid HMC benchmarks tests examples
+SUBDIRS = Grid  benchmarks tests examples HMC

 include $(top_srcdir)/doxygen.inc

@@ -644,11 +644,6 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
-#else
-  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
  Benchmark::Decomposition();

  int do_su4=1;
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
    pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));

    std::vector<double> stop(threads);
-    Vector<Vec> sum(threads);
+    std::vector<Vec> sum(threads);

    std::vector<LatticeVec> x(threads,&Grid);
    for(int t=0;t<threads;t++){
@@ -78,9 +78,9 @@ int main (int argc, char ** argv)
    double t0,t1;
    
    typedef typename DomainWallFermionD::Coeff_t Coeff_t;
-    Vector<Coeff_t> diag = Dw.bs;
-    Vector<Coeff_t> upper= Dw.cs;
-    Vector<Coeff_t> lower= Dw.cs;
+    std::vector<Coeff_t> diag = Dw.bs;
+    std::vector<Coeff_t> upper= Dw.cs;
+    std::vector<Coeff_t> lower= Dw.cs;
    upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
    lower[0]   =-Dw.mass_plus*lower[0];
    
@@ -118,7 +118,7 @@ public:
    fprintf(FP,"Packet bytes, direction, GB/s per node\n");
    for(int lat=16;lat<=maxlat;lat+=8){
      //      for(int Ls=8;Ls<=8;Ls*=2){
-      { int Ls=12;
+      { int Ls=8;

 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
@@ -492,17 +492,18 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
+	uint64_t ni    = 100;

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
-	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	  for(uint64_t j=0;j<ni;j++){
+	    Dw.DhopEO(src_o,r_e,DaggerNo);
+	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@@ -520,11 +521,11 @@ public:
 	double mf_hi, mf_lo, mf_err;

 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;

      }

@@ -654,17 +656,19 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;

-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
+	uint64_t ni    = 100;

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
-	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  for(uint64_t j=0;j<ni;j++){
+	    Ds.DhopEO(src_o,r_e,DaggerNo);
+	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@@ -675,11 +679,11 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
      
      }

@@ -792,19 +797,18 @@ public:
 	  Dc.M(src,r);
 	}
 	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t ni = 100;
+	uint64_t no = 50;

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  Dc.M(src,r);
-	  t1=usecond();
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
+	  double t0=usecond();
+	  for(uint64_t j=0;j<ni;j++){
+	    Dc.M(src,r);
+	  }
+	  double t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call   "<< timestat.mean/ni<<std::endl;
      
      }

@@ -861,7 +866,7 @@ int main (int argc, char ** argv)
  }

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
+  //  LebesgueOrder::Block = std::vector<int>({2,2,2,2});

  Benchmark::Decomposition();

@@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

@@ -128,6 +129,20 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### internal reduction
+AC_ARG_ENABLE([reduction],
+    [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
+    [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
+
+case ${ac_REDUCTION} in
+    mpi)
+        ;;
+    grid)
+        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
+    *)
+        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
+esac
+
 ############### tracing
 AC_ARG_ENABLE([tracing],
    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
@@ -225,19 +240,21 @@ case ${ac_SFW_FP16} in
      AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac

-############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
+
+############### MPI BOUNCE TO HOST
 AC_ARG_ENABLE([accelerator-aware-mpi],
    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])

+# Force accelerator CSHIFT now
+AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
+
 case ${ac_ACCELERATOR_AWARE_MPI} in
    yes)
-      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
    *);;
 esac

-
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
    [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@@ -664,16 +681,6 @@ case ${ac_SHM_FAST_PATH} in
     *) ;;
 esac

-############### communication type selection
-AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
-
-case ${ac_COMMS_THREADS} in
-     yes)
-        AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
-      ;;
-     *) ;;
-esac
-
 ############### communication type selection
 AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

--- a/Show More
+++ b/Show More