Updates in tests to make all of Grid compile

2026-05-12 05:04:30 +01:00 · 2018-12-14 16:55:54 +00:00
parent afc462bd58
commit 422764757d
26 changed files with 388 additions and 399 deletions
@@ -157,7 +157,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  Nblock=8;
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

-  X.checkerboard = B.checkerboard;
+  X.Checkerboard() = B.Checkerboard();
  conformable(X, B);

  Field tmp(B);
@@ -336,7 +336,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &

  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

-  Psi.checkerboard = Src.checkerboard;
+  Psi.Checkerboard() = Src.Checkerboard();
  conformable(Psi, Src);

  Field P(Src);
@@ -515,7 +515,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;

  for(int b=0;b<Nblock;b++){ 
-    X[b].checkerboard = B[b].checkerboard;
+    X[b].Checkerboard() = B[b].Checkerboard();
    conformable(X[b], B[b]);
    conformable(X[b], X[0]); 
  }
@@ -1,4 +1,4 @@
-/*************************************************************************************
+    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -24,197 +24,198 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_COMPARISON_H
-#define GRID_COMPARISON_H
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#pragma once

 NAMESPACE_BEGIN(Grid);

-/////////////////////////////////////////
-// This implementation is a bit poor.
-//
-// Only support relational logical operations (<, >  etc)
-// on scalar objects. Therefore can strip any tensor structures.
-//
-// Should guard this with isGridTensor<> enable if?
-/////////////////////////////////////////
-//
-// Generic list of functors
-//
-template<class lobj,class robj> class veq {
-public:
-  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) == (rhs);
-  }
-};
-template<class lobj,class robj> class vne {
-public:
-  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) != (rhs);
-  }
-};
-template<class lobj,class robj> class vlt {
-public:
-  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) < (rhs);
-  }
-};
-template<class lobj,class robj> class vle {
-public:
-  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) <= (rhs);
-  }
-};
-template<class lobj,class robj> class vgt {
-public:
-  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) > (rhs);
-  }
-};
-template<class lobj,class robj> class vge {
-public:
-  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) >= (rhs);
-  }
-};
+  /////////////////////////////////////////
+  // This implementation is a bit poor.
+  //
+  // Relational operations (<, >, etc.) are only supported
+  // on scalar objects, so any tensor structure can be stripped.
+  //
+  // TODO: should this be guarded with an isGridTensor<> enable_if?
+  /////////////////////////////////////////
+  //
+  // Generic list of functors (SIMD flavour: each returns a vInteger)
+  //
+  template<class lobj,class robj> class veq {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) == (rhs);
+    }
+  };
+  template<class lobj,class robj> class vne {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) != (rhs);
+    }
+  };
+  template<class lobj,class robj> class vlt {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) < (rhs);
+    }
+  };
+  template<class lobj,class robj> class vle {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) <= (rhs);
+    }
+  };
+  template<class lobj,class robj> class vgt {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) > (rhs);
+    }
+  };
+  template<class lobj,class robj> class vge {
+    public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) >= (rhs);
+    }
+  };
  
-// Generic list of functors
-template<class lobj,class robj> class seq {
-public:
-  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) == (rhs);
-  }
-};
-template<class lobj,class robj> class sne {
-public:
-  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) != (rhs);
-  }
-};
-template<class lobj,class robj> class slt {
-public:
-  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) < (rhs);
-  }
-};
-template<class lobj,class robj> class sle {
-public:
-  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) <= (rhs);
-  }
-};
-template<class lobj,class robj> class sgt {
-public:
-  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) > (rhs);
-  }
-};
-template<class lobj,class robj> class sge {
-public:
-  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
-  { 
-    return (lhs) >= (rhs);
-  }
-};
+  // Scalar counterparts of the functors above (each returns an Integer)
+  template<class lobj,class robj> class seq {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) == (rhs);
+    }
+  };
+  template<class lobj,class robj> class sne {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) != (rhs);
+    }
+  };
+  template<class lobj,class robj> class slt {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) < (rhs);
+    }
+  };
+  template<class lobj,class robj> class sle {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) <= (rhs);
+    }
+  };
+  template<class lobj,class robj> class sgt {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) > (rhs);
+    }
+  };
+  template<class lobj,class robj> class sge {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) >= (rhs);
+    }
+  };

-//////////////////////////////////////////////////////////////////////////////////////////////////////
-// Integer and real get extra relational functions.
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
-{
-  typedef typename vsimd::scalar_type scalar;
-  ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
-  ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
-  ExtractBuffer<Integer> vpred(vsimd::Nsimd());
-  vInteger ret;
-  extract<vsimd,scalar>(lhs,vlhs);
-  extract<vsimd,scalar>(rhs,vrhs);
-  for(int s=0;s<vsimd::Nsimd();s++){
-    vpred[s] = sop(vlhs[s],vrhs[s]);
-  }
-  merge<vInteger,Integer>(ret,vpred);
-  return ret;
-}
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Integer and real get extra relational functions.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
+    {
+      typedef typename vsimd::scalar_type scalar;
+      ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
+      ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
+      ExtractBuffer<Integer> vpred(vsimd::Nsimd());
+      vInteger ret;
+      extract<vsimd,scalar>(lhs,vlhs);
+      extract<vsimd,scalar>(rhs,vrhs);
+      for(int s=0;s<vsimd::Nsimd();s++){
+	vpred[s] = sop(vlhs[s],vrhs[s]);
+      }
+      merge<vInteger,Integer>(ret,vpred);
+      return ret;
+    }

-template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
-{
-  typedef typename vsimd::scalar_type scalar;
-  ExtractBuffer<scalar>  vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
-  ExtractBuffer<Integer> vpred(vsimd::Nsimd());
-  vInteger ret;
-  extract<vsimd,scalar>(lhs,vlhs);
-  for(int s=0;s<vsimd::Nsimd();s++){
-    vpred[s] = sop(vlhs[s],rhs);
-  }
-  merge<vInteger,Integer>(ret,vpred);
-  return ret;
-}
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
+    {
+      typedef typename vsimd::scalar_type scalar;
+      ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
+      ExtractBuffer<Integer> vpred(vsimd::Nsimd());
+      vInteger ret;
+      extract<vsimd,scalar>(lhs,vlhs);
+      for(int s=0;s<vsimd::Nsimd();s++){
+	vpred[s] = sop(vlhs[s],rhs);
+      }
+      merge<vInteger,Integer>(ret,vpred);
+      return ret;
+    }

-template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
-{
-  typedef typename vsimd::scalar_type scalar;
-  ExtractBuffer<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
-  ExtractBuffer<Integer> vpred(vsimd::Nsimd());
-  vInteger ret;
-  extract<vsimd,scalar>(rhs,vrhs);
-  for(int s=0;s<vsimd::Nsimd();s++){
-    vpred[s] = sop(lhs,vrhs[s]);
-  }
-  merge<vInteger,Integer>(ret,vpred);
-  return ret;
-}
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
+    inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
+    {
+      typedef typename vsimd::scalar_type scalar;
+      ExtractBuffer<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
+      ExtractBuffer<Integer> vpred(vsimd::Nsimd());
+      vInteger ret;
+      extract<vsimd,scalar>(rhs,vrhs);
+      for(int s=0;s<vsimd::Nsimd();s++){
+	vpred[s] = sop(lhs,vrhs[s]);
+      }
+      merge<vInteger,Integer>(ret,vpred);
+      return ret;
+    }

 #define DECLARE_RELATIONAL_EQ(op,functor) \
-  template<class vsimd,IfSimd<vsimd> = 0>				\
-  accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)	\
-  {									\
-    typedef typename vsimd::scalar_type scalar;				\
-    return Comparison(functor<scalar,scalar>(),lhs,rhs);		\
-  }									\
-  template<class vsimd,IfSimd<vsimd> = 0>				\
-  accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
-  {									\
-    typedef typename vsimd::scalar_type scalar;				\
-    return Comparison(functor<scalar,scalar>(),lhs,rhs);		\
-  }									\
-  template<class vsimd,IfSimd<vsimd> = 0>				\
-  accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
-  {									\
-    typedef typename vsimd::scalar_type scalar;				\
-    return Comparison(functor<scalar,scalar>(),lhs,rhs);		\
-  }									\
-  template<class vsimd>							\
-  accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs) \
-  {									\
-    return lhs._internal op rhs._internal;				\
-  }									\
-  template<class vsimd>							\
-  accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
-  {									\
-    return lhs._internal op rhs;					\
-  }									\
-  template<class vsimd>							\
-  accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
-  {									\
-    return lhs op rhs._internal;					\
+  template<class vsimd,IfSimd<vsimd> = 0>\
+    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
+    {\
+      typedef typename vsimd::scalar_type scalar;\
+      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
+    }\
+  template<class vsimd,IfSimd<vsimd> = 0>\
+    inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+    {\
+      typedef typename vsimd::scalar_type scalar;\
+      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
+    }\
+  template<class vsimd,IfSimd<vsimd> = 0>\
+    inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+    {\
+      typedef typename vsimd::scalar_type scalar;\
+      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
+    }\
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+    {									\
+      return lhs._internal op rhs;					\
+    }									\
+  template<class vsimd>\
+    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+    {									\
+      return lhs op rhs._internal;					\
    }									\

-#define DECLARE_RELATIONAL(op,functor) DECLARE_RELATIONAL_EQ(op,functor)    
+#define DECLARE_RELATIONAL(op,functor) \
+  DECLARE_RELATIONAL_EQ(op,functor)    \
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+    {									\
+      return lhs._internal op rhs._internal;				\
+    }									

 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
@@ -228,4 +229,4 @@ DECLARE_RELATIONAL(!=,sne);
 NAMESPACE_END(Grid);


-#endif
+
@@ -477,7 +477,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
  typedef typename vobj::vector_type vector_type;
  
  int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
-  Vector<ComplexD> ip(Nblock);
+  std::vector<ComplexD> ip(Nblock);
  sn.resize(Nblock);
  
  sliceInnerProductVector(ip,rhs,rhs,Orthog);
@@ -586,6 +586,10 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
+
+  auto X_v=X.View();
+  auto Y_v=Y.View();
+  auto R_v=R.View();
  thread_region
  {
    Vector<vobj> s_x(Nblock);
@@ -595,16 +599,16 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
      int o  = n*stride + b;

      for(int i=0;i<Nblock;i++){
-	s_x[i] = X[o+i*ostride];
+	s_x[i] = X_v[o+i*ostride];
      }

      vobj dot;
      for(int i=0;i<Nblock;i++){
-	dot = Y[o+i*ostride];
+	dot = Y_v[o+i*ostride];
 	for(int j=0;j<Nblock;j++){
 	  dot = dot + s_x[j]*(scale*aa(j,i));
 	}
-	R[o+i*ostride]=dot;
+	R_v[o+i*ostride]=dot;
      }
    }});
  }
@@ -635,6 +639,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
+  auto R_v = R.View();
+  auto X_v = X.View();
  thread_region
  {
    std::vector<vobj> s_x(Nblock);
@@ -645,7 +651,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
      int o  = n*stride + b;

      for(int i=0;i<Nblock;i++){
-	s_x[i] = X[o+i*ostride];
+	s_x[i] = X_v[o+i*ostride];
      }

      vobj dot;
@@ -654,7 +660,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
 	for(int j=1;j<Nblock;j++){
 	  dot = dot + s_x[j]*(scale*aa(j,i));
 	}
-	R[o+i*ostride]=dot;
+	R_v[o+i*ostride]=dot;
      }
    }});
  }
@@ -692,6 +698,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>

  typedef typename vobj::vector_typeD vector_typeD;

+  auto lhs_v=lhs.View();
+  auto rhs_v=rhs.View();
  thread_region
  {
    std::vector<vobj> Left(Nblock);
@@ -704,8 +712,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
      int o  = n*stride + b;

      for(int i=0;i<Nblock;i++){
-	Left [i] = lhs[o+i*ostride];
-	Right[i] = rhs[o+i*ostride];
+	Left [i] = lhs_v[o+i*ostride];
+	Right[i] = rhs_v[o+i*ostride];
      }

      for(int i=0;i<Nblock;i++){
@@ -63,7 +63,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
    }
  
  virtual void Meooe(const FermionField &in, FermionField &out) {
-    if (in.checkerboard == Odd) {
+    if (in.Checkerboard() == Odd) {
      this->DhopEO(in, out, DaggerNo);
    } else {
      this->DhopOE(in, out, DaggerNo);
@@ -71,7 +71,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
  }
  
  virtual void MeooeDag(const FermionField &in, FermionField &out) {
-    if (in.checkerboard == Odd) {
+    if (in.Checkerboard() == Odd) {
      this->DhopEO(in, out, DaggerYes);
    } else {
      this->DhopOE(in, out, DaggerYes);
@@ -80,7 +80,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
  
  // allow override for twisted mass and clover
  virtual void Mooee(const FermionField &in, FermionField &out) {
-    out.checkerboard = in.checkerboard;
+    out.Checkerboard() = in.Checkerboard();
    //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
    for (int s=0;s<(int)this->mass.size();s++) {
      ComplexD a = 4.0+this->mass[s];
@@ -90,7 +90,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
  }
  
  virtual void MooeeDag(const FermionField &in, FermionField &out) {
-    out.checkerboard = in.checkerboard;
+    out.Checkerboard() = in.Checkerboard();
    for (int s=0;s<(int)this->mass.size();s++) {
      ComplexD a = 4.0+this->mass[s];
      ComplexD b(0.0,-this->mu[s]);
@@ -121,9 +121,9 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
  }
  
  virtual RealD M(const FermionField &in, FermionField &out) {
-    out.checkerboard = in.checkerboard;
+    out.Checkerboard() = in.Checkerboard();
    this->Dhop(in, out, DaggerNo);
-    FermionField tmp(out._grid);
+    FermionField tmp(out.Grid());
    for (int s=0;s<(int)this->mass.size();s++) {
      ComplexD a = 4.0+this->mass[s];
      ComplexD b(0.0,this->mu[s]);
@@ -81,16 +81,20 @@ public:

  virtual RealD S(const Field &p)
  {
-    assert(p._grid->Nd() == Ndim);
-    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    assert(p.Grid()->Nd() == Ndim);
+    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
    phiStencil.HaloExchange(p, compressor);
-    Field action(p._grid), pshift(p._grid), phisquared(p._grid);
+    Field action(p.Grid()), pshift(p.Grid()), phisquared(p.Grid());
    phisquared = p * p;
    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    
+    
+    auto p_v = p.View();
+    auto action_v = action.View();
    for (int mu = 0; mu < Ndim; mu++)
    {
      //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      parallel_for(int i = 0; i < p.Grid()->oSites(); i++)
      {
        int permute_type;
        StencilEntry *SE;
@@ -98,23 +102,20 @@ public:
        const vobj *temp, *t_p;

        SE = phiStencil.GetEntry(permute_type, mu, i);
-        t_p = &p._odata[i];
+        t_p = &p_v[i];
        if (SE->_is_local)
        {
-          temp = &p._odata[SE->_offset];
-          if (SE->_permute)
-          {
+          temp = &p_v[SE->_offset];
+          if (SE->_permute) {
            permute(temp2, *temp, permute_type);
-            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
-          }
-          else
-          {
-            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+            action_v[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          } else {
+            action_v[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
          }
        }
        else
        {
-          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+          action_v[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
        }
      }
      //  action -= pshift*p + p*pshift;
@@ -127,12 +128,12 @@ public:
  virtual void deriv(const Field &p, Field &force)
  {
    double t0 = usecond();
-    assert(p._grid->Nd() == Ndim);
+    assert(p.Grid()->Nd() == Ndim);
    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
    double interm_t = usecond();

    // move this outside
-    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);

    phiStencil.HaloExchange(p, compressor);
    double halo_t = usecond();
@@ -145,59 +146,51 @@ public:
    for (int point = 0; point < npoint; point++)
    {

-#pragma omp parallel 
-{
-        int permute_type;
-        StencilEntry *SE;
-        const vobj *temp;
+      auto p_v = p.View();
+      auto force_v = force.View();
+      // Scratch stays private per iteration (it was per-thread in the old omp parallel
+      // region); declared outside parallel_for it would be shared by all threads: a race.
+      parallel_for (int i = 0; i < p.Grid()->oSites(); i++) {
+	int permute_type;
+	StencilEntry *SE;
+	const vobj *temp;
 
-#pragma omp for schedule(static, chunk)
-      for (int i = 0; i < p._grid->oSites(); i++)
-      {
-        SE = phiStencil.GetEntry(permute_type, point, i);
-        // prefetch next p?
-
-        if (SE->_is_local)
-        {
-          temp = &p._odata[SE->_offset];
-      
-          if (SE->_permute)
-          {
+	SE = phiStencil.GetEntry(permute_type, point, i);
+	// prefetch next p?
+	  
+	if (SE->_is_local) {
+	  temp = &p_v[SE->_offset];
+	    
+          if (SE->_permute) {
            vobj temp2;
            permute(temp2, *temp, permute_type);
-            force._odata[i] -= temp2;
+            force_v[i] -= temp2;
+          } else {
+            force_v[i] -= *temp; // slow part. Dominated by this read/write (BW)
          }
-          else
-          {
-            force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
-          }
-        }
-        else
-        {
-          force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+        } else {
+          force_v[i] -= phiStencil.CommBuf()[SE->_offset];
        }
      }
-
    }
-  }
-  force *= N / g;
+    force *= N / g;

-  double t1 = usecond();
-  double total_time = (t1 - t0) / 1e6;
-  double interm_time = (interm_t - t0) / 1e6;
-  double halo_time = (halo_t - interm_t) / 1e6;
-  double stencil_time = (t1 - halo_t) / 1e6;
-  std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
-  std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
-  std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
-  std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
-  double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
-  double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
-  double Gflops = flops / (total_time * 1e9);
-  double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
-  std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
-  std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
-}
+    double t1 = usecond();
+    double total_time = (t1 - t0) / 1e6;
+    double interm_time = (interm_t - t0) / 1e6;
+    double halo_time = (halo_t - interm_t) / 1e6;
+    double stencil_time = (t1 - halo_t) / 1e6;
+    std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
+    std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+    std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
+    std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
+    double flops = p.Grid()->gSites() * (14 * N * N * N + 18 * N * N + 2);
+    double flops_no_stencil = p.Grid()->gSites() * (14 * N * N * N + 6 * N * N + 2);
+    double Gflops = flops / (total_time * 1e9);
+    double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
+    std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
+    std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+  }
 };

 NAMESPACE_END(Grid);
@@ -73,7 +73,7 @@ public:
    if ((traj % Params.saveInterval) == 0) {
      std::string config, rng;
      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U._grid;
+      GridBase *grid = U.Grid();
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
      IldgWriter _IldgWriter(grid->IsBoss());
@@ -75,7 +75,7 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    if ((traj % Params.saveInterval) == 0) {
      std::string config, rng;
      this->build_filenames(traj, Params, config, rng);
-      GridBase *grid = U._grid;
+      GridBase *grid = U.Grid();
      uint32_t nersc_csum,scidac_csuma,scidac_csumb;
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
      ScidacWriter _ScidacWriter(grid->IsBoss());
@@ -128,12 +128,12 @@ public:
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
-    GaugeMat Ut(Umu._grid), P(Umu._grid);
+    GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
    ComplexD out;
-    int T = Umu._grid->GlobalDimensions()[3];
-    int X = Umu._grid->GlobalDimensions()[0];
-    int Y = Umu._grid->GlobalDimensions()[1];
-    int Z = Umu._grid->GlobalDimensions()[2];
+    int T = Umu.Grid()->GlobalDimensions()[3];
+    int X = Umu.Grid()->GlobalDimensions()[0];
+    int Y = Umu.Grid()->GlobalDimensions()[1];
+    int Z = Umu.Grid()->GlobalDimensions()[2];

    Ut = peekLorentz(Umu,3); //Select temporal direction
    P = Ut;
@@ -55,13 +55,13 @@ LOGICAL_BINOP(||);
 LOGICAL_BINOP(&&);

 template <class T>
-strong_inline bool operator==(const iScalar<T> &t1, const iScalar<T> &t2)
+accelerator_inline bool operator==(const iScalar<T> &t1, const iScalar<T> &t2)
 {
  return (t1._internal == t2._internal);
 }

 template <class T, int N>
-strong_inline bool operator==(const iVector<T, N> &t1, const iVector<T, N> &t2)
+accelerator_inline bool operator==(const iVector<T, N> &t1, const iVector<T, N> &t2)
 {
  bool res = true;

@@ -74,7 +74,7 @@ strong_inline bool operator==(const iVector<T, N> &t1, const iVector<T, N> &t2)
 }

 template <class T, int N>
-strong_inline bool operator==(const iMatrix<T, N> &t1, const iMatrix<T, N> &t2)
+accelerator_inline bool operator==(const iMatrix<T, N> &t1, const iMatrix<T, N> &t2)
 {
  bool res = true;