diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
index cf7147b9..da63d5e6 100644
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
+Author: Christoph Lehner <christoph@lhnr.de>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -94,7 +95,7 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
 template <class lobj> accelerator_inline 
 const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 {
-  auto view = arg.View();
+  auto view = arg.AcceleratorView(ViewRead); // read-only view of arg, built on every call; NOTE(review): AcceleratorView issues a prefetch on the host path - confirm eval is not called per site from host code
   return view[ss];
 }
 
diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h
index 3543d6aa..c4a67620 100644
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -7,6 +7,7 @@
     Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   conformable(ret,rhs);
   conformable(lhs,rhs);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,rhs);
   conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(lhs,ret);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     mult(&tmp,&lhs_v(ss),&rhs);
@@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,lhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(ret,lhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   ret.Checkerboard() = lhs.Checkerboard();
   conformable(lhs,ret);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
@@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite); // output lattice
+  auto rhs_v = rhs.AcceleratorView(ViewRead);  // view the lattice operand rhs (lhs is the non-lattice obj2 and has no view)
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite); // output lattice
+  auto rhs_v = rhs.AcceleratorView(ViewRead);  // view the lattice operand rhs (lhs is the non-lattice obj2 and has no view)
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite); // output lattice
+  auto rhs_v = rhs.AcceleratorView(ViewRead);  // view the lattice operand rhs (lhs is the non-lattice obj2 and has no view)
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   ret.Checkerboard() = rhs.Checkerboard();
   conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite); // output lattice
+  auto rhs_v = rhs.AcceleratorView(ViewRead);  // view the lattice operand rhs (lhs is the non-lattice obj2 and has no view)
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
     decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
@@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
   ret.Checkerboard() = x.Checkerboard();
   conformable(ret,x);
   conformable(x,y);
-  auto ret_v = ret.View();
-  auto x_v = x.View();
-  auto y_v = y.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
   accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
     auto tmp = a*x_v(ss)+y_v(ss);
     coalescedWrite(ret_v[ss],tmp);
@@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
   ret.Checkerboard() = x.Checkerboard();
   conformable(ret,x);
   conformable(x,y);
-  auto ret_v = ret.View();
-  auto x_v = x.View();
-  auto y_v = y.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
   accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
     auto tmp = a*x_v(ss)+b*y_v(ss);
     coalescedWrite(ret_v[ss],tmp);
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 157c647b..30aa6b06 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -54,8 +54,20 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 // Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 enum LatticeAcceleratorAdvise {
-  AdviseInfrequentUse = 0x1    // Advise that the data is used infrequently.  This can
-                               // significantly influence performance of bulk storage.
+  AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
+                                // significantly influence performance of bulk storage.
+  AdviseReadMostly = 0x2,       // Data will mostly be read.  On some architectures
+                                // enables read-only copies of memory to be kept on
+                                // host and device.
+};  // enumerators are distinct bits; the advise handler tests each one with '&'
+
+////////////////////////////////////////////////////////////////////////////
+// View Access Mode
+////////////////////////////////////////////////////////////////////////////
+enum ViewMode {        // access intent passed to AcceleratorView/HostView; the prefetch helpers do not act on it yet
+  ViewRead = 0x1,      // requested for operands that are only read
+  ViewWrite = 0x2,     // requested for the lattice that receives the result
+  ViewReadWrite = 0x3  // ViewRead | ViewWrite; the default mode of AcceleratorView/HostView
 };
 
 ////////////////////////////////////////////////////////////////////////////
@@ -91,6 +103,29 @@ public:
     if (advise & AdviseInfrequentUse) {
       cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
     }
+    if (advise & AdviseReadMostly) {
+      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
+    }
+#endif
+#endif
+  };
+
+  accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // asynchronously prefetch _odata to the current CUDA device; accessMode is unused for now (will use accessMode in future)
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    int target;                 // id of the current device, filled in by cudaGetDevice
+    cudaGetDevice(&target);     // NOTE(review): return status is not checked
+    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); // whole buffer; NOTE(review): return status is not checked
+    // Deliberately no per-call log line here: this runs every time an accelerator view is created.
+#endif
+#endif
+  };
+
+  accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // asynchronously prefetch _odata to host memory; accessMode is unused for now (will use accessMode in future)
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); // whole buffer; NOTE(review): return status is not checked
+    // Deliberately no per-call log line here: this runs every time a host view is created.
 #endif
 #endif
   };
@@ -225,9 +260,23 @@ public:
   // The view is trivially copy constructible and may be copied to an accelerator device
   // in device lambdas
   /////////////////////////////////////////////////////////////////////////////////
-  LatticeView<vobj> View (void) const 
+  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
+  {                                   //                     and HostView        for thread_for
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); // C-style cast also strips the const of this
+    return accessor;                  // returned by value; no prefetch is issued on this path
+  }
+
+  LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const // view for accelerator_for loops; mode is one of the ViewMode values
   {
     LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    accessor.AcceleratorPrefetch(mode); // request the prefetch towards the device before handing out the view
+    return accessor;                    // returned by value
+  }
+
+  LatticeView<vobj> HostView(int mode = ViewReadWrite) const // view for thread_for loops; mode is one of the ViewMode values
+  {
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); // C-style cast also strips the const of this
+    accessor.HostPrefetch(mode); // request the prefetch towards the host before handing out the view
     return accessor;
   }
   
@@ -251,7 +300,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
     accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,expr);
       vstream(me[ss],tmp);
@@ -270,7 +319,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
     accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,expr);
       vstream(me[ss],tmp);
@@ -288,7 +337,7 @@ public:
     CBFromExpression(cb,expr);
     assert( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
     accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,expr);
       vstream(me[ss],tmp);
@@ -399,8 +448,9 @@ public:
     typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
     conformable(*this,r);
     this->checkerboard = r.Checkerboard();
-    auto me =   View();
-    auto him= r.View();
+    // No per-assignment log line here: printing on every copy floods the output.
+    auto me =   AcceleratorView(ViewWrite); // destination: this lattice
+    auto him= r.AcceleratorView(ViewRead);  // source: r
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
@@ -413,8 +463,9 @@ public:
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     this->checkerboard = r.Checkerboard();
     conformable(*this,r);
-    auto me =   View();
-    auto him= r.View();
+    // No per-assignment log line here: printing on every copy floods the output.
+    auto me =   AcceleratorView(ViewWrite); // destination: this lattice
+    auto him= r.AcceleratorView(ViewRead);  // source: r
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
       coalescedWrite(me[ss],him(ss));
     });
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 8acbde66..3147823d 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -103,8 +103,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   GridBase *grid = left.Grid();
   
   // Might make all code paths go this way.
-  auto left_v = left.View();
-  auto right_v=right.View();
+  auto left_v = left.AcceleratorView(ViewRead);
+  auto right_v=right.AcceleratorView(ViewRead);
 
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
@@ -175,9 +175,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   
   GridBase *grid = x.Grid();
 
-  auto x_v=x.View();
-  auto y_v=y.View();
-  auto z_v=z.View();
+  auto x_v=x.AcceleratorView(ViewRead);
+  auto y_v=y.AcceleratorView(ViewRead);
+  auto z_v=z.AcceleratorView(ViewWrite);
 
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
@@ -224,8 +224,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 
   GridBase *grid = left.Grid();
 
-  auto left_v=left.View();
-  auto right_v=right.View();
+  auto left_v=left.AcceleratorView(ViewRead);
+  auto right_v=right.AcceleratorView(ViewRead);
 
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();