Merge pull request #15 from paboyle/develop

Sync with upstream
2025-11-04 14:04:32 +00:00 · 2020-09-07 14:20:33 +02:00
parent 249e2db87d 65b724bb5f
commit 51d1beb1f3
32 changed files with 930 additions and 341 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -42,7 +42,7 @@
 #ifdef __NVCC__REDEFINE__
 #pragma pop_macro("__CUDACC__")
 #pragma pop_macro("__NVCC__")
-#pragma pop_macro("GRID_SIMT")
+#pragma pop_macro("__CUDA_ARCH__")
 #pragma pop
 #endif
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -65,8 +65,7 @@ public:
    MemoryManager::CpuFree((void *)__p,bytes);
  }
-  // FIXME: hack for the copy constructor, eventually it must be avoided
+  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
  void construct(pointer __p, const _Tp& __val) { assert(0);};
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
@@ -74,6 +73,9 @@ public:
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 //////////////////////////////////////////////////////////////////////////////////////
 // Unified virtual memory
 //////////////////////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class uvmAllocator {
 public: 
@@ -109,22 +111,63 @@ public:
    MemoryManager::SharedFree((void *)__p,bytes);
  }
  // FIXME: hack for the copy constructor, eventually it must be avoided
  void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
  //void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 ////////////////////////////////////////////////////////////////////////////////
 // Device memory
 ////////////////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class devAllocator {
 public: 
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef devAllocator<_Tp1> other; };
  devAllocator() throw() { }
  devAllocator(const devAllocator&) throw() { }
  template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
  ~devAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
  pointer allocate(size_type __n, const void* _p= 0)
  { 
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
    return ptr;
  }
  void deallocate(pointer __p, size_type __n) 
  { 
    size_type bytes = __n * sizeof(_Tp);
    profilerFree(bytes);
    MemoryManager::AcceleratorFree((void *)__p,bytes);
  }
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using commAllocator = uvmAllocator<T>;
+//template<class T> using commAllocator = devAllocator<T>;
 template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
-template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
+template<class T> using commVector = std::vector<T,devAllocator<T> >;
 //template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 NAMESPACE_END(Grid);
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -93,12 +93,12 @@ private:
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static void PrintBytes(void);
 public:
  static void Init(void);
  static void InitMessage(void);
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static void *SharedAllocate(size_t bytes);
  static void  SharedFree    (void *ptr,size_t bytes);
  static void *CpuAllocate(size_t bytes);
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -302,60 +302,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int bytes)
 {
  std::vector<CommsRequest_t> reqs(0);
-  //    unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  //    unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
-  //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
  //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) {
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
+  // Enforce no UVM in comms, device or host OK
-    MPI_Request xrq;
+  assert(acceleratorIsCommunicable(xmit));
-    MPI_Request rrq;
+  assert(acceleratorIsCommunicable(recv));
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else {
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		    recv,bytes,MPI_CHAR,from, from,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);
  }
 }
  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest,
 						     void *recv,
@@ -411,15 +379,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
@@ -430,6 +390,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  assert(ierr==0);
  list.resize(0);
 }
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 //{
 //}
 void CartesianCommunicator::Barrier(void)
 {
  int ierr = MPI_Barrier(communicator);
--- a/Grid/cshift/Cshift.h
+++ b/Grid/cshift/Cshift.h
@@ -52,23 +52,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<typename Op, typename T1> 
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
-auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
+auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
 template <class Op, class T1, class T2>
 auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
 template <class Op, class T1, class T2, class T3>
 auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
 				   eval(0, expr.arg2),
 				   eval(0, expr.arg3)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -76,8 +76,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
    autoView(rhs_v , rhs, AcceleratorRead);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
-    accelerator_for(i,ent,1,{
+    accelerator_for(i,ent,vobj::Nsimd(),{
-      buffer_p[table[i].first]=rhs_v[table[i].second];
+	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
  }
 }
@@ -185,8 +185,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
    autoView( rhs_v, rhs, AcceleratorWrite);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
-    accelerator_for(i,ent,1,{
+    accelerator_for(i,ent,vobj::Nsimd(),{
-	rhs_v[table[i].first]=buffer_p[table[i].second];
+	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
  }
 }
@@ -222,6 +222,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
    // Test_cshift_red_black code.
    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
    assert(0); // This will fail if hit on GPU
    autoView( rhs_v, rhs, CpuWrite);
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -282,8 +283,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
    auto table = &Cshift_table[0];
-    accelerator_for(i,ent,1,{
+    accelerator_for(i,ent,vobj::Nsimd(),{
-      lhs_v[table[i].first]=rhs_v[table[i].second];
+      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
  }
 }
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -37,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 //#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -42,9 +42,24 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // Predicated where support
 ////////////////////////////////////////////////////
 #ifdef GRID_SIMT
 // drop to scalar in SIMT; cleaner in fact
 template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+accelerator_inline vobj predicatedWhere(const iobj &predicate, 
-                            const robj &iffalse) {
+					const vobj &iftrue, 
 					const robj &iffalse) 
 {
  Integer mask = TensorRemove(predicate);
  typename std::remove_const<vobj>::type ret= iffalse;
  if (mask) ret=iftrue;
  return ret;
 }
 #else
 template <class iobj, class vobj, class robj>
 accelerator_inline vobj predicatedWhere(const iobj &predicate, 
 					const vobj &iftrue, 
 					const robj &iffalse) 
 {
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
@@ -68,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
  merge(ret, falsevals);
  return ret;
 }
 #endif
 /////////////////////////////////////////////////////
 //Specialization of getVectorType for lattices
@@ -81,33 +97,62 @@ struct getVectorType<Lattice<T> >{
 //--  recursive evaluation of expressions; --
 // handle leaves of syntax tree
 ///////////////////////////////////////////////////
-template<class sobj> accelerator_inline 
+template<class sobj,
  typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr> 
 accelerator_inline 
 sobj eval(const uint64_t ss, const sobj &arg)
 {
  return arg;
 }
 template <class lobj> accelerator_inline 
-const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) 
+auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
 {
  return arg(ss);
 }
 ////////////////////////////////////////////
 //--  recursive evaluation of expressions; --
 // whole vector return, used only for expression return type inference
 ///////////////////////////////////////////////////
 template<class sobj> accelerator_inline 
 sobj vecEval(const uint64_t ss, const sobj &arg)
 {
  return arg;
 }
 template <class lobj> accelerator_inline 
 const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg) 
 {
  return arg[ss];
 }
 // What needs this?
 // Cannot be legal on accelerator
 // Comparison must convert
 #if 1
 template <class lobj> accelerator_inline 
 const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 {
  assert(0);
  auto view = arg.View(AcceleratorRead);
  return view[ss];
 }
 #endif
 ///////////////////////////////////////////////////
 // handle nodes in syntax tree- eval one operand
 // vecEval needed (but never called as all expressions offloaded) to infer the return type
 // in SIMT contexts of closure.
 ///////////////////////////////////////////////////
 template <typename Op, typename T1> accelerator_inline 
 auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
  -> decltype(expr.op.func( vecEval(ss, expr.arg1)))
 {
  return expr.op.func( vecEval(ss, expr.arg1) );
 }
 // vecEval two operands
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
  -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
 {
  return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
 }
 // vecEval three operands
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
  -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
 {
  return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
 }
 ///////////////////////////////////////////////////
 // handle nodes in syntax tree- eval one operand coalesced
 ///////////////////////////////////////////////////
 template <typename Op, typename T1> accelerator_inline 
 auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
@@ -115,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
 {
  return expr.op.func( eval(ss, expr.arg1) );
 }
 ///////////////////////
 // eval two operands
 ///////////////////////
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
  -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
 {
  return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
 }
 ///////////////////////
 // eval three operands
 ///////////////////////
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
-  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
+  -> decltype(expr.op.func(eval(ss, expr.arg1), 
 			   eval(ss, expr.arg2), 
 			   eval(ss, expr.arg3)))
 {
-  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
+#ifdef GRID_SIMT
  // Handles Nsimd (vInteger) != Nsimd(ComplexD)
  typedef decltype(vecEval(ss, expr.arg2)) rvobj;
  typedef typename std::remove_reference<rvobj>::type vobj;
  const int Nsimd = vobj::vector_type::Nsimd();
  auto vpred = vecEval(ss,expr.arg1);
  ExtractBuffer<Integer> mask(Nsimd);
  extract<vInteger, Integer>(TensorRemove(vpred), mask);
  int s = acceleratorSIMTlane(Nsimd);
  return expr.op.func(mask[s],
 		      eval(ss, expr.arg2), 
 		      eval(ss, expr.arg3));
 #else
  return expr.op.func(eval(ss, expr.arg1),
 		      eval(ss, expr.arg2), 
 		      eval(ss, expr.arg3));
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@@ -229,7 +292,7 @@ template <typename Op, typename T1, typename T2> inline
 void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr) 
 {
  ExpressionViewOpen(expr.arg1);  // recurse AST
-  ExpressionViewOpen(expr.arg2);  // recurse AST
+  ExpressionViewOpen(expr.arg2);  // rrecurse AST
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
@@ -273,9 +336,8 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
 // Unary operators and funcs
 ////////////////////////////////////////////
 #define GridUnopClass(name, ret)					\
  template <class arg>							\
  struct name {								\
-    static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
+    template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
  };
 GridUnopClass(UnarySub, -a);
@@ -286,8 +348,6 @@ GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
 GridUnopClass(UnaryReal, real(a));
 GridUnopClass(UnaryImag, imag(a));
 GridUnopClass(UnaryToReal, toReal(a));
 GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
@@ -306,10 +366,10 @@ GridUnopClass(UnaryExp, exp(a));
 // Binary operators
 ////////////////////////////////////////////
 #define GridBinOpClass(name, combination)			\
  template <class left, class right>				\
  struct name {							\
    template <class _left, class _right>			\
    static auto accelerator_inline				\
-    func(const left &lhs, const right &rhs)			\
+    func(const _left &lhs, const _right &rhs)			\
      -> decltype(combination) const				\
    {								\
      return combination;					\
@@ -329,10 +389,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
 // Trinary conditional op
 ////////////////////////////////////////////////////
 #define GridTrinOpClass(name, combination)				\
  template <class predicate, class left, class right>			\
  struct name {								\
    template <class _predicate,class _left, class _right>		\
    static auto accelerator_inline					\
-    func(const predicate &pred, const left &lhs, const right &rhs)	\
+    func(const _predicate &pred, const _left &lhs, const _right &rhs)	\
      -> decltype(combination) const					\
    {									\
      return combination;						\
@@ -340,17 +400,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
  };
 GridTrinOpClass(TrinaryWhere,
-		(predicatedWhere<predicate, 
+		(predicatedWhere<
-		 typename std::remove_reference<left>::type,
+		 typename std::remove_reference<_predicate>::type, 
-		 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
+		 typename std::remove_reference<_left>::type,
 		 typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
-
+#define GRID_UNOP(name)   name
-#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
+#define GRID_BINOP(name)  name
-#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) name
 #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_DEF_UNOP(op, name)						\
  template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -402,8 +462,6 @@ GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
 GRID_DEF_UNOP(real, UnaryReal);
 GRID_DEF_UNOP(imag, UnaryImag);
 GRID_DEF_UNOP(toReal, UnaryToReal);
 GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
@@ -436,29 +494,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
  return ret;
 }
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
  return ret;
 }
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-				   eval(0, expr.arg2),
+				   vecEval(0, expr.arg2),
-				   eval(0, expr.arg3)))> 
+				   vecEval(0, expr.arg3)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-				eval(0, expr.arg2),
+				vecEval(0, expr.arg2),
-				eval(0, expr.arg3)))>  ret(expr);
+			        vecEval(0, expr.arg3)))>  ret(expr);
  return ret;
 }
 #define EXPRESSION_CLOSURE(function)					\
  template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
    auto function(Expression &expr) -> decltype(function(closure(expr))) \
  {									\
    return function(closure(expr));					\
  }
 #undef GRID_UNOP
 #undef GRID_BINOP
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -123,9 +123,9 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),1,{
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
-      vstream(me[ss],tmp);
+      coalescedWrite(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
@@ -146,9 +146,9 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),1,{
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
-      vstream(me[ss],tmp);
+      coalescedWrite(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
@@ -168,9 +168,9 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),1,{
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
-      vstream(me[ss],tmp);
+      coalescedWrite(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
--- a/Grid/lattice/Lattice_comparison.h
+++ b/Grid/lattice/Lattice_comparison.h
@@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
 typedef iScalar<vInteger> vPredicate ;
 /*
 template <class iobj, class vobj, class robj> accelerator_inline 
 vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) 
 {
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
  ExtractBuffer<Integer> mask(Nsimd);
  ExtractBuffer<scalar_object> truevals(Nsimd);
  ExtractBuffer<scalar_object> falsevals(Nsimd);
  extract(iftrue, truevals);
  extract(iffalse, falsevals);
  extract<vInteger, Integer>(TensorRemove(predicate), mask);
  for (int s = 0; s < Nsimd; s++) {
    if (mask[s]) falsevals[s] = truevals[s];
  }
  merge(ret, falsevals);
  return ret;
 }
 */
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to lattice
 //////////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -182,6 +182,14 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  return;
 };
 template<class vobj,class sobj>
 inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
 {
  autoView(lv,l,CpuRead);
  peekLocalSite(s,lv,site);
  return;
 };
 // Must be CPU write view
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
@@ -210,6 +218,14 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  return;
 };
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
 {
  autoView(lv,l,CpuWrite);
  pokeLocalSite(s,lv,site);
  return;
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_real_imag.h
+++ b/Grid/lattice/Lattice_real_imag.h
@@ -0,0 +1,79 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reality.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_REAL_IMAG_H
 #define GRID_LATTICE_REAL_IMAG_H
 // FIXME .. this is the sector of the code 
 // I am most worried about the directions
 // The choice of burying complex in the SIMD
 // is making the use of "real" and "imag" very cumbersome
 NAMESPACE_BEGIN(Grid);
 template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
  autoView( lhs_v, lhs, AcceleratorRead);
  autoView( ret_v, ret, AcceleratorWrite);
  ret.Checkerboard()=lhs.Checkerboard();
  accelerator_for( ss, lhs_v.size(), 1, {
    ret_v[ss] =real(lhs_v[ss]);
  });
  return ret;
 };
 template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
  autoView( lhs_v, lhs, AcceleratorRead);
  autoView( ret_v, ret, AcceleratorWrite);
  ret.Checkerboard()=lhs.Checkerboard();
  accelerator_for( ss, lhs_v.size(), 1, {
    ret_v[ss] =imag(lhs_v[ss]);
  });
  return ret;
 };
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
  auto real(const Expression &expr) -> decltype(real(closure(expr)))		
 {									
  return real(closure(expr));					
 }
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
  auto imag(const Expression &expr) -> decltype(imag(closure(expr)))		
 {									
  return imag(closure(expr));					
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -208,7 +208,7 @@ public:
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -215,7 +215,7 @@ public:
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 };
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,7 +59,7 @@ public:
  }
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(closure(adj(Link)), mu, -1);
+    return Cshift(adj(Link), mu, -1);
  }
  static inline GaugeLinkField
  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -53,23 +53,21 @@ namespace PeriodicBC {
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-  template<class gauge,typename Op, typename T1> auto
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
-    CovShiftForward(const Lattice<gauge> &Link, 
+    auto  CovShiftForward(const Lattice<gauge> &Link, 
 			  int mu,
-		    const LatticeUnaryExpression<Op,T1> &expr)
+			  const Expr &expr) -> decltype(closure(expr))
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
  {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    auto arg = closure(expr);
    return CovShiftForward(Link,mu,arg);
  }
-  template<class gauge,typename Op, typename T1> auto
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
-    CovShiftBackward(const Lattice<gauge> &Link, 
+    auto  CovShiftBackward(const Lattice<gauge> &Link, 
 			   int mu,
-		     const LatticeUnaryExpression<Op,T1> &expr)
+			   const Expr &expr) -> decltype(closure(expr))
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
  {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    auto arg = closure(expr);
-    return CovShiftForward(Link,mu,arg);
+    return CovShiftBackward(Link,mu,arg);
  }
 }
@@ -142,26 +140,23 @@ namespace ConjugateBC {
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
-  template<class gauge,typename Op, typename T1> auto
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
-    CovShiftForward(const Lattice<gauge> &Link, 
+    auto  CovShiftForward(const Lattice<gauge> &Link, 
 			  int mu,
-		    const LatticeUnaryExpression<Op,T1> &expr)
+			  const Expr &expr) -> decltype(closure(expr))
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
  {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    auto arg = closure(expr);
    return CovShiftForward(Link,mu,arg);
  }
-  template<class gauge,typename Op, typename T1> auto
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
-    CovShiftBackward(const Lattice<gauge> &Link, 
+    auto  CovShiftBackward(const Lattice<gauge> &Link, 
 			   int mu,
-		     const LatticeUnaryExpression<Op,T1> &expr)
+			   const Expr &expr)  -> decltype(closure(expr))
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
  {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    auto arg = closure(expr);
-    return CovShiftForward(Link,mu,arg);
+    return CovShiftBackward(Link,mu,arg);
  }
 }
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@@ -47,14 +47,9 @@ public:
  typedef Lattice<vAMatrixF> LatticeAdjMatrixF;
  typedef Lattice<vAMatrixD> LatticeAdjMatrixD;
-  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
+  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >  LatticeAdjField;
-  LatticeAdjField;
+  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
-  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
+  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;
  LatticeAdjFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
  LatticeAdjFieldD;
  template <class cplx>
@@ -128,7 +123,9 @@ public:
  }
  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
-  static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
+  static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) 
  {
    conformable(h_out, in);
    h_out = Zero();
    AMatrix iTa;
@@ -136,7 +133,7 @@ public:
    for (int a = 0; a < Dimension; a++) {
      generator(a, iTa);
-      auto tmp = real(trace(iTa * in)) * coefficient;
+      LatticeComplex tmp = real(trace(iTa * in)) * coefficient;
      pokeColour(h_out, tmp, a);
    }
  }
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -485,7 +485,7 @@ public:
        // Up staple    ___ ___
        //             |       |
-        tmp = Cshift(closure(adj(U[nu])), nu, -1);
+        tmp = Cshift(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
        tmp = Cshift(tmp, mu, -2);
@@ -519,7 +519,7 @@ public:
        //
        //      |  |
-        tmp = Cshift(closure(adj(U2[nu])), nu, -2);
+        tmp = Cshift(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
        tmp = U2[nu] * Cshift(tmp, nu, 2);
        Stap += Cshift(tmp, mu, 1);
--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -93,6 +93,11 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(std::pow(r,y)
 using std::abs;
 using std::pow;
 using std::sqrt;
 using std::log;
 using std::exp;
 using std::sin;
 using std::cos;
 accelerator_inline RealF    conjugate(const RealF  & r){ return r; }
 accelerator_inline RealD    conjugate(const RealD  & r){ return r; }
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -190,6 +190,36 @@ NAMESPACE_BEGIN(Grid);
    typedef ComplexD DoublePrecision;
    typedef ComplexD DoublePrecision2;
  };
 #ifdef GRID_CUDA
  template<> struct GridTypeMapper<std::complex<float> > : public GridTypeMapper_Base {
    typedef std::complex<float> scalar_type;
    typedef std::complex<double> scalar_typeD;
    typedef scalar_type vector_type;
    typedef scalar_typeD vector_typeD;
    typedef scalar_type tensor_reduced;
    typedef scalar_type scalar_object;
    typedef scalar_typeD scalar_objectD;
    typedef scalar_type Complexified;
    typedef RealF Realified;
    typedef scalar_typeD DoublePrecision;
    typedef scalar_typeD DoublePrecision2;
  };
  template<> struct GridTypeMapper<std::complex<double> > : public GridTypeMapper_Base {
    typedef std::complex<double> scalar_type;
    typedef std::complex<double> scalar_typeD;
    typedef scalar_type vector_type;
    typedef scalar_typeD vector_typeD;
    typedef scalar_type tensor_reduced;
    typedef scalar_type scalar_object;
    typedef scalar_typeD scalar_objectD;
    typedef scalar_type Complexified;
    typedef RealD Realified;
    typedef scalar_typeD DoublePrecision;
    typedef scalar_typeD DoublePrecision2;
  };
 #endif
  template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base {
    typedef ComplexD2 scalar_type;
    typedef ComplexD2 scalar_typeD;
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
 //
 // Memory management:
 //
 //    int   acceleratorIsCommunicable(void *pointer);
 //    void *acceleratorAllocShared(size_t bytes);
 //    void acceleratorFreeShared(void *ptr);
 //
@@ -90,6 +91,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 #include <cuda.h>
 #ifdef __CUDA_ARCH__
 #define GRID_SIMT
@@ -165,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  int uvm;
  auto 
  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
  assert(cuerr == cudaSuccess );
  if(uvm) return 0;
  else    return 1;
 }
 #endif
 //////////////////////////////////////////////
@@ -219,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline int  acceleratorIsCommunicable(void *ptr)
 {
 #if 0
  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
  else return 0;
 #endif
  return 1;
 }
 #endif
@@ -298,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  return malloc(bytes);
 #endif
 };
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void *acceleratorAllocDevice(size_t bytes)
 {
@@ -352,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_nd
 {
  os << "[";
  for(int s=0;s<v.size();s++) {
-    os << v[s] << " ";
+    os << v[s];
    if( s < (v.size()-1) ){
      os << " ";
    }
  if (v.size() > 0) {
    os << "\b";
  }
  os << "]";
  return os;
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -74,90 +74,6 @@ int main (int argc, char ** argv)
  std::vector<double> t_time(Nloop);
  time_statistics timestat;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
  for(int lat=8;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=8;Ls*=2){
      Coordinate latt_size  ({lat*mpi_layout[0],
 	                      lat*mpi_layout[1],
      			      lat*mpi_layout[2],
      			      lat*mpi_layout[3]});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int mu=0;mu<8;mu++){
 	xbuf[mu].resize(lat*lat*lat*Ls);
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
      for(int i=0;i<Nloop;i++){
      double start=usecond();
 	std::vector<CommsRequest_t> requests;
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	  if (mpi_layout[mu]>1 ) {
 	    ncomm++;
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				   (void *)&xbuf[mu][0],
 				   xmit_to_rank,
 				   (void *)&rbuf[mu][0],
 				   recv_from_rank,
 				   bytes);
 	    comm_proc = mpi_layout[mu]-1;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				     (void *)&xbuf[mu+4][0],
 				     xmit_to_rank,
 				     (void *)&rbuf[mu+4][0],
 				     recv_from_rank,
 				     bytes);
 	  }
 	}
 	Grid.SendToRecvFromComplete(requests);
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
      }
      timestat.statistics(t_time);
      double dbytes    = bytes*ppn;
      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;
      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
    }
  }    
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -206,26 +122,22 @@ int main (int argc, char ** argv)
 	    {
 	      std::vector<CommsRequest_t> requests;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFromBegin(requests,
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
 				       (void *)&xbuf[mu][0],
 				  xmit_to_rank,
 				  (void *)&rbuf[mu][0],
 				  recv_from_rank,
 				  bytes);
 	      Grid.SendToRecvFromComplete(requests);
 	    }
 	    comm_proc = mpi_layout[mu]-1;
 	    {
 	      std::vector<CommsRequest_t> requests;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFromBegin(requests,
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
 				       (void *)&xbuf[mu+4][0],
 				  xmit_to_rank,
 				  (void *)&rbuf[mu+4][0],
 				  recv_from_rank,
 				  bytes);
 	      Grid.SendToRecvFromComplete(requests);
 	    }
 	  }
 	}
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -89,6 +89,7 @@ int main (int argc, char ** argv)
 	  std::cout << GridLogMessage;
 	  std::cout << latt_size;
 	  std::cout << "\t\t";
 	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
 	  GridRedBlackCartesian RBGrid(&Grid);
--- a/configure.ac
+++ b/configure.ac
@@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator],
 case ${ac_ACCELERATOR} in
    cuda)
      echo CUDA acceleration
      LIBS="${LIBS} -lcuda"
      AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
    sycl)
      echo SYCL acceleration
@@ -323,7 +324,6 @@ case ${CXXTEST} in
 #    CXXLD="nvcc -v -link"
    CXX="${CXXBASE} -x cu "
    CXXLD="${CXXBASE} -link"
 #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
    if test $ac_openmp = yes; then
       CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
@@ -483,8 +483,7 @@ case ${ac_SHM} in
     LDFLAGS_CPY=$LDFLAGS
     CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
     LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-     AC_SEARCH_LIBS([shm_unlink], [rt], [],
+     AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")])
                    [AC_MSG_ERROR("no library found for shm_unlink")])
     CXXFLAGS=$CXXFLAGS_CPY
     LDFLAGS=$LDFLAGS_CPY
     ;;
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -47,14 +47,14 @@ public:
                          bool , b,
                          std::vector<double>, array,
                          std::vector<std::vector<double> >, twodimarray,
-			  std::vector<std::vector<std::vector<Complex> > >, cmplx3darray,
+			  std::vector<std::vector<std::vector<std::complex<double>> > >, cmplx3darray,
 			  SpinColourMatrix, scm
                          );
  myclass() {}
  myclass(int i)
  : array(4,5.1)
  , twodimarray(3,std::vector<double>(5, 1.23456))
-  , cmplx3darray(3,std::vector<std::vector<Complex>>(5, std::vector<Complex>(7, Complex(1.2, 3.4))))
+  , cmplx3darray(3,std::vector<std::vector<std::complex<double>>>(5, std::vector<std::complex<double>>(7, std::complex<double>(1.2, 3.4))))
  , ve(2, myenum::blue)
  {
    e=myenum::red;
@@ -121,11 +121,11 @@ namespace Eigen {
 // Perform I/O tests on a range of tensor types
 // Test coverage: scalars, complex and GridVectors in single, double and default precision
 class TensorIO : public Serializable {
-  using TestScalar = ComplexD;
+  using TestScalar = std::complex<double>;
  using SR3 = Eigen::Sizes<9,4,2>;
  using SR5 = Eigen::Sizes<5,4,3,2,1>;
  using ESO = Eigen::StorageOptions;
-  using TensorRank3  = Eigen::Tensor<ComplexF, 3, ESO::RowMajor>;
+  using TensorRank3  = Eigen::Tensor<std::complex<float>, 3, ESO::RowMajor>;
  using TensorR5     = Eigen::TensorFixedSize<Real, SR5>;
  using TensorR5Alt  = Eigen::TensorFixedSize<Real, SR5, ESO::RowMajor>;
  using Tensor942    = Eigen::TensorFixedSize<TestScalar, SR3, ESO::RowMajor>;
@@ -134,8 +134,8 @@ class TensorIO : public Serializable {
  using LSCTensor    = Eigen::TensorFixedSize<SpinColourMatrix, Eigen::Sizes<6,5>>;
  static const Real       FlagR;
-  static const Complex    Flag;
+  static const std::complex<double>   Flag;
-  static const ComplexF   FlagF;
+  static const std::complex<float>    FlagF;
  static const TestScalar FlagTS;
  static const char * const pszFilePrefix;
@@ -230,8 +230,8 @@ public:
 };
 const Real                 TensorIO::FlagR {1};
-const Complex              TensorIO::Flag  {1,-1};
+const std::complex<double> TensorIO::Flag  {1,-1};
-const ComplexF             TensorIO::FlagF {1,-1};
+const std::complex<float>  TensorIO::FlagF {1,-1};
 const TensorIO::TestScalar TensorIO::FlagTS{1,-1};
 const char * const         TensorIO::pszFilePrefix = "tensor_";
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -137,7 +137,6 @@ int main(int argc, char **argv) {
      LatticeReal iscalar(&Fine);
      SpinMatrix GammaFive;
      iSpinMatrix<vComplex> iGammaFive;
      ColourMatrix cmat;
      random(FineRNG, Foo);
@@ -283,7 +282,6 @@ int main(int argc, char **argv) {
      cMat = mydouble * cMat;
      sMat = adj(sMat);          // LatticeSpinMatrix adjoint
      sMat = iGammaFive * sMat;  // SpinMatrix * LatticeSpinMatrix
      sMat = GammaFive * sMat;   // SpinMatrix * LatticeSpinMatrix
      scMat = adj(scMat);
      cMat = adj(cMat);
--- a/tests/core/Test_staggered_naive.cc
+++ b/tests/core/Test_staggered_naive.cc
@@ -261,11 +261,11 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,phi_o,phi);
  SchurDiagMooeeOperator<NaiveStaggeredFermionR,FermionField> HermOpEO(Ds);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
  pDce = innerProduct(phi_e,dchi_e);
  pDco = innerProduct(phi_o,dchi_o);
--- a/tests/core/Test_where.cc
+++ b/tests/core/Test_where.cc
@@ -0,0 +1,80 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_poisson_fft.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 ;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  int N=16;
  std::vector<int> latt_size  ({N,4,4});
  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1});
  std::vector<int> mpi_layout ({1,1,1});
  int vol = 1;
  int nd  = latt_size.size();
  for(int d=0;d<nd;d++){
    vol = vol * latt_size[d];
  }
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
  LatticeComplexD      zz(&GRID);
  LatticeInteger     coor(&GRID);
  LatticeComplexD  rn(&GRID);
  LatticeComplexD  sl(&GRID);
  zz  = ComplexD(0.0,0.0);
  GridParallelRNG RNG(&GRID);
  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));  
  gaussian(RNG,rn);
  RealD nn=norm2(rn);
  for(int mu=0;mu<nd;mu++){
    RealD ns=0.0;
    for(int t=0;t<latt_size[mu];t++){
      LatticeCoordinate(coor,mu);
      sl=where(coor==Integer(t),rn,zz);
      std::cout <<GridLogMessage<< " sl " << sl<<std::endl;
      std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
      ns=ns+norm2(sl);
    }
    std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" " << nn-ns<<std::endl;
  }
  Grid_finalize();
 }
--- a/tests/debug/Test_cayley_coarsen_support.cc
+++ b/tests/debug/Test_cayley_coarsen_support.cc
@@ -122,7 +122,7 @@ int main (int argc, char ** argv)
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FGrid,cb);
-  Aggregates.CreateSubspaceRandom(RNG5);
+  //  Aggregates.CreateSubspaceRandom(RNG5);
  subspace=Aggregates.subspace;
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
  LittleDiracOp.M(c_src,c_res);
  std::cout<<GridLogMessage<<"Testing hermiticity explicitly by inspecting matrix elements"<<std::endl;
-  LittleDiracOp.AssertHermitian();
+  //  LittleDiracOp.AssertHermitian();
  std::cout<<GridLogMessage << "Testing Hermiticity stochastically "<< std::endl;
  CoarseVector phi(Coarse5d);
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
@@ -311,7 +311,7 @@ int main (int argc, char ** argv) {
  std::cout << GridLogMessage << "Keep " << coarse.Nk << " total vectors" << std::endl;
  assert(Nm2 >= Nm1);
-  const int nbasis= 70;
+  const int nbasis= 32;
  CoarseFineIRL<vSpinColourVector,vTComplex,nbasis> IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd);
  std::cout << GridLogMessage << "Constructed CoarseFine IRL" << std::endl;
--- a/tests/solver/Test_dwf_hdcr_2level.cc
+++ b/tests/solver/Test_dwf_hdcr_2level.cc
@@ -0,0 +1,420 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_dwf_hdcr.cc
    Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 using namespace std;
 using namespace Grid;
 /* Params
 * Grid: 
 * block1(4)
 * block2(4)
 * 
 * Subspace
 * * Fine  : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100
 * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100
 * Smoother:
 * * Fine: Cheby(hi, lo, order)            --  60,0.5,10
 * * Coarse: Cheby(hi, lo, order)          --  12,0.1,4
 * Lanczos:
 * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order))   24,36,24,0.002,4.0,61 
 */
 RealD InverseApproximation(RealD x){
  return 1.0/x;
 }
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
  Chebyshev<Field> Cheby;
  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) :
    _SmootherOperator(SmootherOperator),
    _SmootherMatrix(SmootherMatrix),
    Cheby(_lo,_hi,_ord,InverseApproximation)
  {};
  void operator() (const Field &in, Field &out) 
  {
    Field tmp(in.Grid());
    MdagMLinearOperator<Matrix,Field>   MdagMOp(_SmootherMatrix); 
    _SmootherOperator.AdjOp(in,tmp);
    Cheby(MdagMOp,tmp,out);         
  }
 };
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
  RealD tol;
  RealD shift;
  int   maxit;
  MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) :
    shift(_shift),tol(_tol),maxit(_maxit),
    SmootherOperator(_SmootherOperator),
    SmootherMatrix(_SmootherMatrix)
  {};
  void operator() (const Field &in, Field &out) 
  {
    ZeroGuesser<Field> Guess;
    ConjugateGradient<Field>  CG(tol,maxit,false);
    Field src(in.Grid());
    ShiftedMdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(SmootherMatrix,shift);
    SmootherOperator.AdjOp(in,src);
    Guess(src,out);
    CG(MdagMOp,src,out); 
  }
 };
 template<class Field,class Matrix> class RedBlackSmoother : public LinearFunction<Field>
 {
 public:
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  RealD tol;
  RealD shift;
  int   maxit;
  RedBlackSmoother(RealD _shift,RealD _tol,int _maxit,Matrix &_SmootherMatrix) :
    shift(_shift),tol(_tol),maxit(_maxit),
    SmootherMatrix(_SmootherMatrix)
  {};
  void operator() (const Field &in, Field &out) 
  {
    std::cout << " Red Black Smootheer "<<norm2(in)<<" " <<norm2(out)<<std::endl;
    ConjugateGradient<Field>  CG(tol,maxit,false);
    out =Zero();
    SchurRedBlackDiagMooeeSolve<Field> RBSolver(CG);
    RBSolver(SmootherMatrix,in,out); 
    std::cout << " Red Black Smootheer "<<norm2(in)<<" " <<norm2(out)<<std::endl;
  }
 };
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
  typedef LinearOperatorBase<FineField>                            FineOperator;
  typedef LinearFunction    <FineField>                            FineSmoother;
  Aggregates     & _Aggregates;
  CoarseOperator & _CoarseOperator;
  Matrix         & _FineMatrix;
  FineOperator   & _FineOperator;
  Guesser        & _Guess;
  FineSmoother   & _Smoother1;
  FineSmoother   & _Smoother2;
  CoarseSolver   & _CoarseSolve;
  int    level;  void Level(int lv) {level = lv; };
 #define GridLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level <<" "
  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
 			  FineOperator &Fine,Matrix &FineMatrix,
 			  FineSmoother &Smoother1,
 			  FineSmoother &Smoother2,
 			  Guesser &Guess_,
 			  CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _CoarseOperator(Coarse),
      _FineOperator(Fine),
      _FineMatrix(FineMatrix),
      _Smoother1(Smoother1),
      _Smoother2(Smoother2),
      _Guess(Guess_),
      _CoarseSolve(CoarseSolve_),
      level(1)  {  }
  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
 			  FineOperator &Fine,Matrix &FineMatrix,
 			  FineSmoother &Smoother,
 			  Guesser &Guess_,
 			  CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _CoarseOperator(Coarse),
      _FineOperator(Fine),
      _FineMatrix(FineMatrix),
      _Smoother1(Smoother),
      _Smoother2(Smoother),
      _Guess(Guess_),
      _CoarseSolve(CoarseSolve_),
      level(1)  {  }
  virtual void operator()(const FineField &in, FineField & out) 
  {
    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Csol(_CoarseOperator.Grid()); 
    FineField vec1(in.Grid());
    FineField vec2(in.Grid());
    double t;
    // Fine Smoother
    t=-usecond();
    _Smoother1(in,out);
    t+=usecond();
    GridLogLevel << "Smoother took "<< t/1000.0<< "ms" <<std::endl;
    // Update the residual
    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   
    // Fine to Coarse 
    t=-usecond();
    _Aggregates.ProjectToSubspace  (Csrc,vec1);
    t+=usecond();
    GridLogLevel << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse correction
    t=-usecond();
    _CoarseSolve(Csrc,Csol);
    t+=usecond();
    GridLogLevel << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse to Fine
    t=-usecond();
    _Aggregates.PromoteFromSubspace(Csol,vec1); 
    add(out,out,vec1);
    t+=usecond();
    GridLogLevel << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
    // Residual
    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  
    // Fine Smoother
    t=-usecond();
    _Smoother2(vec1,vec2);
    t+=usecond();
    GridLogLevel << "Smoother took "<< t/1000.0<< "ms" <<std::endl;
    add( out,out,vec2);
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int Ls=16;
  const int rLs=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  ///////////////////////////////////////////////////
  // Construct a coarsened grid; utility for this?
  ///////////////////////////////////////////////////
  std::vector<int> block ({2,2,2,2});
  const int nbasis= 32;
  auto clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/block[d];
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  LatticeFermion    src(FGrid); gaussian(RNG5,src);// src=src+g5*src;
  LatticeFermion result(FGrid); 
  LatticeGaugeField Umu(UGrid); 
  FieldMetaData header;
  std::string file("./ckpoint_lat");
  NerscIO::readConfiguration(Umu,header,file);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  RealD mass=0.001;
  RealD M5=1.8;
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis>              Subspace;
  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>          CoarseOperator;
  typedef CoarseOperator::CoarseVector                                 CoarseVector;
  typedef CoarseOperator::siteVector siteVector;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
  Subspace Aggregates(Coarse5d,FGrid,0);
  assert ( (nbasis & 0x1)==0);
  {
    int nb=nbasis/2;
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,100,0.0);// 18s
    //   rAggregates.CreateSubspaceChebyshev(RNG5,rHermDefOp,nb,60.0,0.05,500,200,150,0.0);// 15.7 23iter
    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);//
    // pad out the rAggregates.
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,150,0.0);// 19s 
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,200,0.0); 15.2s
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,200,0.0); 16.3s
    for(int n=0;n<nb;n++){
      G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
    }
    LatticeFermion A(FGrid);
    LatticeFermion B(FGrid);
    for(int n=0;n<nb;n++){
      A = Aggregates.subspace[n];
      B = Aggregates.subspace[n+nb];
      Aggregates.subspace[n]   = A+B; // 1+G5 // eigen value of G5R5 is +1
      Aggregates.subspace[n+nb]= A-B; // 1-G5 // eigen value of G5R5 is -1
    }
  }
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building coarse representation of Indef operator" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>    Level1Op;
  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOp(Ddwf);
  Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << " Running Coarse grid Lanczos "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  MdagMLinearOperator<Level1Op,CoarseVector> IRLHermOp(LDOp);
  Chebyshev<CoarseVector> IRLCheby(0.002,12.,151);
  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,IRLHermOp);
  PlainHermOp<CoarseVector> IRLOp    (IRLHermOp);
  int Nk=48;
  int Nm=64;
  int Nstop=48;
  int Nconv;
  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20);
  std::vector<RealD>          eval(Nm);
  std::vector<CoarseVector>   evec(Nm,Coarse5d);
  CoarseVector c_src(Coarse5d);
  gaussian(CRNG,c_src);
  IRL.calc(eval,evec,c_src,Nconv);
  //  ConjugateGradient<CoarseVector>  CoarseCG(0.01,1000);
  ConjugateGradient<CoarseVector>  CoarseCG(0.02,1000);// 14.7s
  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
  NormalEquations<CoarseVector> DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser);
  c_src=1.0;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << " Fine        PowerMethod           "<< std::endl;
  PowerMethod<LatticeFermion>       PM;   PM(HermDefOp,src);
  std::cout<<GridLogMessage << " Coarse       PowerMethod           "<< std::endl;
  MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
  PowerMethod<CoarseVector>        cPM;  cPM(PosdefLdop,c_src);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building 2 level Multigrid            "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  typedef MultiGridPreconditioner<vSpinColourVector,  vTComplex,nbasis, DomainWallFermionR,DeflatedGuesser<CoarseVector> , NormalEquations<CoarseVector> >   TwoLevelMG;
  // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 72 iter 63s
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.1,60.0,20,HermIndefOp,Ddwf); // 66 iter 69s
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,20,HermIndefOp,Ddwf); // 63 iter 65  s
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(1.0,60.0,20,HermIndefOp,Ddwf); // 69, 70
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(1.0,60.0,14,HermIndefOp,Ddwf); // 77
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // 23 iter 15.9s
  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 20, 16.9s
  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); // 21, 15.6s
  //  MirsSmoother<LatticeFermion,DomainWallFermionR> FineCGSmoother(0.05,0.01,20,HermIndefOp,Ddwf);
  //  RedBlackSmoother<LatticeFermion,DomainWallFermionR> FineRBSmoother(0.00,0.001,100,Ddwf);
  // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space
  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
  TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,
 			    HermIndefOp,Ddwf,
 			    FineSmoother,
 			    DeflCoarseGuesser,
 			    DeflCoarseCGNE);
  TwoLevelPrecon.Level(1);
  // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid
  PrecGeneralisedConjugateResidual<LatticeFermion> l1PGCR(1.0e-8,1000,HermIndefOp,TwoLevelPrecon,16,16);
  l1PGCR.Level(1);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Calling 2 level Multigrid            "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  result=Zero();
  l1PGCR(src,result);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Fine CG prec DiagMooee "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  ConjugateGradient<LatticeFermion>          FineCG(1.0e-8,10000);
  SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> FineDiagMooee(Ddwf); //  M_ee - Meo Moo^-1 Moe 
  LatticeFermion f_src_e(FrbGrid); f_src_e=1.0;
  LatticeFermion f_res_e(FrbGrid); f_res_e=Zero();
  FineCG(FineDiagMooee,f_src_e,f_res_e);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Grid_finalize();
 }