From 47b89d2739facbea82f96a52c836173dfc37f35a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 26 Aug 2020 17:04:27 -0400
Subject: [PATCH 01/32] Pragma protection improvementt

---
 Grid/Grid_Eigen_Dense.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h
index 9556c03d..e74d3894 100644
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -42,7 +42,7 @@
 #ifdef __NVCC__REDEFINE__
 #pragma pop_macro("__CUDACC__")
 #pragma pop_macro("__NVCC__")
-#pragma pop_macro("GRID_SIMT")
+#pragma pop_macro("__CUDA_ARCH__")
 #pragma pop
 #endif
 

From 3448b7387c80d9789ce0c18e2e94f1f2d70bdd9e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 26 Aug 2020 17:04:49 -0400
Subject: [PATCH 02/32] Almost there to coalesced ET

---
 Grid/lattice/Lattice_ET.h   | 102 +++++++++++++++++++++++++++++++-----
 Grid/lattice/Lattice_base.h |  15 +++---
 2 files changed, 98 insertions(+), 19 deletions(-)

diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
index 4407be04..24ec812c 100644
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -42,9 +42,25 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // Predicated where support
 ////////////////////////////////////////////////////
+#ifdef GRID_SIMT
 template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
-                            const robj &iffalse) {
+accelerator_inline vobj predicatedWhere(const iobj &predicate, 
+					const vobj &iftrue, 
+					const robj &iffalse) 
+{
+  // should drop to sccalar in SIMT
+  //  typename std::remove_const<vobj>::type ret;
+  //  Integer mask = TensorRemove(predicate);
+  //  ret = iffalse;
+  //  if (TensorRemove(mask)) ret=iftrue;
+  return iftrue;
+}
+#else
+template <class iobj, class vobj, class robj>
+accelerator_inline vobj predicatedWhere(const iobj &predicate, 
+					const vobj &iftrue, 
+					const robj &iffalse) 
+{
   typename std::remove_const<vobj>::type ret;
 
   typedef typename vobj::scalar_object scalar_object;
@@ -68,6 +84,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
   merge(ret, falsevals);
   return ret;
 }
+#endif
 
 /////////////////////////////////////////////////////
 //Specialization of getVectorType for lattices
@@ -86,13 +103,45 @@ sobj eval(const uint64_t ss, const sobj &arg)
 {
   return arg;
 }
-
 template <class lobj> accelerator_inline 
 const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) 
 {
   return arg[ss];
 }
 
+template<class sobj,
+  typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr> 
+accelerator_inline 
+sobj coalescedEval(const uint64_t ss, const sobj &arg)
+{
+  return arg;
+}
+#ifdef GRID_SIMT
+#warning device
+template <class lobj> accelerator_inline 
+typename lobj::scalar_object coalescedEval(const uint64_t ss, const LatticeView<lobj> &arg) 
+{
+  auto ret = arg(ss);
+  return ret;
+}
+#else
+#warning host
+template <class lobj> accelerator_inline 
+lobj coalescedEval(const uint64_t ss, const LatticeView<lobj> &arg) 
+{
+  //   return coalescedRead(arg[ss]);
+  return arg[ss];
+}
+#endif
+/*
+template <class lobj> accelerator_inline 
+typename lobj::scalar_type coalescedEval(const uint64_t ss, const Lattice<lobj> &arg) 
+{
+  assert(0);
+  return coalescedRead(arg[ss]);
+}
+*/
+
 // What needs this?
 // Cannot be legal on accelerator
 // Comparison must convert
@@ -115,18 +164,14 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
 {
   return expr.op.func( eval(ss, expr.arg1) );
 }
-///////////////////////
 // eval two operands
-///////////////////////
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
   -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
 {
   return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
 }
-///////////////////////
 // eval three operands
-///////////////////////
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
   -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
@@ -134,6 +179,34 @@ auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &exp
   return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
 }
 
+///////////////////////////////////////////////////
+// handle nodes in syntax tree- eval one operand coalesced
+///////////////////////////////////////////////////
+template <typename Op, typename T1> accelerator_inline 
+auto coalescedEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
+  -> decltype(expr.op.func( coalescedEval(ss, expr.arg1)))
+{
+  return expr.op.func( coalescedEval(ss, expr.arg1) );
+}
+// eval two operands
+template <typename Op, typename T1, typename T2> accelerator_inline
+auto coalescedEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
+  -> decltype(expr.op.func( coalescedEval(ss,expr.arg1),coalescedEval(ss,expr.arg2)))
+{
+  return expr.op.func( coalescedEval(ss,expr.arg1), coalescedEval(ss,expr.arg2) );
+}
+// eval three operands
+template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
+auto coalescedEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
+  -> decltype(expr.op.func(coalescedEval(ss, expr.arg1), 
+			   coalescedEval(ss, expr.arg2), 
+			   coalescedEval(ss, expr.arg3)))
+{
+  return expr.op.func(coalescedEval(ss, expr.arg1), 
+		      coalescedEval(ss, expr.arg2), 
+		      coalescedEval(ss, expr.arg3));
+}
+
 //////////////////////////////////////////////////////////////////////////
 // Obtain the grid from an expression, ensuring conformable. This must follow a
 // tree recursion; must retain grid pointer in the LatticeView class which sucks
@@ -275,7 +348,7 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
 #define GridUnopClass(name, ret)					\
   template <class arg>							\
   struct name {								\
-    static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
+    template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
   };
 
 GridUnopClass(UnarySub, -a);
@@ -308,8 +381,9 @@ GridUnopClass(UnaryExp, exp(a));
 #define GridBinOpClass(name, combination)			\
   template <class left, class right>				\
   struct name {							\
+    template <class _left, class _right>			\
     static auto accelerator_inline				\
-    func(const left &lhs, const right &rhs)			\
+    func(const _left &lhs, const _right &rhs)			\
       -> decltype(combination) const				\
     {								\
       return combination;					\
@@ -331,8 +405,9 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
 #define GridTrinOpClass(name, combination)				\
   template <class predicate, class left, class right>			\
   struct name {								\
+    template <class _predicate,class _left, class _right>		\
     static auto accelerator_inline					\
-    func(const predicate &pred, const left &lhs, const right &rhs)	\
+    func(const _predicate &pred, const _left &lhs, const _right &rhs)	\
       -> decltype(combination) const					\
     {									\
       return combination;						\
@@ -340,9 +415,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
   };
 
 GridTrinOpClass(TrinaryWhere,
-		(predicatedWhere<predicate, 
-		 typename std::remove_reference<left>::type,
-		 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
+		(predicatedWhere<
+		 typename std::remove_reference<_predicate>::type, 
+		 typename std::remove_reference<_left>::type,
+		 typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
 
 ////////////////////////////////////////////
 // Operator syntactical glue
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 73b1b6a1..2d972970 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -124,8 +124,9 @@ public:
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,exprCopy);
-      vstream(me[ss],tmp);
+      auto tmp = coalescedEval(ss,exprCopy);
+      coalescedWrite(me[ss],tmp);
+      //      me[ss]=tmp;
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);
@@ -147,8 +148,9 @@ public:
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,exprCopy);
-      vstream(me[ss],tmp);
+      auto tmp = coalescedEval(ss,exprCopy);
+      coalescedWrite(me[ss],tmp);
+      //me[ss]=tmp;
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);
@@ -169,8 +171,9 @@ public:
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
     accelerator_for(ss,me.size(),1,{
-      auto tmp = eval(ss,exprCopy);
-      vstream(me[ss],tmp);
+      auto tmp = coalescedEval(ss,exprCopy);
+      coalescedWrite(me[ss],tmp);
+      //      me[ss]=tmp;
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);

From 65920faebae56dd8e28d43442fe4010c2f491de0 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 31 Aug 2020 18:39:27 +0200
Subject: [PATCH 03/32] correct formatting of Benchmark_wilson_sweep output

---
 benchmarks/Benchmark_wilson_sweep.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index f1473b44..100e09f5 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -89,6 +89,7 @@ int main (int argc, char ** argv)
 
 	  std::cout << GridLogMessage;
 	  std::cout << latt_size;
+    std::cout << "\t\t";
 
 	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
 	  GridRedBlackCartesian RBGrid(&Grid);

From a9b92867a817deed5c041a268d97c54d0674e56a Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 31 Aug 2020 18:41:17 +0200
Subject: [PATCH 04/32] use tabulator

---
 benchmarks/Benchmark_wilson_sweep.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index 100e09f5..986a1831 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -89,7 +89,7 @@ int main (int argc, char ** argv)
 
 	  std::cout << GridLogMessage;
 	  std::cout << latt_size;
-    std::cout << "\t\t";
+	  std::cout << "\t\t";
 
 	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
 	  GridRedBlackCartesian RBGrid(&Grid);

From 54523369a3096750dac18f2150a23d6d98709030 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 31 Aug 2020 19:39:36 +0200
Subject: [PATCH 05/32] do not use backspace in Coordinate output stream
 operator

---
 Grid/util/Coordinate.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h
index 7f1d31c0..004fbc72 100644
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_nd
 {
   os << "[";
   for(int s=0;s<v.size();s++) {
-    os << v[s] << " ";
-  }
-  if (v.size() > 0) {
-    os << "\b";
+    os << v[s];
+    if( s < (v.size()-1) ){
+      os << " ";
+    }
   }
   os << "]";
   return os;

From 1eee94a80943740c90d5f79dbef8a52d4192042a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:36:49 -0400
Subject: [PATCH 06/32] Sorting real/im in read coalesced GPU ET

---
 Grid/lattice/Lattice.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index a3017198..28ea0294 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -37,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 //#include <Grid/lattice/Lattice_reality.h>
+#include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>

From ed469898dc4b05ec6f9550f7f936b3de5b405e56 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:38:40 -0400
Subject: [PATCH 07/32] coalesced ET expressions

---
 Grid/lattice/Lattice_base.h | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 2d972970..3ad9f913 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -123,10 +123,9 @@ public:
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),1,{
-      auto tmp = coalescedEval(ss,exprCopy);
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+      auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
-      //      me[ss]=tmp;
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);
@@ -147,10 +146,9 @@ public:
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),1,{
-      auto tmp = coalescedEval(ss,exprCopy);
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+      auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
-      //me[ss]=tmp;
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);
@@ -170,10 +168,9 @@ public:
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),1,{
-      auto tmp = coalescedEval(ss,exprCopy);
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+      auto tmp = eval(ss,exprCopy);
       coalescedWrite(me[ss],tmp);
-      //      me[ss]=tmp;
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);

From 9522dcd61118ef6d98009a48cdd4cce4c53757d1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:40:29 -0400
Subject: [PATCH 08/32] Remove dead commented ouot coode

---
 Grid/lattice/Lattice_comparison.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/Grid/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h
index 6a29be94..b9912639 100644
--- a/Grid/lattice/Lattice_comparison.h
+++ b/Grid/lattice/Lattice_comparison.h
@@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
 
 typedef iScalar<vInteger> vPredicate ;
 
-/*
-template <class iobj, class vobj, class robj> accelerator_inline 
-vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) 
-{
-  typename std::remove_const<vobj>::type ret;
-
-  typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  const int Nsimd = vobj::vector_type::Nsimd();
-
-  ExtractBuffer<Integer> mask(Nsimd);
-  ExtractBuffer<scalar_object> truevals(Nsimd);
-  ExtractBuffer<scalar_object> falsevals(Nsimd);
-
-  extract(iftrue, truevals);
-  extract(iffalse, falsevals);
-  extract<vInteger, Integer>(TensorRemove(predicate), mask);
-
-  for (int s = 0; s < Nsimd; s++) {
-    if (mask[s]) falsevals[s] = truevals[s];
-  }
-
-  merge(ret, falsevals);
-  return ret;
-}
-*/
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to lattice
 //////////////////////////////////////////////////////////////////////////

From 6c31b99f1f3624f93aeb5a22580969604daddb6f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:49:19 -0400
Subject: [PATCH 09/32] I knew coupling Eigen Tensor to Grid serialisation was
 a bad iddea. Now the complex is different on GPU creates probblems

---
 Grid/tensors/Tensor_traits.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index 04d7343e..dfa3c043 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -190,6 +190,36 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexD DoublePrecision;
     typedef ComplexD DoublePrecision2;
   };
+
+#ifdef GRID_CUDA
+  template<> struct GridTypeMapper<std::complex<float> > : public GridTypeMapper_Base {
+    typedef std::complex<float> scalar_type;
+    typedef std::complex<double> scalar_typeD;
+    typedef scalar_type vector_type;
+    typedef scalar_typeD vector_typeD;
+    typedef scalar_type tensor_reduced;
+    typedef scalar_type scalar_object;
+    typedef scalar_typeD scalar_objectD;
+    typedef scalar_type Complexified;
+    typedef RealF Realified;
+    typedef scalar_typeD DoublePrecision;
+    typedef scalar_typeD DoublePrecision2;
+  };
+  template<> struct GridTypeMapper<std::complex<double> > : public GridTypeMapper_Base {
+    typedef std::complex<double> scalar_type;
+    typedef std::complex<double> scalar_typeD;
+    typedef scalar_type vector_type;
+    typedef scalar_typeD vector_typeD;
+    typedef scalar_type tensor_reduced;
+    typedef scalar_type scalar_object;
+    typedef scalar_typeD scalar_objectD;
+    typedef scalar_type Complexified;
+    typedef RealD Realified;
+    typedef scalar_typeD DoublePrecision;
+    typedef scalar_typeD DoublePrecision2;
+  };
+#endif
+
   template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base {
     typedef ComplexD2 scalar_type;
     typedef ComplexD2 scalar_typeD;

From e14a84317d722808ef738b1789200a9b00a4f364 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:50:49 -0400
Subject: [PATCH 10/32] GPU math unary calls

---
 Grid/simd/Simd.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h
index 37aee2ed..1dc86c1b 100644
--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -93,6 +93,11 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(std::pow(r,y)
 using std::abs;
 using std::pow;
 using std::sqrt;
+using std::log;
+using std::exp;
+using std::sin;
+using std::cos;
+
 
 accelerator_inline RealF    conjugate(const RealF  & r){ return r; }
 accelerator_inline RealD    conjugate(const RealD  & r){ return r; }

From 7d14a3c0866bd76b8f74076bb03262c58e38615a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:53:46 -0400
Subject: [PATCH 11/32] Where working

---
 Grid/lattice/Lattice_ET.h | 185 ++++++++++++++++++--------------------
 1 file changed, 87 insertions(+), 98 deletions(-)

diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
index 24ec812c..c43844f8 100644
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -43,17 +43,16 @@ NAMESPACE_BEGIN(Grid);
 // Predicated where support
 ////////////////////////////////////////////////////
 #ifdef GRID_SIMT
+// drop to scalar in SIMT; cleaner in fact
 template <class iobj, class vobj, class robj>
 accelerator_inline vobj predicatedWhere(const iobj &predicate, 
 					const vobj &iftrue, 
 					const robj &iffalse) 
 {
-  // should drop to sccalar in SIMT
-  //  typename std::remove_const<vobj>::type ret;
-  //  Integer mask = TensorRemove(predicate);
-  //  ret = iffalse;
-  //  if (TensorRemove(mask)) ret=iftrue;
-  return iftrue;
+  Integer mask = TensorRemove(predicate);
+  typename std::remove_const<vobj>::type ret= iffalse;
+  if (mask) ret=iftrue;
+  return ret;
 }
 #else
 template <class iobj, class vobj, class robj>
@@ -98,65 +97,62 @@ struct getVectorType<Lattice<T> >{
 //--  recursive evaluation of expressions; --
 // handle leaves of syntax tree
 ///////////////////////////////////////////////////
-template<class sobj> accelerator_inline 
+template<class sobj,
+  typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr> 
+accelerator_inline 
 sobj eval(const uint64_t ss, const sobj &arg)
 {
   return arg;
 }
 template <class lobj> accelerator_inline 
-const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) 
+auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
 {
-  return arg[ss];
+  return arg(ss);
 }
 
-template<class sobj,
-  typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr> 
-accelerator_inline 
-sobj coalescedEval(const uint64_t ss, const sobj &arg)
+////////////////////////////////////////////
+//--  recursive evaluation of expressions; --
+// whole vector return, used only for expression return type inference
+///////////////////////////////////////////////////
+template<class sobj> accelerator_inline 
+sobj vecEval(const uint64_t ss, const sobj &arg)
 {
   return arg;
 }
-#ifdef GRID_SIMT
-#warning device
 template <class lobj> accelerator_inline 
-typename lobj::scalar_object coalescedEval(const uint64_t ss, const LatticeView<lobj> &arg) 
+const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg) 
 {
-  auto ret = arg(ss);
-  return ret;
-}
-#else
-#warning host
-template <class lobj> accelerator_inline 
-lobj coalescedEval(const uint64_t ss, const LatticeView<lobj> &arg) 
-{
-  //   return coalescedRead(arg[ss]);
   return arg[ss];
 }
-#endif
-/*
-template <class lobj> accelerator_inline 
-typename lobj::scalar_type coalescedEval(const uint64_t ss, const Lattice<lobj> &arg) 
-{
-  assert(0);
-  return coalescedRead(arg[ss]);
-}
-*/
-
-// What needs this?
-// Cannot be legal on accelerator
-// Comparison must convert
-#if 1
-template <class lobj> accelerator_inline 
-const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
-{
-  assert(0);
-  auto view = arg.View(AcceleratorRead);
-  return view[ss];
-}
-#endif
 
 ///////////////////////////////////////////////////
 // handle nodes in syntax tree- eval one operand
+// vecEval needed (but never called as all expressions offloaded) to infer the return type
+// in SIMT contexts of closure.
+///////////////////////////////////////////////////
+template <typename Op, typename T1> accelerator_inline 
+auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
+  -> decltype(expr.op.func( vecEval(ss, expr.arg1)))
+{
+  return expr.op.func( vecEval(ss, expr.arg1) );
+}
+// vecEval two operands
+template <typename Op, typename T1, typename T2> accelerator_inline
+auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
+  -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
+{
+  return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
+}
+// vecEval three operands
+template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
+auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
+  -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
+{
+  return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
+}
+
+///////////////////////////////////////////////////
+// handle nodes in syntax tree- eval one operand coalesced
 ///////////////////////////////////////////////////
 template <typename Op, typename T1> accelerator_inline 
 auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
@@ -174,37 +170,31 @@ auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
 // eval three operands
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
-  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
+  -> decltype(expr.op.func(eval(ss, expr.arg1), 
+			   eval(ss, expr.arg2), 
+			   eval(ss, expr.arg3)))
 {
-  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
-}
+#ifdef GRID_SIMT
+  // Handles Nsimd (vInteger) != Nsimd(ComplexD)
+  typedef decltype(vecEval(ss, expr.arg2)) rvobj;
+  typedef typename std::remove_reference<rvobj>::type vobj;
 
-///////////////////////////////////////////////////
-// handle nodes in syntax tree- eval one operand coalesced
-///////////////////////////////////////////////////
-template <typename Op, typename T1> accelerator_inline 
-auto coalescedEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
-  -> decltype(expr.op.func( coalescedEval(ss, expr.arg1)))
-{
-  return expr.op.func( coalescedEval(ss, expr.arg1) );
-}
-// eval two operands
-template <typename Op, typename T1, typename T2> accelerator_inline
-auto coalescedEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
-  -> decltype(expr.op.func( coalescedEval(ss,expr.arg1),coalescedEval(ss,expr.arg2)))
-{
-  return expr.op.func( coalescedEval(ss,expr.arg1), coalescedEval(ss,expr.arg2) );
-}
-// eval three operands
-template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
-auto coalescedEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
-  -> decltype(expr.op.func(coalescedEval(ss, expr.arg1), 
-			   coalescedEval(ss, expr.arg2), 
-			   coalescedEval(ss, expr.arg3)))
-{
-  return expr.op.func(coalescedEval(ss, expr.arg1), 
-		      coalescedEval(ss, expr.arg2), 
-		      coalescedEval(ss, expr.arg3));
+  const int Nsimd = vobj::vector_type::Nsimd();
+
+  auto vpred = vecEval(ss,expr.arg1);
+
+  ExtractBuffer<Integer> mask(Nsimd);
+  extract<vInteger, Integer>(TensorRemove(vpred), mask);
+
+  int s = acceleratorSIMTlane(Nsimd);
+  return expr.op.func(mask[s],
+		      eval(ss, expr.arg2), 
+		      eval(ss, expr.arg3));
+#else
+  return expr.op.func(eval(ss, expr.arg1),
+		      eval(ss, expr.arg2), 
+		      eval(ss, expr.arg3));
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -302,7 +292,7 @@ template <typename Op, typename T1, typename T2> inline
 void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr) 
 {
   ExpressionViewOpen(expr.arg1);  // recurse AST
-  ExpressionViewOpen(expr.arg2);  // recurse AST
+  ExpressionViewOpen(expr.arg2);  // rrecurse AST
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
@@ -346,7 +336,6 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
 // Unary operators and funcs
 ////////////////////////////////////////////
 #define GridUnopClass(name, ret)					\
-  template <class arg>							\
   struct name {								\
     template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
   };
@@ -359,8 +348,6 @@ GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
-GridUnopClass(UnaryReal, real(a));
-GridUnopClass(UnaryImag, imag(a));
 GridUnopClass(UnaryToReal, toReal(a));
 GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
@@ -379,7 +366,6 @@ GridUnopClass(UnaryExp, exp(a));
 // Binary operators
 ////////////////////////////////////////////
 #define GridBinOpClass(name, combination)			\
-  template <class left, class right>				\
   struct name {							\
     template <class _left, class _right>			\
     static auto accelerator_inline				\
@@ -403,7 +389,6 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
 // Trinary conditional op
 ////////////////////////////////////////////////////
 #define GridTrinOpClass(name, combination)				\
-  template <class predicate, class left, class right>			\
   struct name {								\
     template <class _predicate,class _left, class _right>		\
     static auto accelerator_inline					\
@@ -423,10 +408,9 @@ GridTrinOpClass(TrinaryWhere,
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
-
-#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
-#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_UNOP(name)   name
+#define GRID_BINOP(name)  name
+#define GRID_TRINOP(name) name
 
 #define GRID_DEF_UNOP(op, name)						\
   template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -478,8 +462,6 @@ GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
-GRID_DEF_UNOP(real, UnaryReal);
-GRID_DEF_UNOP(imag, UnaryImag);
 GRID_DEF_UNOP(toReal, UnaryToReal);
 GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
@@ -512,29 +494,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
   return ret;
 }
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
   return ret;
 }
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
-				   eval(0, expr.arg2),
-				   eval(0, expr.arg3)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
+				   vecEval(0, expr.arg2),
+				   vecEval(0, expr.arg3)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1),
-				eval(0, expr.arg2),
-				eval(0, expr.arg3)))>  ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
+				vecEval(0, expr.arg2),
+			        vecEval(0, expr.arg3)))>  ret(expr);
   return ret;
 }
+#define EXPRESSION_CLOSURE(function)					\
+  template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
+    auto function(Expression &expr) -> decltype(function(closure(expr))) \
+  {									\
+    return function(closure(expr));					\
+  }
+
 
 #undef GRID_UNOP
 #undef GRID_BINOP

From b918744184c13cab718bcaeb021805a1281d616d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:54:46 -0400
Subject: [PATCH 12/32] Prettificatoin

---
 Grid/qcd/utils/SUnAdjoint.h | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h
index 1d530373..18d6b875 100644
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@@ -39,7 +39,7 @@ public:
   typedef iSUnAdjointMatrix<ComplexF> AMatrixF;
   typedef iSUnAdjointMatrix<ComplexD> AMatrixD;
 
-  typedef iSUnAdjointMatrix<vComplex> vAMatrix;
+  typedef iSUnAdjointMatrix<vComplex>  vAMatrix;
   typedef iSUnAdjointMatrix<vComplexF> vAMatrixF;
   typedef iSUnAdjointMatrix<vComplexD> vAMatrixD;
 
@@ -47,14 +47,9 @@ public:
   typedef Lattice<vAMatrixF> LatticeAdjMatrixF;
   typedef Lattice<vAMatrixD> LatticeAdjMatrixD;
 
-  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
-  LatticeAdjField;
-  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
-  LatticeAdjFieldF;
-  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
-  LatticeAdjFieldD;
-
-
+  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >  LatticeAdjField;
+  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
+  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;
 
 
   template <class cplx>
@@ -128,7 +123,9 @@ public:
   }
 
   // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
-  static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
+  static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) 
+  {
+    
     conformable(h_out, in);
     h_out = Zero();
     AMatrix iTa;
@@ -136,7 +133,7 @@ public:
 
     for (int a = 0; a < Dimension; a++) {
       generator(a, iTa);
-      auto tmp = real(trace(iTa * in)) * coefficient;
+      LatticeComplex tmp = real(trace(iTa * in)) * coefficient;
       pokeColour(h_out, tmp, a);
     }
   }

From 3d27708f0757d418fbb773dafb5d760a599080b2 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:55:49 -0400
Subject: [PATCH 13/32] Basic where test

---
 tests/core/Test_where.cc | 80 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 tests/core/Test_where.cc

diff --git a/tests/core/Test_where.cc b/tests/core/Test_where.cc
new file mode 100644
index 00000000..050b711b
--- /dev/null
+++ b/tests/core/Test_where.cc
@@ -0,0 +1,80 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_poisson_fft.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+ ;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  int N=16;
+  
+  std::vector<int> latt_size  ({N,4,4});
+  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1});
+  std::vector<int> mpi_layout ({1,1,1});
+
+  int vol = 1;
+  int nd  = latt_size.size();
+  for(int d=0;d<nd;d++){
+    vol = vol * latt_size[d];
+  }
+
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexD      zz(&GRID);
+  LatticeInteger     coor(&GRID);
+  LatticeComplexD  rn(&GRID);
+  LatticeComplexD  sl(&GRID);
+
+  zz  = ComplexD(0.0,0.0);
+
+  GridParallelRNG RNG(&GRID);
+  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));  
+  gaussian(RNG,rn);
+
+  RealD nn=norm2(rn);
+  for(int mu=0;mu<nd;mu++){
+    RealD ns=0.0;
+    for(int t=0;t<latt_size[mu];t++){
+      LatticeCoordinate(coor,mu);
+      sl=where(coor==Integer(t),rn,zz);
+      std::cout <<GridLogMessage<< " sl " << sl<<std::endl;
+      std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+      ns=ns+norm2(sl);
+    }
+    std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" " << nn-ns<<std::endl;
+  }
+
+  Grid_finalize();
+}

From e21fef17df4d9a6cdb281e916172dbdf957b3b0a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 31 Aug 2020 23:56:26 -0400
Subject: [PATCH 14/32] real and imag part not in ET

---
 Grid/lattice/Lattice_real_imag.h | 79 ++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 Grid/lattice/Lattice_real_imag.h

diff --git a/Grid/lattice/Lattice_real_imag.h b/Grid/lattice/Lattice_real_imag.h
new file mode 100644
index 00000000..003300cc
--- /dev/null
+++ b/Grid/lattice/Lattice_real_imag.h
@@ -0,0 +1,79 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_reality.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_REAL_IMAG_H
+#define GRID_LATTICE_REAL_IMAG_H
+
+
+// FIXME .. this is the sector of the code 
+// I am most worried about the directions
+// The choice of burying complex in the SIMD
+// is making the use of "real" and "imag" very cumbersome
+
+NAMESPACE_BEGIN(Grid);
+
+template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
+  Lattice<vobj> ret(lhs.Grid());
+
+  autoView( lhs_v, lhs, AcceleratorRead);
+  autoView( ret_v, ret, AcceleratorWrite);
+
+  ret.Checkerboard()=lhs.Checkerboard();
+  accelerator_for( ss, lhs_v.size(), 1, {
+    ret_v[ss] =real(lhs_v[ss]);
+  });
+  return ret;
+};
+template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
+  Lattice<vobj> ret(lhs.Grid());
+
+  autoView( lhs_v, lhs, AcceleratorRead);
+  autoView( ret_v, ret, AcceleratorWrite);
+
+  ret.Checkerboard()=lhs.Checkerboard();
+  accelerator_for( ss, lhs_v.size(), 1, {
+    ret_v[ss] =imag(lhs_v[ss]);
+  });
+  return ret;
+};
+
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+  auto real(const Expression &expr) -> decltype(real(closure(expr)))		
+{									
+  return real(closure(expr));					
+}
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+  auto imag(const Expression &expr) -> decltype(imag(closure(expr)))		
+{									
+  return imag(closure(expr));					
+}
+
+NAMESPACE_END(Grid);
+
+#endif

From 8b74174d7419600cbff0ad364657bef3b9b5bed9 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 00:03:26 -0400
Subject: [PATCH 15/32] Eigen tensor serialisatiino happy undeer GPU. Regret
 agreeing to let us couple Eigen types to Grid IO

---
 tests/IO/Test_serialisation.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index 7f3e5bca..27fe589e 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -47,14 +47,14 @@ public:
                           bool , b,
                           std::vector<double>, array,
                           std::vector<std::vector<double> >, twodimarray,
-			  std::vector<std::vector<std::vector<Complex> > >, cmplx3darray,
+			  std::vector<std::vector<std::vector<std::complex<double>> > >, cmplx3darray,
 			  SpinColourMatrix, scm
                           );
   myclass() {}
   myclass(int i)
   : array(4,5.1)
   , twodimarray(3,std::vector<double>(5, 1.23456))
-  , cmplx3darray(3,std::vector<std::vector<Complex>>(5, std::vector<Complex>(7, Complex(1.2, 3.4))))
+  , cmplx3darray(3,std::vector<std::vector<std::complex<double>>>(5, std::vector<std::complex<double>>(7, std::complex<double>(1.2, 3.4))))
   , ve(2, myenum::blue)
   {
     e=myenum::red;
@@ -121,11 +121,11 @@ namespace Eigen {
 // Perform I/O tests on a range of tensor types
 // Test coverage: scalars, complex and GridVectors in single, double and default precision
 class TensorIO : public Serializable {
-  using TestScalar = ComplexD;
+  using TestScalar = std::complex<double>;
   using SR3 = Eigen::Sizes<9,4,2>;
   using SR5 = Eigen::Sizes<5,4,3,2,1>;
   using ESO = Eigen::StorageOptions;
-  using TensorRank3  = Eigen::Tensor<ComplexF, 3, ESO::RowMajor>;
+  using TensorRank3  = Eigen::Tensor<std::complex<float>, 3, ESO::RowMajor>;
   using TensorR5     = Eigen::TensorFixedSize<Real, SR5>;
   using TensorR5Alt  = Eigen::TensorFixedSize<Real, SR5, ESO::RowMajor>;
   using Tensor942    = Eigen::TensorFixedSize<TestScalar, SR3, ESO::RowMajor>;
@@ -134,8 +134,8 @@ class TensorIO : public Serializable {
   using LSCTensor    = Eigen::TensorFixedSize<SpinColourMatrix, Eigen::Sizes<6,5>>;
   
   static const Real       FlagR;
-  static const Complex    Flag;
-  static const ComplexF   FlagF;
+  static const std::complex<double>   Flag;
+  static const std::complex<float>    FlagF;
   static const TestScalar FlagTS;
   static const char * const pszFilePrefix;
 
@@ -230,8 +230,8 @@ public:
 };
 
 const Real                 TensorIO::FlagR {1};
-const Complex              TensorIO::Flag  {1,-1};
-const ComplexF             TensorIO::FlagF {1,-1};
+const std::complex<double> TensorIO::Flag  {1,-1};
+const std::complex<float>  TensorIO::FlagF {1,-1};
 const TensorIO::TestScalar TensorIO::FlagTS{1,-1};
 const char * const         TensorIO::pszFilePrefix = "tensor_";
 

From cbc995b74cf982cfcb1ef59991728490edff7198 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 00:12:54 -0400
Subject: [PATCH 16/32] Made better interface

---
 tests/core/Test_main.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc
index 08752a46..af8b747b 100644
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -137,7 +137,6 @@ int main(int argc, char **argv) {
       LatticeReal iscalar(&Fine);
 
       SpinMatrix GammaFive;
-      iSpinMatrix<vComplex> iGammaFive;
       ColourMatrix cmat;
 
       random(FineRNG, Foo);
@@ -283,7 +282,6 @@ int main(int argc, char **argv) {
       cMat = mydouble * cMat;
 
       sMat = adj(sMat);          // LatticeSpinMatrix adjoint
-      sMat = iGammaFive * sMat;  // SpinMatrix * LatticeSpinMatrix
       sMat = GammaFive * sMat;   // SpinMatrix * LatticeSpinMatrix
       scMat = adj(scMat);
       cMat = adj(cMat);

From 15ca8637f357bc080a75bcd5da46e856b4fb2122 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 00:13:32 -0400
Subject: [PATCH 17/32] No norms in HermOp

---
 tests/core/Test_staggered_naive.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc
index f96bac93..9fe35a54 100644
--- a/tests/core/Test_staggered_naive.cc
+++ b/tests/core/Test_staggered_naive.cc
@@ -261,11 +261,11 @@ int main (int argc, char ** argv)
   pickCheckerboard(Odd ,phi_o,phi);
 
   SchurDiagMooeeOperator<NaiveStaggeredFermionR,FermionField> HermOpEO(Ds);
-  HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
-  HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
+  HermOpEO.MpcDagMpc(chi_e,dchi_e);
+  HermOpEO.MpcDagMpc(chi_o,dchi_o);
 
-  HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
-  HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
+  HermOpEO.MpcDagMpc(phi_e,dphi_e);
+  HermOpEO.MpcDagMpc(phi_o,dphi_o);
 
   pDce = innerProduct(phi_e,dchi_e);
   pDco = innerProduct(phi_o,dchi_o);

From d982a5b6d598af63d4c51443ef173ee6b131ab4a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 00:14:04 -0400
Subject: [PATCH 18/32] Fix coaarsened

---
 tests/debug/Test_cayley_coarsen_support.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc
index e6e18250..e91b3070 100644
--- a/tests/debug/Test_cayley_coarsen_support.cc
+++ b/tests/debug/Test_cayley_coarsen_support.cc
@@ -122,7 +122,7 @@ int main (int argc, char ** argv)
   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
 
   Subspace Aggregates(Coarse5d,FGrid,cb);
-  Aggregates.CreateSubspaceRandom(RNG5);
+  //  Aggregates.CreateSubspaceRandom(RNG5);
 
   subspace=Aggregates.subspace;
 
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
   LittleDiracOp.M(c_src,c_res);
 
   std::cout<<GridLogMessage<<"Testing hermiticity explicitly by inspecting matrix elements"<<std::endl;
-  LittleDiracOp.AssertHermitian();
+  //  LittleDiracOp.AssertHermitian();
 
   std::cout<<GridLogMessage << "Testing Hermiticity stochastically "<< std::endl;
   CoarseVector phi(Coarse5d);

From c54553017021d1643b0561394f0e5ec7719ddd0a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 00:14:33 -0400
Subject: [PATCH 19/32] little worry large Nbasis doesnt compile GPU

---
 tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
index 0b1c3a1f..d9249e0d 100644
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
@@ -311,7 +311,7 @@ int main (int argc, char ** argv) {
   std::cout << GridLogMessage << "Keep " << coarse.Nk << " total vectors" << std::endl;
   assert(Nm2 >= Nm1);
 
-  const int nbasis= 70;
+  const int nbasis= 32;
   CoarseFineIRL<vSpinColourVector,vTComplex,nbasis> IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd);
 
   std::cout << GridLogMessage << "Constructed CoarseFine IRL" << std::endl;

From c273fb051cdb7c739f4eba1ad6a15e203f1fcab0 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 15:27:59 -0400
Subject: [PATCH 20/32] Peek poke laattice

---
 Grid/lattice/Lattice_peekpoke.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h
index c79becf2..5caab214 100644
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -182,6 +182,14 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
       
   return;
 };
+template<class vobj,class sobj>
+inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
+{
+  autoView(lv,l,CpuRead);
+  peekLocalSite(s,lv,site);
+  return;
+};
+
 // Must be CPU write view
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
@@ -210,6 +218,14 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
   return;
 };
 
+template<class vobj,class sobj>
+inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
+{
+  autoView(lv,l,CpuWrite);
+  pokeLocalSite(s,lv,site);
+  return;
+};
+
 NAMESPACE_END(Grid);
 #endif
 

From 5791021dcd07d619dde942ba3777bca4cf14378c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 15:28:15 -0400
Subject: [PATCH 21/32] Speed up Cshift more with coalesced

---
 Grid/cshift/Cshift.h        | 19 ++-----------------
 Grid/cshift/Cshift_common.h | 12 ++++++------
 2 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/Grid/cshift/Cshift.h b/Grid/cshift/Cshift.h
index 9150579c..c7b9e3cb 100644
--- a/Grid/cshift/Cshift.h
+++ b/Grid/cshift/Cshift.h
@@ -52,23 +52,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
-template<typename Op, typename T1> 
-auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
-    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
-{
-  return Cshift(closure(expr),dim,shift);
-}
-template <class Op, class T1, class T2>
-auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
-{
-  return Cshift(closure(expr),dim,shift);
-}
-template <class Op, class T1, class T2, class T3>
-auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
-				   eval(0, expr.arg2),
-				   eval(0, expr.arg3)))> 
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
 {
   return Cshift(closure(expr),dim,shift);
 }
diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index d296c024..723b88be 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -76,8 +76,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
     autoView(rhs_v , rhs, AcceleratorRead);
     auto buffer_p = & buffer[0];
     auto table = &Cshift_table[0];
-    accelerator_for(i,ent,1,{
-      buffer_p[table[i].first]=rhs_v[table[i].second];
+    accelerator_for(i,ent,vobj::Nsimd(),{
+	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
   }
 }
@@ -185,8 +185,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
     autoView( rhs_v, rhs, AcceleratorWrite);
     auto buffer_p = & buffer[0];
     auto table = &Cshift_table[0];
-    accelerator_for(i,ent,1,{
-	rhs_v[table[i].first]=buffer_p[table[i].second];
+    accelerator_for(i,ent,vobj::Nsimd(),{
+	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
     });
   }
 }
@@ -282,8 +282,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
     autoView(rhs_v , rhs, AcceleratorRead);
     autoView(lhs_v , lhs, AcceleratorWrite);
     auto table = &Cshift_table[0];
-    accelerator_for(i,ent,1,{
-      lhs_v[table[i].first]=rhs_v[table[i].second];
+    accelerator_for(i,ent,vobj::Nsimd(),{
+      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
   }
 }

From 8807d998bc7c12ac5c02c1e5dcf1cf1d1bdfb34a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 15:29:11 -0400
Subject: [PATCH 22/32] closure improved

---
 Grid/qcd/action/gauge/GaugeImplementations.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h
index 19bc5aa6..a14aec1b 100644
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,7 +59,7 @@ public:
   }
   static inline GaugeLinkField
   CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(closure(adj(Link)), mu, -1);
+    return Cshift(adj(Link), mu, -1);
   }
   static inline GaugeLinkField
   CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {

From 1654c4f3c0ffec577833f3864aad8fedbce2debe Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 15:29:45 -0400
Subject: [PATCH 23/32] Closure improved

---
 Grid/qcd/utils/WilsonLoops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h
index fdd53698..0367c9fa 100644
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -485,7 +485,7 @@ public:
 
         // Up staple    ___ ___
         //             |       |
-        tmp = Cshift(closure(adj(U[nu])), nu, -1);
+        tmp = Cshift(adj(U[nu]), nu, -1);
         tmp = adj(U2[mu]) * tmp;
         tmp = Cshift(tmp, mu, -2);
 
@@ -519,7 +519,7 @@ public:
         //
         //      |  |
 
-        tmp = Cshift(closure(adj(U2[nu])), nu, -2);
+        tmp = Cshift(adj(U2[nu]), nu, -2);
         tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
         tmp = U2[nu] * Cshift(tmp, nu, 2);
         Stap += Cshift(tmp, mu, 1);

From eac1f08b7b1412eef6be9b53d86a552fc0e427e4 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Sep 2020 15:30:33 -0400
Subject: [PATCH 24/32] Close expressions passed as an argument

---
 Grid/qcd/utils/CovariantCshift.h | 49 ++++++++++++++------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h
index 6ac69150..cee1fa12 100644
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -53,23 +53,21 @@ namespace PeriodicBC {
     return Cshift(tmp,mu,-1);// moves towards positive mu
   }
 
-  template<class gauge,typename Op, typename T1> auto
-    CovShiftForward(const Lattice<gauge> &Link, 
-		    int mu,
-		    const LatticeUnaryExpression<Op,T1> &expr)
-    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
+    auto  CovShiftForward(const Lattice<gauge> &Link, 
+			  int mu,
+			  const Expr &expr) -> decltype(closure(expr))
   {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    auto arg = closure(expr);
     return CovShiftForward(Link,mu,arg);
   }
-  template<class gauge,typename Op, typename T1> auto
-    CovShiftBackward(const Lattice<gauge> &Link, 
-		     int mu,
-		     const LatticeUnaryExpression<Op,T1> &expr)
-    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
+    auto  CovShiftBackward(const Lattice<gauge> &Link, 
+			   int mu,
+			   const Expr &expr) -> decltype(closure(expr))
   {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
-    return CovShiftForward(Link,mu,arg);
+    auto arg = closure(expr);
+    return CovShiftBackward(Link,mu,arg);
   }
 
 }
@@ -142,26 +140,23 @@ namespace ConjugateBC {
     return Cshift(tmp,mu,-1);// moves towards positive mu
   }
 
-  template<class gauge,typename Op, typename T1> auto
-    CovShiftForward(const Lattice<gauge> &Link, 
-		    int mu,
-		    const LatticeUnaryExpression<Op,T1> &expr)
-    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
+    auto  CovShiftForward(const Lattice<gauge> &Link, 
+			  int mu,
+			  const Expr &expr) -> decltype(closure(expr))
   {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
+    auto arg = closure(expr);
     return CovShiftForward(Link,mu,arg);
   }
-  template<class gauge,typename Op, typename T1> auto
-    CovShiftBackward(const Lattice<gauge> &Link, 
-		     int mu,
-		     const LatticeUnaryExpression<Op,T1> &expr)
-    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
+    auto  CovShiftBackward(const Lattice<gauge> &Link, 
+			   int mu,
+			   const Expr &expr)  -> decltype(closure(expr))
   {
-    Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
-    return CovShiftForward(Link,mu,arg);
+    auto arg = closure(expr);
+    return CovShiftBackward(Link,mu,arg);
   }
 
-
 }
 
 

From d3ce60713d87dd1baf7d9f7acf1097da1c932d1a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:44:13 -0400
Subject: [PATCH 25/32] UVM, Device and Lattice/aligned allocators

---
 Grid/allocator/AlignedAllocator.h | 57 +++++++++++++++++++++++++++----
 Grid/allocator/MemoryManager.h    |  4 +--
 2 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index ebb3162b..249732fb 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -65,8 +65,7 @@ public:
     MemoryManager::CpuFree((void *)__p,bytes);
   }
 
-  // FIXME: hack for the copy constructor, eventually it must be avoided
-  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
+  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
   void construct(pointer __p, const _Tp& __val) { assert(0);};
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
@@ -74,6 +73,9 @@ public:
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 
+//////////////////////////////////////////////////////////////////////////////////////
+// Unified virtual memory
+//////////////////////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class uvmAllocator {
 public: 
@@ -109,22 +111,63 @@ public:
     MemoryManager::SharedFree((void *)__p,bytes);
   }
 
-  // FIXME: hack for the copy constructor, eventually it must be avoided
   void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
-  //void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 
+////////////////////////////////////////////////////////////////////////////////
+// Device memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename _Tp>
+class devAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef devAllocator<_Tp1> other; };
+  devAllocator() throw() { }
+  devAllocator(const devAllocator&) throw() { }
+  template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
+  ~devAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+  pointer allocate(size_type __n, const void* _p= 0)
+  { 
+    size_type bytes = __n*sizeof(_Tp);
+    profilerAllocate(bytes);
+    _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    return ptr;
+  }
+
+  void deallocate(pointer __p, size_type __n) 
+  { 
+    size_type bytes = __n * sizeof(_Tp);
+    profilerFree(bytes);
+    MemoryManager::AcceleratorFree((void *)__p,bytes);
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
+
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using commAllocator = uvmAllocator<T>;
+//template<class T> using commAllocator = devAllocator<T>;
 template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
-template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
-//template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
+template<class T> using commVector = std::vector<T,devAllocator<T> >;
 
 NAMESPACE_END(Grid);
 
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 23065c58..aac13aee 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -93,12 +93,12 @@ private:
   static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
 
-  static void *AcceleratorAllocate(size_t bytes);
-  static void  AcceleratorFree    (void *ptr,size_t bytes);
   static void PrintBytes(void);
  public:
   static void Init(void);
   static void InitMessage(void);
+  static void *AcceleratorAllocate(size_t bytes);
+  static void  AcceleratorFree    (void *ptr,size_t bytes);
   static void *SharedAllocate(size_t bytes);
   static void  SharedFree    (void *ptr,size_t bytes);
   static void *CpuAllocate(size_t bytes);

From 0c3095e173bc2886b121edb3fafa78646948f5e8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:45:35 -0400
Subject: [PATCH 26/32] Comms buffers to device memory

---
 Grid/communicator/Communicator_mpi3.cc | 88 +++++++++-----------------
 1 file changed, 31 insertions(+), 57 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 6130195d..28c7b8a4 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -302,60 +302,35 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int bytes)
 {
   std::vector<CommsRequest_t> reqs(0);
-  //    unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  //    unsigned long  rcrc = crc32(0L, Z_NULL, 0);
-  //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-  //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
-}
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) {
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
+  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+
   int myrank = _processor;
   int ierr;
 
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
-    MPI_Request xrq;
-    MPI_Request rrq;
+  // Enforce no UVM in comms, device or host OK
+  int uvm;
+  auto 
+  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit);
+  assert(cuerr == cudaSuccess );
+  assert(uvm==0);
 
-    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv);
+  assert(cuerr == cudaSuccess );
+  assert(uvm==0);
 
-    assert(ierr==0);
-    list.push_back(xrq);
-    list.push_back(rrq);
-  } else {
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		      recv,bytes,MPI_CHAR,from, from,
-		      communicator,MPI_STATUS_IGNORE);
-    assert(ierr==0);
-  }
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
+  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+		    recv,bytes,MPI_CHAR,from, from,
+		    communicator,MPI_STATUS_IGNORE);
+  assert(ierr==0);
+
+  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
+  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
-
+// Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest,
 						     void *recv,
@@ -411,15 +386,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 
   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{
-  SendToRecvFromComplete(waitall);
-}
-void CartesianCommunicator::StencilBarrier(void)
-{
-  MPI_Barrier  (ShmComm);
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();
 
@@ -430,6 +397,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
   assert(ierr==0);
   list.resize(0);
 }
+void CartesianCommunicator::StencilBarrier(void)
+{
+  MPI_Barrier  (ShmComm);
+}
+//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+//{
+//}
 void CartesianCommunicator::Barrier(void)
 {
   int ierr = MPI_Barrier(communicator);

From b4255140d611f3cbb1ccf6148450f14f5e0539cb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:47:46 -0400
Subject: [PATCH 27/32] Stale data member eliminated

---
 Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h | 2 +-
 Grid/qcd/action/fermion/WilsonFermion5D.h            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 625eda63..ca660610 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -208,7 +208,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
     
   // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
     
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities
diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
index 804b1d10..80231bb4 100644
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -215,7 +215,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
     
   // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 
 
 };

From 85b1c5df39c33e3de9d3ab2917400b9c985428c8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:48:16 -0400
Subject: [PATCH 28/32] A never hit case that is not 100% confident is asserted
 for safety

---
 Grid/cshift/Cshift_common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index d296c024..40ab5032 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -222,6 +222,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
     // Test_cshift_red_black code.
     //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
+    assert(0); // This will fail if hit on GPU
     autoView( rhs_v, rhs, CpuWrite);
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){

From bcd7895362e6c1d1084da44837a8aa353d0ac936 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:49:13 -0400
Subject: [PATCH 29/32] Include cuda.h

---
 Grid/threads/Accelerator.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 74a3ea22..29d12904 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -90,6 +90,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 
 #ifdef GRID_CUDA
+#include <cuda.h>
 
 #ifdef __CUDA_ARCH__
 #define GRID_SIMT

From 8244caff2507eab0a99a59f275bd1745efa7336d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 18:52:55 -0400
Subject: [PATCH 30/32] Remove the asynchronous non-Stencil calls.

---
 benchmarks/Benchmark_comms.cc | 108 ++++------------------------------
 1 file changed, 10 insertions(+), 98 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 44ccbd19..232030c8 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -74,90 +74,6 @@ int main (int argc, char ** argv)
   std::vector<double> t_time(Nloop);
   time_statistics timestat;
 
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
-  for(int lat=8;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
-
-      Coordinate latt_size  ({lat*mpi_layout[0],
-	                      lat*mpi_layout[1],
-      			      lat*mpi_layout[2],
-      			      lat*mpi_layout[3]});
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
-
-      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
-      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
-
-      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      for(int mu=0;mu<8;mu++){
-	xbuf[mu].resize(lat*lat*lat*Ls);
-	rbuf[mu].resize(lat*lat*lat*Ls);
-	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
-      }
-
-      for(int i=0;i<Nloop;i++){
-      double start=usecond();
-
-	std::vector<CommsRequest_t> requests;
-
-	ncomm=0;
-	for(int mu=0;mu<4;mu++){
-	
-	  if (mpi_layout[mu]>1 ) {
-	  
-	    ncomm++;
-	    int comm_proc=1;
-	    int xmit_to_rank;
-	    int recv_from_rank;
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.SendToRecvFromBegin(requests,
-				   (void *)&xbuf[mu][0],
-				   xmit_to_rank,
-				   (void *)&rbuf[mu][0],
-				   recv_from_rank,
-				   bytes);
-	
-	    comm_proc = mpi_layout[mu]-1;
-	  
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.SendToRecvFromBegin(requests,
-				     (void *)&xbuf[mu+4][0],
-				     xmit_to_rank,
-				     (void *)&rbuf[mu+4][0],
-				     recv_from_rank,
-				     bytes);
-	  
-	  }
-	}
-	Grid.SendToRecvFromComplete(requests);
-	Grid.Barrier();
-	double stop=usecond();
-	t_time[i] = stop-start; // microseconds
-      }
-
-      timestat.statistics(t_time);
-
-      double dbytes    = bytes*ppn;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
-
-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-    }
-  }    
 
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -206,26 +122,22 @@ int main (int argc, char ** argv)
 	    {
 	      std::vector<CommsRequest_t> requests;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFromBegin(requests,
-				       (void *)&xbuf[mu][0],
-				       xmit_to_rank,
-				       (void *)&rbuf[mu][0],
-				       recv_from_rank,
-				       bytes);
-	      Grid.SendToRecvFromComplete(requests);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
 	    }
 
 	    comm_proc = mpi_layout[mu]-1;
 	    {
 	      std::vector<CommsRequest_t> requests;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFromBegin(requests,
-				       (void *)&xbuf[mu+4][0],
-				       xmit_to_rank,
-				       (void *)&rbuf[mu+4][0],
-				       recv_from_rank,
-				       bytes);
-	      Grid.SendToRecvFromComplete(requests);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
 	    }
 	  }
 	}

From a8309638d4be9faf49619a8220a9b06a4e16eb73 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 20:29:26 -0400
Subject: [PATCH 31/32] UVM check in MPI calls

---
 Grid/communicator/Communicator_mpi3.cc | 11 ++---------
 Grid/threads/Accelerator.h             | 22 ++++++++++++++++++++++
 configure.ac                           |  5 ++---
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 28c7b8a4..83f71233 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -309,15 +309,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
   int ierr;
 
   // Enforce no UVM in comms, device or host OK
-  int uvm;
-  auto 
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit);
-  assert(cuerr == cudaSuccess );
-  assert(uvm==0);
-
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv);
-  assert(cuerr == cudaSuccess );
-  assert(uvm==0);
+  assert(acceleratorIsCommunicable(xmit));
+  assert(acceleratorIsCommunicable(recv));
 
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 29d12904..1a3dfdc2 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
 //
 // Memory management:
 //
+//    int   acceleratorIsCommunicable(void *pointer);
 //    void *acceleratorAllocShared(size_t bytes);
 //    void acceleratorFreeShared(void *ptr);
 //
@@ -166,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline int  acceleratorIsCommunicable(void *ptr)
+{
+  int uvm;
+  auto 
+  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
+  assert(cuerr == cudaSuccess );
+  if(uvm) return 0;
+  else    return 1;
+}
+
 #endif
 
 //////////////////////////////////////////////
@@ -220,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline int  acceleratorIsCommunicable(void *ptr)
+{
+#if 0
+  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
+  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
+  else return 0;
+#endif
+  return 1;
+}
 
 #endif
 
@@ -299,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   return malloc(bytes);
 #endif
 };
+inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 
 inline void *acceleratorAllocDevice(size_t bytes)
 {
@@ -353,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
 
+inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
diff --git a/configure.ac b/configure.ac
index f0bea6a4..5113757c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator],
 case ${ac_ACCELERATOR} in
     cuda)
       echo CUDA acceleration
+      LIBS="${LIBS} -lcuda"
       AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
     sycl)
       echo SYCL acceleration
@@ -323,7 +324,6 @@ case ${CXXTEST} in
 #    CXXLD="nvcc -v -link"
     CXX="${CXXBASE} -x cu "
     CXXLD="${CXXBASE} -link"
-#    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
     if test $ac_openmp = yes; then
        CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
@@ -483,8 +483,7 @@ case ${ac_SHM} in
      LDFLAGS_CPY=$LDFLAGS
      CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
      LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-     AC_SEARCH_LIBS([shm_unlink], [rt], [],
-                    [AC_MSG_ERROR("no library found for shm_unlink")])
+     AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")])
      CXXFLAGS=$CXXFLAGS_CPY
      LDFLAGS=$LDFLAGS_CPY
      ;;

From 65b724bb5fbacadf4063278c86178586696fe866 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 21:46:43 -0400
Subject: [PATCH 32/32] 2 level hddcr

---
 tests/solver/Test_dwf_hdcr_2level.cc | 420 +++++++++++++++++++++++++++
 1 file changed, 420 insertions(+)
 create mode 100644 tests/solver/Test_dwf_hdcr_2level.cc

diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc
new file mode 100644
index 00000000..df24c9d2
--- /dev/null
+++ b/tests/solver/Test_dwf_hdcr_2level.cc
@@ -0,0 +1,420 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_hdcr.cc
+
+    Copyright (C) 2015
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+
+using namespace std;
+using namespace Grid;
+/* Params
+ * Grid: 
+ * block1(4)
+ * block2(4)
+ * 
+ * Subspace
+ * * Fine  : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100
+ * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100
+
+ * Smoother:
+ * * Fine: Cheby(hi, lo, order)            --  60,0.5,10
+ * * Coarse: Cheby(hi, lo, order)          --  12,0.1,4
+
+ * Lanczos:
+ * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order))   24,36,24,0.002,4.0,61 
+ */
+RealD InverseApproximation(RealD x){
+  return 1.0/x;
+}
+
+template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  typedef LinearOperatorBase<Field>                            FineOperator;
+  Matrix         & _SmootherMatrix;
+  FineOperator   & _SmootherOperator;
+  
+  Chebyshev<Field> Cheby;
+
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) :
+    _SmootherOperator(SmootherOperator),
+    _SmootherMatrix(SmootherMatrix),
+    Cheby(_lo,_hi,_ord,InverseApproximation)
+  {};
+
+  void operator() (const Field &in, Field &out) 
+  {
+    Field tmp(in.Grid());
+    MdagMLinearOperator<Matrix,Field>   MdagMOp(_SmootherMatrix); 
+    _SmootherOperator.AdjOp(in,tmp);
+    Cheby(MdagMOp,tmp,out);         
+  }
+};
+template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
+{
+public:
+  typedef LinearOperatorBase<Field>                            FineOperator;
+  Matrix         & SmootherMatrix;
+  FineOperator   & SmootherOperator;
+  RealD tol;
+  RealD shift;
+  int   maxit;
+
+  MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) :
+    shift(_shift),tol(_tol),maxit(_maxit),
+    SmootherOperator(_SmootherOperator),
+    SmootherMatrix(_SmootherMatrix)
+  {};
+
+  void operator() (const Field &in, Field &out) 
+  {
+    ZeroGuesser<Field> Guess;
+    ConjugateGradient<Field>  CG(tol,maxit,false);
+ 
+    Field src(in.Grid());
+
+    ShiftedMdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(SmootherMatrix,shift);
+    SmootherOperator.AdjOp(in,src);
+    Guess(src,out);
+    CG(MdagMOp,src,out); 
+  }
+};
+template<class Field,class Matrix> class RedBlackSmoother : public LinearFunction<Field>
+{
+public:
+  typedef LinearOperatorBase<Field>                            FineOperator;
+  Matrix         & SmootherMatrix;
+  RealD tol;
+  RealD shift;
+  int   maxit;
+
+  RedBlackSmoother(RealD _shift,RealD _tol,int _maxit,Matrix &_SmootherMatrix) :
+    shift(_shift),tol(_tol),maxit(_maxit),
+    SmootherMatrix(_SmootherMatrix)
+  {};
+
+  void operator() (const Field &in, Field &out) 
+  {
+    std::cout << " Red Black Smootheer "<<norm2(in)<<" " <<norm2(out)<<std::endl;
+    ConjugateGradient<Field>  CG(tol,maxit,false);
+    out =Zero();
+    SchurRedBlackDiagMooeeSolve<Field> RBSolver(CG);
+    RBSolver(SmootherMatrix,in,out); 
+    std::cout << " Red Black Smootheer "<<norm2(in)<<" " <<norm2(out)<<std::endl;
+  }
+};
+
+
+template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
+class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
+public:
+
+  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
+  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
+  typedef LinearOperatorBase<FineField>                            FineOperator;
+  typedef LinearFunction    <FineField>                            FineSmoother;
+
+  Aggregates     & _Aggregates;
+  CoarseOperator & _CoarseOperator;
+  Matrix         & _FineMatrix;
+  FineOperator   & _FineOperator;
+  Guesser        & _Guess;
+  FineSmoother   & _Smoother1;
+  FineSmoother   & _Smoother2;
+  CoarseSolver   & _CoarseSolve;
+
+  int    level;  void Level(int lv) {level = lv; };
+
+#define GridLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level <<" "
+
+  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
+			  FineOperator &Fine,Matrix &FineMatrix,
+			  FineSmoother &Smoother1,
+			  FineSmoother &Smoother2,
+			  Guesser &Guess_,
+			  CoarseSolver &CoarseSolve_)
+    : _Aggregates(Agg),
+      _CoarseOperator(Coarse),
+      _FineOperator(Fine),
+      _FineMatrix(FineMatrix),
+      _Smoother1(Smoother1),
+      _Smoother2(Smoother2),
+      _Guess(Guess_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+
+  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
+			  FineOperator &Fine,Matrix &FineMatrix,
+			  FineSmoother &Smoother,
+			  Guesser &Guess_,
+			  CoarseSolver &CoarseSolve_)
+    : _Aggregates(Agg),
+      _CoarseOperator(Coarse),
+      _FineOperator(Fine),
+      _FineMatrix(FineMatrix),
+      _Smoother1(Smoother),
+      _Smoother2(Smoother),
+      _Guess(Guess_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+
+  virtual void operator()(const FineField &in, FineField & out) 
+  {
+    CoarseVector Csrc(_CoarseOperator.Grid());
+    CoarseVector Csol(_CoarseOperator.Grid()); 
+    FineField vec1(in.Grid());
+    FineField vec2(in.Grid());
+
+    double t;
+    // Fine Smoother
+    t=-usecond();
+    _Smoother1(in,out);
+    t+=usecond();
+    GridLogLevel << "Smoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Update the residual
+    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   
+
+    // Fine to Coarse 
+    t=-usecond();
+    _Aggregates.ProjectToSubspace  (Csrc,vec1);
+    t+=usecond();
+    GridLogLevel << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse correction
+    t=-usecond();
+    _CoarseSolve(Csrc,Csol);
+    t+=usecond();
+    GridLogLevel << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse to Fine
+    t=-usecond();
+    _Aggregates.PromoteFromSubspace(Csol,vec1); 
+    add(out,out,vec1);
+    t+=usecond();
+    GridLogLevel << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Residual
+    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  
+
+    // Fine Smoother
+    t=-usecond();
+    _Smoother2(vec1,vec2);
+    t+=usecond();
+    GridLogLevel << "Smoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    add( out,out,vec2);
+  }
+};
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16;
+  const int rLs=8;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  ///////////////////////////////////////////////////
+  // Construct a coarsened grid; utility for this?
+  ///////////////////////////////////////////////////
+  std::vector<int> block ({2,2,2,2});
+  const int nbasis= 32;
+
+  auto clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/block[d];
+  }
+
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+  LatticeFermion    src(FGrid); gaussian(RNG5,src);// src=src+g5*src;
+  LatticeFermion result(FGrid); 
+  LatticeGaugeField Umu(UGrid); 
+
+  FieldMetaData header;
+  std::string file("./ckpoint_lat");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  RealD mass=0.001;
+  RealD M5=1.8;
+  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis>              Subspace;
+  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>          CoarseOperator;
+  typedef CoarseOperator::CoarseVector                                 CoarseVector;
+  typedef CoarseOperator::siteVector siteVector;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
+
+  Subspace Aggregates(Coarse5d,FGrid,0);
+
+  assert ( (nbasis & 0x1)==0);
+  {
+    int nb=nbasis/2;
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,100,0.0);// 18s
+    //   rAggregates.CreateSubspaceChebyshev(RNG5,rHermDefOp,nb,60.0,0.05,500,200,150,0.0);// 15.7 23iter
+    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);//
+    // pad out the rAggregates.
+
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,150,0.0);// 19s 
+
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,200,0.0); 15.2s
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,200,0.0); 16.3s
+
+    for(int n=0;n<nb;n++){
+      G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
+    }
+    LatticeFermion A(FGrid);
+    LatticeFermion B(FGrid);
+    for(int n=0;n<nb;n++){
+      A = Aggregates.subspace[n];
+      B = Aggregates.subspace[n+nb];
+      Aggregates.subspace[n]   = A+B; // 1+G5 // eigen value of G5R5 is +1
+      Aggregates.subspace[n+nb]= A-B; // 1-G5 // eigen value of G5R5 is -1
+    }
+  }
+  
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Building coarse representation of Indef operator" <<std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>    Level1Op;
+
+  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOp(Ddwf);
+
+  Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << " Running Coarse grid Lanczos "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+
+  MdagMLinearOperator<Level1Op,CoarseVector> IRLHermOp(LDOp);
+  Chebyshev<CoarseVector> IRLCheby(0.002,12.,151);
+  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,IRLHermOp);
+  PlainHermOp<CoarseVector> IRLOp    (IRLHermOp);
+  int Nk=48;
+  int Nm=64;
+  int Nstop=48;
+  int Nconv;
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20);
+
+  std::vector<RealD>          eval(Nm);
+  std::vector<CoarseVector>   evec(Nm,Coarse5d);
+  CoarseVector c_src(Coarse5d);
+  gaussian(CRNG,c_src);
+  IRL.calc(eval,evec,c_src,Nconv);
+
+  //  ConjugateGradient<CoarseVector>  CoarseCG(0.01,1000);
+  
+  ConjugateGradient<CoarseVector>  CoarseCG(0.02,1000);// 14.7s
+  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
+  NormalEquations<CoarseVector> DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser);
+
+  c_src=1.0;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << " Fine        PowerMethod           "<< std::endl;
+  PowerMethod<LatticeFermion>       PM;   PM(HermDefOp,src);
+  std::cout<<GridLogMessage << " Coarse       PowerMethod           "<< std::endl;
+  MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
+  PowerMethod<CoarseVector>        cPM;  cPM(PosdefLdop,c_src);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Building 2 level Multigrid            "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  typedef MultiGridPreconditioner<vSpinColourVector,  vTComplex,nbasis, DomainWallFermionR,DeflatedGuesser<CoarseVector> , NormalEquations<CoarseVector> >   TwoLevelMG;
+
+  // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 72 iter 63s
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.1,60.0,20,HermIndefOp,Ddwf); // 66 iter 69s
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,20,HermIndefOp,Ddwf); // 63 iter 65  s
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(1.0,60.0,20,HermIndefOp,Ddwf); // 69, 70
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(1.0,60.0,14,HermIndefOp,Ddwf); // 77
+
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // 23 iter 15.9s
+  //  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 20, 16.9s
+  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); // 21, 15.6s
+
+  //  MirsSmoother<LatticeFermion,DomainWallFermionR> FineCGSmoother(0.05,0.01,20,HermIndefOp,Ddwf);
+  //  RedBlackSmoother<LatticeFermion,DomainWallFermionR> FineRBSmoother(0.00,0.001,100,Ddwf);
+
+  // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space
+  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+  TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,
+			    HermIndefOp,Ddwf,
+			    FineSmoother,
+			    DeflCoarseGuesser,
+			    DeflCoarseCGNE);
+  TwoLevelPrecon.Level(1);
+
+  // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid
+  PrecGeneralisedConjugateResidual<LatticeFermion> l1PGCR(1.0e-8,1000,HermIndefOp,TwoLevelPrecon,16,16);
+  l1PGCR.Level(1);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Calling 2 level Multigrid            "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  result=Zero();
+  l1PGCR(src,result);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Fine CG prec DiagMooee "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+
+
+  ConjugateGradient<LatticeFermion>          FineCG(1.0e-8,10000);
+  SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> FineDiagMooee(Ddwf); //  M_ee - Meo Moo^-1 Moe 
+  LatticeFermion f_src_e(FrbGrid); f_src_e=1.0;
+  LatticeFermion f_res_e(FrbGrid); f_res_e=Zero();
+  FineCG(FineDiagMooee,f_src_e,f_res_e);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  Grid_finalize();
+}