From 48e81cf6f8603b3552f0de6852ad6117024fa0d6 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:31:03 +0100
Subject: [PATCH 01/18] Hip Pragmas

---
 Grid/Grid_Eigen_Dense.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h
index 9556c03d..d9c002bc 100644
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,6 +34,12 @@
 #define __SYCL__REDEFINE__
 #endif
 
+/* HIP save and restore compile environment*/
+#ifdef GRID_HIP
+#pragma push
+#pragma push_macro("__HIP_DEVICE_COMPILE__")
+#endif
+#define EIGEN_NO_HIP
 
 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -52,6 +58,12 @@
 #pragma pop
 #endif
 
+/*HIP restore*/
+#ifdef __HIP__REDEFINE__
+#pragma pop_macro("__HIP_DEVICE_COMPILE__")
+#pragma pop
+#endif
+
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif

From 288c6157826d0395a591267939b1ea4380dc865a Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:31:50 +0100
Subject: [PATCH 02/18] Hip improvements

---
 Grid/lattice/Lattice_reduction_gpu.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 5f490507..d8a47ae1 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
 
 #ifdef GRID_HIP
 extern hipDeviceProp_t *gpu_props;
+#define WARP_SIZE 64
 #endif
 #ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
+#define WARP_SIZE 32
 #endif
 
-#define WARP_SIZE 32
 __device__ unsigned int retirementCount = 0;
 
 template <class Iterator>
@@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
   
   // cannot use overloaded operators for sobj as they are not volatile-qualified
   memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
-  __syncwarp();
+  acceleratorSynchronise();
   
   const Iterator VEC = WARP_SIZE;
   const Iterator vid = tid & (VEC-1);
@@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
       beta += temp;
       memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
     }
-    __syncwarp();
+    acceleratorSynchronise();
   }
-  __syncthreads();
+  acceleratorSynchroniseAll();
   
   if (threadIdx.x == 0) {
     beta  = Zero();
@@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
     }
     memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
   }
-  __syncthreads();
+  acceleratorSynchroniseAll();
 }
 
 

From 4677c40195761b7d5cc967736110f0d5308dee19 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:32:27 +0100
Subject: [PATCH 03/18] HIP improvements

---
 Grid/threads/Accelerator.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 864d90a9..35e10d31 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -55,6 +55,7 @@ void acceleratorInit(void)
 	printf("AcceleratorCudaInit[%d]: ========================\n",rank);
 	printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
 
+
 	GPU_PROP_FMT(totalGlobalMem,"%lld");
 	GPU_PROP(managedMemory);
 	GPU_PROP(isMultiGpuBoard);
@@ -109,20 +110,24 @@ void acceleratorInit(void)
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
 
+  printf("world_rank %d has %d devices\n",world_rank,nDevices);
+  size_t totalDeviceMem=0;
   for (int i = 0; i < nDevices; i++) {
 
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
     
     hipGetDeviceProperties(&gpu_props[i], i);
+    hipDeviceProp_t prop; 
+    prop = gpu_props[i];
+    totalDeviceMem = prop.totalGlobalMem;
     if ( world_rank == 0) {
-      hipDeviceProp_t prop; 
-      prop = gpu_props[i];
       printf("AcceleratorHipInit: ========================\n");
       printf("AcceleratorHipInit: Device Number    : %d\n", i);
       printf("AcceleratorHipInit: ========================\n");
       printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
 
+      GPU_PROP_FMT(totalGlobalMem,"%lld");
       //      GPU_PROP(managedMemory);
       GPU_PROP(isMultiGpuBoard);
       GPU_PROP(warpSize);
@@ -131,6 +136,7 @@ void acceleratorInit(void)
       //      GPU_PROP(singleToDoublePrecisionPerfRatio);
     }
   }
+  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_IBM_SUMMIT

From e03b64dc06803c8c0f866e01f528a7cba573d049 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:33:09 +0100
Subject: [PATCH 04/18] HIP default flaags to work on ROCM

---
 configure.ac | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index f0bea6a4..11261291 100644
--- a/configure.ac
+++ b/configure.ac
@@ -53,11 +53,11 @@ AC_TYPE_UINT64_T
 ############### OpenMP
 AC_OPENMP
 ac_openmp=no
-if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then
-  ac_openmp=yes
-  AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
-  AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
-fi
+#if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then
+#  ac_openmp=yes
+##  AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
+#  AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
+#fi
 
 
 ############### Checks for header files
@@ -330,7 +330,8 @@ case ${CXXTEST} in
     fi
     ;;
   hipcc)
-    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
+#    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
+    CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
     CXXLD=${CXX}
     if test $ac_openmp = yes; then
        CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"

From a5c35c4024a1289e6394e0244a53bdd7a388b06f Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:33:53 +0100
Subject: [PATCH 05/18] Make HIP / Vega happy

---
 .../action/fermion/implementation/StaggeredKernelsHand.h    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
index ee53de49..6bcb22b4 100644
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
 
 
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int sF, int sU, 
@@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 
 
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
 
 
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 

From fd9424ef273cb7647d7a08e9b48c3dee3f3b157c Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:34:32 +0100
Subject: [PATCH 06/18] innlines required to make HIP happy

---
 .../implementation/StaggeredKernelsImplementation.h  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
index 141725a7..0b6f9fb0 100644
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
   // Only contributions from interior of our node
   ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
@@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
   // Only contributions from exterior of our node
   ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik>
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
@@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> 
 void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 {
@@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
       ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
   });
 
-template <class Impl>
+template <class Impl> 
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
-template <class Impl>
+template <class Impl> 
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)

From d1c0c0197e0243d833a1aee0947265fa6577900b Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:35:06 +0100
Subject: [PATCH 07/18] HipCC requires inline on definition

---
 .../WilsonKernelsHandGparityImplementation.h         | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
index 2150938f..a592a798 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
   HAND_RESULT_EXT(ss,F)
 
 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void						\
+  template<> accelerator_inline void						\
   WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   {									\
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
     HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
   WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   {									\
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
     HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
   WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   {									\
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
     HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
   WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   {									\
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
     HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
 									\
-  template<> void							\
+  template<> accelerator_inline void							\
   WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   {									\
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
     nmu = 0;								\
     HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
-  template<> void						\
+  template<> accelerator_inline void						\
   WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   {									\

From cc220abd1dda3e31b7cfe685e2b38fc792eae2ce Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:35:38 +0100
Subject: [PATCH 08/18] inline for HIP

---
 .../implementation/WilsonKernelsHandImplementation.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
index f7b018fa..89ae5668 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
-template<class Impl> void 
+template<class Impl> accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
   HAND_RESULT(ss);
 }
 
-template<class Impl>
+template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
   HAND_RESULT(ss);
 }
 
-template<class Impl> void 
+template<class Impl>  accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
   HAND_RESULT(ss);
 }
 
-template<class Impl>
+template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
   HAND_RESULT(ss);
 }
 
-template<class Impl> void 
+template<class Impl>  accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
   HAND_RESULT_EXT(ss);
 }
 
-template<class Impl>
+template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {

From 2859955a03b5a95571f680965d296a9308c69443 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:36:13 +0100
Subject: [PATCH 09/18] HIP requires "inline"

---
 .../implementation/WilsonKernelsImplementation.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index c2b62416..c5f50bbb 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
   ////////////////////////////////////////////////////////////////////
   // All legs kernels ; comms then compute
   ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
   coalescedWrite(out[sF],result,lane);
 };
 
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
 					  SiteHalfSpinor *buf, int sF,
 					  int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
   ////////////////////////////////////////////////////////////////////
   // Interior kernels
   ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
   coalescedWrite(out[sF], result,lane);
 };
 
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
 ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
   }
 };
 
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
 };
 
 #define DhopDirMacro(Dir,spProj,spRecon)	\
-  template <class Impl>							\
+  template <class Impl> accelerator_inline				\
   void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
 					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
   {									\
@@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
 DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);
 
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
 {

From dacbbdd0516235063a9907be788c90e90b05424d Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 00:37:02 +0100
Subject: [PATCH 10/18] Hip Happy Birthday

---
 Grid/threads/Accelerator.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 74a3ea22..25d48c9e 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -286,17 +286,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 
 inline void *acceleratorAllocShared(size_t bytes)
 {
-#if 0
   void *ptr=NULL;
   auto err = hipMallocManaged((void **)&ptr,bytes);
   if( err != hipSuccess ) {
     ptr = (void *) NULL;
-    printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
+    printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
   }
   return ptr;
-#else
-  return malloc(bytes);
-#endif
 };
 
 inline void *acceleratorAllocDevice(size_t bytes)
@@ -305,7 +301,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
   auto err = hipMalloc((void **)&ptr,bytes);
   if( err != hipSuccess ) {
     ptr = (void *) NULL;
-    printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
+    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
   }
   return ptr;
 };

From 1c881ce23c2119e5567608d57161fc538d549e82 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 02:28:33 +0100
Subject: [PATCH 11/18] HIP does not like half2 visible members x and y so must
 define own Half2

---
 Grid/simd/Grid_gpu_vec.h | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h
index b9c6a81b..8b17f75a 100644
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid {
 
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+typedef struct { uint16_t x;} half;
+#endif
+typedef struct Half2_t { half x; half y; } Half2;
+
 #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
 
 template<class pair>
@@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
 }
 
 constexpr int NSIMD_RealH    = COALESCE_GRANULARITY / sizeof(half);
-constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
+constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
 constexpr int NSIMD_RealF    = COALESCE_GRANULARITY / sizeof(float);
 constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
 constexpr int NSIMD_RealD    = COALESCE_GRANULARITY / sizeof(double);
 constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
 constexpr int NSIMD_Integer  = COALESCE_GRANULARITY / sizeof(Integer);
 
-typedef GpuComplex<half2  > GpuComplexH;
+typedef GpuComplex<Half2  > GpuComplexH;
 typedef GpuComplex<float2 > GpuComplexF;
 typedef GpuComplex<double2> GpuComplexD;
 
@@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;
 accelerator_inline float half2float(half h)
 {
   float f;
-#ifdef GRID_SIMT
+#if defined(GRID_CUDA) || defined(GRID_HIP)
   f = __half2float(h);
 #else 
-  //f = __half2float(h);
-  __half_raw hr(h);
   Grid_half hh; 
   hh.x = hr.x;
   f=  sfw_half_to_float(hh);
@@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
 accelerator_inline half float2half(float f)
 {
   half h;
-#ifdef GRID_SIMT
+#if defined(GRID_CUDA) || defined(GRID_HIP)
   h = __float2half(f);
 #else
   Grid_half hh = sfw_float_to_half(f);
-  __half_raw hr;  
-  hr.x = hh.x;
-  h = __half(hr);
+  h.x = hh.x;
 #endif
   return h;
 }
@@ -523,7 +524,7 @@ namespace Optimization {
     ////////////////////////////////////////////////////////////////////////////////////
     // Single / Half
     ////////////////////////////////////////////////////////////////////////////////////
-    static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
+     static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
       int N = GpuVectorCF::N;
       GpuVectorCH h;
       for(int i=0;i<N;i++) {

From 81441e98f4feded0c3b09735713b2cb49ae18841 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Wed, 16 Sep 2020 03:35:03 +0100
Subject: [PATCH 12/18] HIP runs sensible

---
 Grid/lattice/Lattice_arith.h    |  6 ++---
 Grid/threads/Accelerator.cc     |  2 +-
 benchmarks/Benchmark_su3.cc     | 42 +++++++++++++++++++++++++++++----
 benchmarks/Benchmark_su3_gpu.cc |  3 ++-
 4 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h
index a3ae1f28..3c269c58 100644
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   autoView( lhs_v , lhs, AcceleratorRead);
   autoView( rhs_v , rhs, AcceleratorRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
     auto rhs_t=rhs_v(ss);
+    auto tmp  =ret_v(ss);
     mac(&tmp,&lhs_t,&rhs_t);
     coalescedWrite(ret_v[ss],tmp);
   });
@@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   autoView( ret_v , ret, AcceleratorWrite);
   autoView( lhs_v , lhs, AcceleratorRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
+    auto tmp  =ret_v(ss);
     auto lhs_t=lhs_v(ss);
     mac(&tmp,&lhs_t,&rhs);
     coalescedWrite(ret_v[ss],tmp);
@@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   autoView( ret_v , ret, AcceleratorWrite);
   autoView( rhs_v , lhs, AcceleratorRead);
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
+    auto tmp  =ret_v(ss);
     auto rhs_t=rhs_v(ss);
     mac(&tmp,&lhs,&rhs_t);
     coalescedWrite(ret_v[ss],tmp);
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 35e10d31..2134d158 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -127,7 +127,7 @@ void acceleratorInit(void)
       printf("AcceleratorHipInit: ========================\n");
       printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
 
-      GPU_PROP_FMT(totalGlobalMem,"%lld");
+      GPU_PROP_FMT(totalGlobalMem,"%lu");
       //      GPU_PROP(managedMemory);
       GPU_PROP(isMultiGpuBoard);
       GPU_PROP(warpSize);
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index d24a3e25..d094da0c 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -36,12 +36,12 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-#define LMAX (48)
+#define LMAX (40)
 #define LMIN (8)
 #define LADD (8)
 
-  int64_t Nwarm=50;
-  int64_t Nloop=500;
+  int64_t Nwarm=10;
+  int64_t Nloop=100;
 
   Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   Coordinate mpi_layout  = GridDefaultMpi();
@@ -118,6 +118,41 @@ int main (int argc, char ** argv)
 
     }
 
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z=z+ x*y"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
+
+      for(int64_t i=0;i<Nwarm;i++){
+	z=z+x*y;
+      }
+      double start=usecond();
+      for(int64_t i=0;i<Nloop;i++){
+	z=z+x*y;
+      }
+      double stop=usecond();
+      double time = (stop-start)/Nloop*1000.0;
+      
+      double bytes=4*vol*Nc*Nc*sizeof(Complex);
+      double flops=Nc*Nc*(6+8+8)*vol;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+
+    }
+
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -143,7 +178,6 @@ int main (int argc, char ** argv)
       double start=usecond();
       for(int64_t i=0;i<Nloop;i++){
 	mult(z,x,y);
-	//	mac(z,x,y);
       }
       double stop=usecond();
       double time = (stop-start)/Nloop*1000.0;
diff --git a/benchmarks/Benchmark_su3_gpu.cc b/benchmarks/Benchmark_su3_gpu.cc
index 93592ca2..f7412054 100644
--- a/benchmarks/Benchmark_su3_gpu.cc
+++ b/benchmarks/Benchmark_su3_gpu.cc
@@ -187,7 +187,8 @@ int main (int argc, char ** argv)
 	  auto xx = coalescedRead(x_v[ss]);
 	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = coalescedRead(z_v[ss]);
-	  zz = zz+xx*yy;
+	  //zz = zz+xx*yy;
+	  mac(&zz,&xx,&yy);
 	  coalescedWrite(z_v[ss],zz);
         });
       }

From 446ef4057041470f81b097116886011af1806169 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Thu, 17 Sep 2020 20:31:46 +0100
Subject: [PATCH 13/18]  HIP IPC

---
 Grid/communicator/SharedMemoryMPI.cc | 49 +++++++++++++++++++---------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 8b27ab7a..0cbde9eb 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
+#ifdef GRID_HIP
+#include <hip/hip_runtime_api.h>
+#endif
 
 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryMpi: "
@@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_CUDA
+#if defined(GRID_CUDA) ||defined(GRID_HIP)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ; 
@@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifndef GRID_MPI3_SHM_NONE
-  auto err =  cudaMalloc(&ShmCommBuf, bytes);
-#else
-  auto err =  cudaMallocManaged(&ShmCommBuf, bytes);
-#endif
-  if ( err !=  cudaSuccess) {
-    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
-    exit(EXIT_FAILURE);  
-  }
+  ShmCommBuf = acceleratorAllocDevice(bytes);
+
   if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);  
   }
   if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes 
+	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
 
@@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     //////////////////////////////////////////////////
     // If it is me, pass around the IPC access key
     //////////////////////////////////////////////////
+#ifdef GRID_CUDA
     cudaIpcMemHandle_t handle;
-    
     if ( r==WorldShmRank ) { 
-      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
       if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
       }
     }
+#endif
+#ifdef GRID_HIP
+    hipIpcMemHandle_t handle;    
+    if ( r==WorldShmRank ) { 
+      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
+      if ( err !=  hipSuccess) {
+	std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+#endif
     //////////////////////////////////////////////////
     // Share this IPC handle across the Shm Comm
     //////////////////////////////////////////////////
@@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     // If I am not the source, overwrite thisBuf with remote buffer
     ///////////////////////////////////////////////////////////////
     void * thisBuf = ShmCommBuf;
+#ifdef GRID_CUDA
     if ( r!=WorldShmRank ) { 
-      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
       if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
       }
     }
+#endif
+#ifdef GRID_HIP
+    if ( r!=WorldShmRank ) { 
+      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
+      if ( err !=  hipSuccess) {
+	std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+#endif
     ///////////////////////////////////////////////////////////////
     // Save a copy of the device buffers
     ///////////////////////////////////////////////////////////////

From c48909590bff534fc20d6957678515f70f411d90 Mon Sep 17 00:00:00 2001
From: Peter Boyle <Peter Boyle>
Date: Thu, 17 Sep 2020 20:47:32 +0100
Subject: [PATCH 14/18] MPI asynch call removal

---
 Grid/communicator/Communicator_base.h  | 15 ---------------
 Grid/communicator/Communicator_none.cc | 23 -----------------------
 2 files changed, 38 deletions(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 436d75ef..bb06d43f 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -138,21 +138,6 @@ public:
 		      int recv_from_rank,
 		      int bytes);
   
-  void SendRecvPacket(void *xmit,
-		      void *recv,
-		      int xmit_to_rank,
-		      int recv_from_rank,
-		      int bytes);
-  
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-			   void *xmit,
-			   int xmit_to_rank,
-			   void *recv,
-			   int recv_from_rank,
-			   int bytes);
-  
-  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-
   double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 81900371..b4f51ca2 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
 
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int xmit_to_rank,
-					   int recv_from_rank,
-					   int bytes)
-{
-  assert(0);
-}
-
 
 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0);
-}
-
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-{
-  assert(0);
-}
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
   bcopy(in,out,bytes*words);

From b71a081cba989be0916376ac0331401ba786738c Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Mon, 21 Sep 2020 09:33:23 +0100
Subject: [PATCH 15/18] Asynchronous calls removed - reflect this in
 Communicator_none.cc (Opportunistic doc update - OpenMP support on Mac OS)

---
 Grid/communicator/Communicator_none.cc | 7 -------
 documentation/GridXcode/readme.md      | 6 +++---
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index b4f51ca2..6cb431a2 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -114,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int recv_from_rank,
 						     int bytes, int dir)
 {
-  std::vector<CommsRequest_t> list;
-  // Discard the "dir"
-  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
-  SendToRecvFromComplete(list);
   return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -127,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
-  // Discard the "dir"
-  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
-  SendToRecvFromComplete(waitall);
 }
 
 void CartesianCommunicator::StencilBarrier(void){};
diff --git a/documentation/GridXcode/readme.md b/documentation/GridXcode/readme.md
index 11deb6fa..b8342828 100644
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
 
 This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
 
-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
 
 #### 2. `Release`
 
 Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
 
-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
 
 #### 3. `MPIDebug`
 
 Debug configuration with MPI:
 
-    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
 
 ### 5.3 Build Grid
 

From 049dd257856a075b52a2b45e68ad5e0eb139ce04 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 23 Sep 2020 04:13:50 -0400
Subject: [PATCH 16/18] Revert accidental commit thanks michael

---
 configure.ac | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/configure.ac b/configure.ac
index 96de5b07..f13f06fc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -53,11 +53,11 @@ AC_TYPE_UINT64_T
 ############### OpenMP
 AC_OPENMP
 ac_openmp=no
-#if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then
-#  ac_openmp=yes
-##  AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
-#  AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
-#fi
+if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then
+  ac_openmp=yes
+  AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
+  AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
+fi
 
 
 ############### Checks for header files

From a3e2aeb603f297661b6d81165b9be141e023d8d5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 29 Sep 2020 06:50:10 -0700
Subject: [PATCH 17/18] dpcpp options happiness

---
 configure.ac | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/configure.ac b/configure.ac
index f13f06fc..90f66f44 100644
--- a/configure.ac
+++ b/configure.ac
@@ -337,6 +337,11 @@ case ${CXXTEST} in
        CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
     fi
     ;;
+  dpcpp)
+    LDFLAGS="$LDFLAGS"
+    CXXFLAGS="$CXXFLAGS"
+    CXXLD=${CXX}
+    ;;
   *)
     CXXLD=${CXX}
     CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"

From ace9cd64bb7f2e471a1728e421b6a271576cdbc9 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 29 Sep 2020 08:03:46 -0700
Subject: [PATCH 18/18] dpcpp happy

---
 Grid/qcd/action/fermion/StaggeredKernels.h    | 19 +++++++++++++------
 .../CayleyFermion5DImplementation.h           |  4 ++--
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h
index 30deee06..d67105bb 100644
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
    ///////////////////////////////////////////////////////////////////////////////////////
    // Generic Nc kernels
    ///////////////////////////////////////////////////////////////////////////////////////
-   template<int Naik> accelerator_inline
+   template<int Naik> 
+   static accelerator_inline
    void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
    void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
    void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
@@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
    ///////////////////////////////////////////////////////////////////////////////////////
    // Nc=3 specific kernels
    ///////////////////////////////////////////////////////////////////////////////////////
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
    void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
    void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   template<int Naik> accelerator_inline
+   
+   template<int Naik> static accelerator_inline
    void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
@@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
    ///////////////////////////////////////////////////////////////////////////////////////
    // Asm Nc=3 specific kernels
    ///////////////////////////////////////////////////////////////////////////////////////
+   
    void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index e79b64dc..b3fbe096 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 
   PropagatorField tmp(UGrid);
   PropagatorField Utmp(UGrid);
-  LatticeInteger zz (UGrid);   zz=0.0;
+  PropagatorField zz (UGrid);   zz=0.0;
   LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
   for (int s=0;s<Ls;s++) {
 
@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
   PropagatorField tmp(UGrid);
   PropagatorField Utmp(UGrid);
 
-  LatticeInteger zz (UGrid);   zz=0.0;
+  PropagatorField  zz (UGrid);   zz=0.0;
   LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
 
   for(int s=0;s<Ls;s++){