mirror of https://github.com/paboyle/Grid.git synced 2025-09-17 16:51:04 +01:00

Compare commits


25 Commits

Author SHA1 Message Date
1ba25a0d8c more I/O benchmark code cleaning 2020-10-07 15:38:41 +01:00
9ba3647bdf script to convert I/O benchmark logs to CSV 2020-10-07 15:35:03 +01:00
5ee832f738 I/O benchmark code cleaning 2020-10-07 15:31:51 +01:00
e9c5a271a8 fixing potential issues with log alignment and timer I/O 2020-10-06 17:58:16 +01:00
acac2d6938 standard C/C++ I/O in benchmark 2020-10-06 17:57:00 +01:00
Peter Boyle
ace9cd64bb dpcpp happy 2020-09-29 08:03:46 -07:00
Peter Boyle
a3e2aeb603 dpcpp options happiness 2020-09-29 06:50:10 -07:00
Peter Boyle
049dd25785 Revert accidental commit thanks michael 2020-09-23 04:13:50 -04:00
Peter Boyle
d43d372294 Merge pull request #311 from mmphys/bugfix/MPIasynch
Asynchronous calls removed - reflect this in Communicator_none.cc
2020-09-22 10:41:48 -04:00
Michael Marshall
b71a081cba Asynchronous calls removed - reflect this in Communicator_none.cc
(Opportunistic doc update - OpenMP support on Mac OS)
2020-09-21 09:33:23 +01:00
Peter Boyle
c48909590b MPI asynch call removal 2020-09-17 20:47:32 +01:00
Peter Boyle
446ef40570 HIP IPC 2020-09-17 20:31:46 +01:00
Peter Boyle
81441e98f4 HIP runs sensible 2020-09-16 03:35:03 +01:00
Peter Boyle
ecd3f890f5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-16 02:30:14 +01:00
Peter Boyle
1c881ce23c HIP does not like half2 visible members x and y so must define own Half2 2020-09-16 02:28:33 +01:00
Peter Boyle
dacbbdd051 Hip Happy Birthday 2020-09-16 00:37:02 +01:00
Peter Boyle
2859955a03 HIP requires "inline" 2020-09-16 00:36:13 +01:00
Peter Boyle
cc220abd1d inline for HIP 2020-09-16 00:35:38 +01:00
Peter Boyle
d1c0c0197e HipCC requires inline on definition 2020-09-16 00:35:06 +01:00
Peter Boyle
fd9424ef27 innlines required to make HIP happy 2020-09-16 00:34:32 +01:00
Peter Boyle
a5c35c4024 Make HIP / Vega happy 2020-09-16 00:33:53 +01:00
Peter Boyle
e03b64dc06 HIP default flaags to work on ROCM 2020-09-16 00:33:09 +01:00
Peter Boyle
4677c40195 HIP improvements 2020-09-16 00:32:27 +01:00
Peter Boyle
288c615782 Hip improvements 2020-09-16 00:31:50 +01:00
Peter Boyle
48e81cf6f8 Hip Pragmas 2020-09-16 00:31:03 +01:00
25 changed files with 464 additions and 168 deletions

View File

@@ -34,6 +34,12 @@
#define __SYCL__REDEFINE__
#endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -52,6 +58,12 @@
#pragma pop
#endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif
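
As an illustration (not part of the diff), a minimal sketch of the macro save/restore pattern this hunk applies around the Eigen headers: push a compiler-defined macro, hide it while a third-party header is parsed, then restore it. The header name below is a placeholder.

// sketch only: guard a third-party include against __HIP_DEVICE_COMPILE__
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#undef  __HIP_DEVICE_COMPILE__                 // hide the HIP device-compile flag
#define EIGEN_NO_HIP                           // ask Eigen to skip its HIP code paths
#include "third_party_header.h"                // placeholder for the guarded include
#pragma pop_macro("__HIP_DEVICE_COMPILE__")    // restore the original definition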

View File

@@ -138,21 +138,6 @@ public:
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,

View File

@@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
@@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};

View File

@@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
@@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_CUDA
#if defined(GRID_CUDA) ||defined(GRID_HIP)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
auto err = cudaMalloc(&ShmCommBuf, bytes);
#else
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
#endif
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
@@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
@@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
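
For reference (an illustrative sketch, not the patch itself), the HIP IPC exchange added here mirrors the CUDA branch: the rank that owns the device buffer exports a handle, broadcasts it over the intra-node communicator, and the other ranks open it. Names such as shareDeviceBuffer and shmComm are assumptions for the sketch.

#include <cassert>
#include <mpi.h>
#include <hip/hip_runtime_api.h>

// Map rank r's device allocation into every rank of a node-local communicator.
void *shareDeviceBuffer(void *devBuf, int r, int myRank, MPI_Comm shmComm)
{
  hipIpcMemHandle_t handle;
  if (myRank == r) {                                   // owner exports a handle
    auto err = hipIpcGetMemHandle(&handle, devBuf);
    assert(err == hipSuccess);
  }
  MPI_Bcast(&handle, sizeof(handle), MPI_BYTE, r, shmComm);  // share the opaque handle
  void *thisBuf = devBuf;
  if (myRank != r) {                                   // others map the owner's buffer
    auto err = hipIpcOpenMemHandle(&thisBuf, handle, hipIpcMemLazyEnablePeerAccess);
    assert(err == hipSuccess);
  }
  return thisBuf;                                      // pointer usable on this rank
}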

View File

@@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
@@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto tmp =ret_v(ss);
auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
@@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto tmp =ret_v(ss);
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
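
Not Grid code, just a scalar analogue of what these mac kernels compute (ret = ret + lhs*rhs): the temporary has to carry the existing value of ret before the product is accumulated, otherwise the previous contents are overwritten.

// illustration only: multiply-accumulate at a single site
template <class T>
void mac_site(T &ret, const T &lhs, const T &rhs)
{
  T tmp = ret;         // start from the current value (cf. ret_v(ss) in the kernel)
  tmp  += lhs * rhs;   // accumulate the product
  ret   = tmp;         // write back (coalescedWrite in the kernel)
}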

View File

@@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32
#endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0;
template <class Iterator>
@@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
acceleratorSynchronise();
const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1);
@@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
acceleratorSynchronise();
}
__syncthreads();
acceleratorSynchroniseAll();
if (threadIdx.x == 0) {
beta = Zero();
@@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
__syncthreads();
acceleratorSynchroniseAll();
}
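
An illustrative sketch (not Grid's actual accelerator.h definitions) of the kind of portability shim this reduction now relies on, so the same source builds for CUDA (warp size 32) and HIP/ROCm (wavefront size 64):

#if defined(GRID_CUDA)
  #define acceleratorSynchronise()    __syncwarp()     // intra-warp barrier
  #define acceleratorSynchroniseAll() __syncthreads()  // whole thread block
#elif defined(GRID_HIP)
  #define acceleratorSynchronise()    __syncthreads()  // avoid __syncwarp; a block barrier is a safe over-approximation
  #define acceleratorSynchroniseAll() __syncthreads()
#else
  #define acceleratorSynchronise()                     // host build: no-op
  #define acceleratorSynchroniseAll()
#endif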

View File

@@ -130,6 +130,8 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
std::ios_base::fmtflags f(stream.flags());
stream << log.background()<< std::left;
if (log.topWidth > 0)
{
@@ -152,6 +154,8 @@ public:
<< now << log.background() << " : " ;
}
stream << log.colour();
stream.flags(f);
return stream;
} else {
return devnull;
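
A minimal standalone sketch (not part of the patch) of the flag save/restore idiom added to operator<< above: remember the stream's formatting state, apply local changes, and restore it before returning.

#include <iostream>

void printPadded(std::ostream &os, int value)
{
  std::ios_base::fmtflags f(os.flags());  // save the caller's formatting flags
  os << std::left << std::hex << value;   // local formatting changes
  os.flags(f);                            // restore them before returning
}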

View File

@@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline
template<int Naik>
static accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
@@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
@@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,

View File

@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0;
PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for (int s=0;s<Ls;s++) {
@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0;
PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for(int s=0;s<Ls;s++){

View File

@@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,

View File

@@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{
@@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
template <class Impl>
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)

View File

@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
HAND_RESULT_EXT(ss,F)
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \

View File

@@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class Impl> void
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
HAND_RESULT(ss);
}
template<class Impl>
template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_RESULT(ss);
}
template<class Impl> void
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT(ss);
}
template<class Impl>
template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_RESULT(ss);
}
template<class Impl> void
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT_EXT(ss);
}
template<class Impl>
template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{

View File

@@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute
////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
coalescedWrite(out[sF],result,lane);
};
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
////////////////////////////////////////////////////////////////////
// Interior kernels
////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
coalescedWrite(out[sF], result,lane);
};
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
////////////////////////////////////////////////////////////////////
// Exterior kernels
////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
}
};
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
};
#define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> \
template <class Impl> accelerator_inline \
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \
@@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
DhopDirMacro(Zm,spProjZm,spReconZm);
DhopDirMacro(Tm,spProjTm,spReconTm);
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{

View File

@@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
typedef struct { uint16_t x;} half;
#endif
typedef struct Half2_t { half x; half y; } Half2;
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
template<class pair>
@@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
}
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
typedef GpuComplex<half2 > GpuComplexH;
typedef GpuComplex<Half2 > GpuComplexH;
typedef GpuComplex<float2 > GpuComplexF;
typedef GpuComplex<double2> GpuComplexD;
@@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h)
{
float f;
#ifdef GRID_SIMT
#if defined(GRID_CUDA) || defined(GRID_HIP)
f = __half2float(h);
#else
//f = __half2float(h);
__half_raw hr(h);
Grid_half hh;
hh.x = hr.x;
f= sfw_half_to_float(hh);
@@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
accelerator_inline half float2half(float f)
{
half h;
#ifdef GRID_SIMT
#if defined(GRID_CUDA) || defined(GRID_HIP)
h = __float2half(f);
#else
Grid_half hh = sfw_float_to_half(f);
__half_raw hr;
hr.x = hh.x;
h = __half(hr);
h.x = hh.x;
#endif
return h;
}
@@ -523,7 +524,7 @@ namespace Optimization {
////////////////////////////////////////////////////////////////////////////////////
// Single / Half
////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
int N = GpuVectorCF::N;
GpuVectorCH h;
for(int i=0;i<N;i++) {
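
For context (a textbook sketch, not Grid's sfw_half_to_float), the software fallback used when no native __half2float is available amounts to the standard IEEE binary16-to-binary32 bit manipulation:

#include <cstdint>
#include <cstring>

// Convert IEEE half-precision bits to a float; handles zero, subnormals, inf and NaN.
float half_bits_to_float(uint16_t h)
{
  uint32_t sign = (uint32_t)(h >> 15) << 31;
  uint32_t exp  = (h >> 10) & 0x1f;
  uint32_t mant = h & 0x3ff;
  uint32_t bits;
  if (exp == 0) {
    if (mant == 0) {
      bits = sign;                                          // signed zero
    } else {                                                // subnormal: renormalise
      exp = 127 - 15 + 1;
      while ((mant & 0x400) == 0) { mant <<= 1; exp--; }
      mant &= 0x3ff;
      bits = sign | (exp << 23) | (mant << 13);
    }
  } else if (exp == 0x1f) {
    bits = sign | 0x7f800000u | (mant << 13);               // inf / NaN
  } else {
    bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);  // normal number
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}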

View File

@@ -55,6 +55,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
@@ -109,20 +110,24 @@ void acceleratorInit(void)
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
printf("world_rank %d has %d devices\n",world_rank,nDevices);
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i);
hipDeviceProp_t prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device Number : %d\n", i);
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lu");
// GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
@@ -131,6 +136,7 @@ void acceleratorInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT

View File

@@ -307,17 +307,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocShared(size_t bytes)
{
#if 0
void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
#else
return malloc(bytes);
#endif
};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -327,7 +323,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};

View File

@@ -14,35 +14,62 @@ std::string filestem(const int l)
int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads();
int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
std::vector<int> latt;
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << "MPI partition " << mpi << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark Lime write" << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
auto mpi = GridDefaultMpi();
std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
std::cout << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
}
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
}
MSG << "Benchmark Lime read" << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
auto mpi = GridDefaultMpi();
std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
std::cout << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
}
#endif
Grid_finalize();
#endif
return EXIT_SUCCESS;
}

View File

@@ -14,13 +14,140 @@ using WriterFn = std::function<void(const std::string, Field &)> ;
template <typename Field>
using ReaderFn = std::function<void(Field &, const std::string)>;
// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
//
// template <typename Field>
// void stdWrite(const std::string filestem, Field &vec)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
// size_t size;
// uint32_t crc;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// size *= vec.Grid()->ProcessorCount();
// MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
// << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
//
// template <typename Field>
// void stdRead(Field &vec, const std::string filestem)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
// size_t size;
// uint32_t crcRead, crcData;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// crcWatch.Start();
// std::fread(&crcRead, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// {
// autoView(vec_v, vec, CpuWrite);
// ioWatch.Start();
// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// }
// {
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
// crcWatch.Stop();
// }
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
// << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
template <typename Field>
void stdWrite(const std::string filestem, Field &vec)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
size_t size, sizec;
uint32_t crc;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
ioWatch.Start();
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
file.flush();
ioWatch.Stop();
size *= vec.Grid()->ProcessorCount();
MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
<< ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
<< " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field>
void stdRead(Field &vec, const std::string filestem)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
size_t size, sizec;
uint32_t crcRead, crcData;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
crcWatch.Start();
file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
{
autoView(vec_v, vec, CpuWrite);
ioWatch.Start();
file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
ioWatch.Stop();
}
{
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
crcWatch.Stop();
}
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
<< ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
<< " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field>
void limeWrite(const std::string filestem, Field &vec)
{
emptyUserRecord record;
ScidacWriter binWriter(vec.Grid()->IsBoss());
binWriter.open(filestem + ".bin");
binWriter.open(filestem + ".lime.bin");
binWriter.writeScidacFieldRecord(vec, record);
binWriter.close();
}
@@ -31,7 +158,7 @@ void limeRead(Field &vec, const std::string filestem)
emptyUserRecord record;
ScidacReader binReader;
binReader.open(filestem + ".bin");
binReader.open(filestem + ".lime.bin");
binReader.readScidacFieldRecord(vec, record);
binReader.close();
}

View File

@@ -8,7 +8,6 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
std::vector<std::string> dir;
unsigned int Ls;
bool rb;
@@ -34,46 +33,71 @@ int main (int argc, char ** argv)
}
Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark double precision Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
}
MSG << "MPI partition " << mpi << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark double precision Lime read" << std::endl;
MSG << "Benchmark Grid std write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
stdWrite<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid std read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
stdRead<LatticeFermion>, Ls, rb);
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime write" << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
limeWrite<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
limeRead<LatticeFermion>, Ls, rb);
}
#endif
MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
}
// MSG << SEP << std::endl;
// MSG << "Benchmark single precision Lime write" << std::endl;
// MSG << SEP << std::endl;
// for (auto &d: dir)
// {
// MSG << "-- Directory " << d << std::endl;
// writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
// }
// MSG << SEP << std::endl;
// MSG << "Benchmark single precision Lime read" << std::endl;
// MSG << SEP << std::endl;
// for (auto &d: dir)
// {
// MSG << "-- Directory " << d << std::endl;
// readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
// }
Grid_finalize();
#endif
return EXIT_SUCCESS;
}

View File

@@ -36,12 +36,12 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#define LMAX (48)
#define LMAX (40)
#define LMIN (8)
#define LADD (8)
int64_t Nwarm=50;
int64_t Nloop=500;
int64_t Nwarm=10;
int64_t Nloop=100;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@@ -118,6 +118,41 @@ int main (int argc, char ** argv)
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z=z+ x*y"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
for(int64_t i=0;i<Nwarm;i++){
z=z+x*y;
}
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
z=z+x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=4*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -143,7 +178,6 @@ int main (int argc, char ** argv)
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
mult(z,x,y);
// mac(z,x,y);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;

View File

@@ -187,7 +187,8 @@ int main (int argc, char ** argv)
auto xx = coalescedRead(x_v[ss]);
auto yy = coalescedRead(y_v[ss]);
auto zz = coalescedRead(z_v[ss]);
zz = zz+xx*yy;
//zz = zz+xx*yy;
mac(&zz,&xx,&yy);
coalescedWrite(z_v[ss],zz);
});
}

benchmarks/benchmark-io-csv.sh (new executable file, 76 lines)
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env bash
awkscript='
BEGIN{
i = 0;
print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)"
}
/Benchmark std write/{
i = 0;
mode = "stdWrite";
}
/Benchmark std read/{
i = 0;
mode = "stdRead"
}
/Benchmark Grid C-Lime write/{
i = 0;
mode = "gridWrite";
}
/Benchmark Grid C-Lime read/{
i = 0;
mode = "gridRead";
}
/Local volume/{
match($0, "[0-9]+\\^4");
l[i] = substr($0, RSTART, RLENGTH-2);
}
/MB\/s/{
match($0, "[0-9.eE]+ MB/s");
p = substr($0, RSTART, RLENGTH-5);
if (mode == "stdWrite")
{
sw[i] = p;
}
else if (mode == "stdRead")
{
sr[i] = p;
}
else if (mode == "gridWrite")
{
gw[i] = p;
}
else if (mode == "gridRead")
{
gr[i] = p;
}
i++;
}
END{
s = 0
for (a in l)
{
s++;
}
for (j = 0; j < s; j++)
{
printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]);
}
printf("\n");
}
'
if (( $# != 1 )); then
echo "usage: `basename $0` <log file>" 1>&2
exit 1
fi
LOG=$1
awk "${awkscript}" ${LOG}
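
Usage note (the log file name is just an example): the script expects a single Benchmark_IO log file as its argument and writes CSV to standard output, so a typical invocation is ./benchmarks/benchmark-io-csv.sh benchmark-io.log > benchmark-io.csv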

View File

@@ -330,12 +330,18 @@ case ${CXXTEST} in
fi
;;
hipcc)
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
CXXLD=${CXX}
if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
fi
;;
dpcpp)
LDFLAGS="$LDFLAGS"
CXXFLAGS="$CXXFLAGS"
CXXLD=${CXX}
;;
*)
CXXLD=${CXX}
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"

View File

@@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
#### 2. `Release`
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
#### 3. `MPIDebug`
Debug configuration with MPI:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
### 5.3 Build Grid