From 48e81cf6f8603b3552f0de6852ad6117024fa0d6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:31:03 +0100 Subject: [PATCH 01/18] Hip Pragmas --- Grid/Grid_Eigen_Dense.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 9556c03d..d9c002bc 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -34,6 +34,12 @@ #define __SYCL__REDEFINE__ #endif +/* HIP save and restore compile environment*/ +#ifdef GRID_HIP +#pragma push +#pragma push_macro("__HIP_DEVICE_COMPILE__") +#endif +#define EIGEN_NO_HIP #include #include @@ -52,6 +58,12 @@ #pragma pop #endif +/*HIP restore*/ +#ifdef __HIP__REDEFINE__ +#pragma pop_macro("__HIP_DEVICE_COMPILE__") +#pragma pop +#endif + #if defined __GNUC__ #pragma GCC diagnostic pop #endif From 288c6157826d0395a591267939b1ea4380dc865a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:31:50 +0100 Subject: [PATCH 02/18] Hip improvements --- Grid/lattice/Lattice_reduction_gpu.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 5f490507..d8a47ae1 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid); #ifdef GRID_HIP extern hipDeviceProp_t *gpu_props; +#define WARP_SIZE 64 #endif #ifdef GRID_CUDA extern cudaDeviceProp *gpu_props; +#define WARP_SIZE 32 #endif -#define WARP_SIZE 32 __device__ unsigned int retirementCount = 0; template @@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid // cannot use overloaded operators for sobj as they are not volatile-qualified memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); - __syncwarp(); + acceleratorSynchronise(); const Iterator VEC = WARP_SIZE; const Iterator vid = tid & (VEC-1); @@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid beta += temp; memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); } - __syncwarp(); + acceleratorSynchronise(); } - __syncthreads(); + acceleratorSynchroniseAll(); if (threadIdx.x == 0) { beta = Zero(); @@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid } memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); } - __syncthreads(); + acceleratorSynchroniseAll(); } From 4677c40195761b7d5cc967736110f0d5308dee19 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:32:27 +0100 Subject: [PATCH 03/18] HIP improvements --- Grid/threads/Accelerator.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 864d90a9..35e10d31 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -55,6 +55,7 @@ void acceleratorInit(void) printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); + GPU_PROP_FMT(totalGlobalMem,"%lld"); GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard); @@ -109,20 +110,24 @@ void acceleratorInit(void) if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + printf("world_rank %d has %d devices\n",world_rank,nDevices); + size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); hipGetDeviceProperties(&gpu_props[i], i); + hipDeviceProp_t prop; + prop = gpu_props[i]; + totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { - hipDeviceProp_t prop; - prop = gpu_props[i]; printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device Number : %d\n", i); printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); + GPU_PROP_FMT(totalGlobalMem,"%lld"); // GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard); GPU_PROP(warpSize); @@ -131,6 +136,7 @@ void acceleratorInit(void) // GPU_PROP(singleToDoublePrecisionPerfRatio); } } + MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours #undef GPU_PROP_FMT #undef GPU_PROP #ifdef GRID_IBM_SUMMIT From e03b64dc06803c8c0f866e01f528a7cba573d049 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:33:09 +0100 Subject: [PATCH 04/18] HIP default flaags to work on ROCM --- configure.ac | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index f0bea6a4..11261291 100644 --- a/configure.ac +++ b/configure.ac @@ -53,11 +53,11 @@ AC_TYPE_UINT64_T ############### OpenMP AC_OPENMP ac_openmp=no -if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then - ac_openmp=yes - AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" - AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" -fi +#if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then +# ac_openmp=yes +## AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" +# AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" +#fi ############### Checks for header files @@ -330,7 +330,8 @@ case ${CXXTEST} in fi ;; hipcc) - CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" +# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" + CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" CXXLD=${CXX} if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" From a5c35c4024a1289e6394e0244a53bdd7a388b06f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:33:53 +0100 Subject: [PATCH 05/18] Make HIP / Vega happy --- .../action/fermion/implementation/StaggeredKernelsHand.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h index ee53de49..6bcb22b4 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h @@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid); template -template +template accelerator_inline void StaggeredKernels::DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -221,7 +221,7 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, template -template +template accelerator_inline void StaggeredKernels::DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -300,7 +300,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, template -template +template accelerator_inline void StaggeredKernels::DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, From fd9424ef273cb7647d7a08e9b48c3dee3f3b157c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:34:32 +0100 Subject: [PATCH 06/18] innlines required to make HIP happy --- .../implementation/StaggeredKernelsImplementation.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index 141725a7..0b6f9fb0 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -78,7 +78,7 @@ StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; // Int, Ext, Int+Ext cases for comms overlap //////////////////////////////////////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -126,7 +126,7 @@ void StaggeredKernels::DhopSiteGeneric(StencilView &st, // Only contributions from interior of our node /////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -174,7 +174,7 @@ void StaggeredKernels::DhopSiteGenericInt(StencilView &st, // Only contributions from exterior of our node /////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -224,7 +224,7 @@ void StaggeredKernels::DhopSiteGenericExt(StencilView &st, //////////////////////////////////////////////////////////////////////////////////// // Driving / wrapping routine to select right kernel //////////////////////////////////////////////////////////////////////////////////// -template +template void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp) { @@ -253,7 +253,7 @@ void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ }); -template +template void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior) @@ -293,7 +293,7 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, } assert(0 && " Kernel optimisation case not covered "); } -template +template void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior) From d1c0c0197e0243d833a1aee0947265fa6577900b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:35:06 +0100 Subject: [PATCH 07/18] HipCC requires inline on definition --- .../WilsonKernelsHandGparityImplementation.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h index 2150938f..a592a798 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h @@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid); HAND_RESULT_EXT(ss,F) #define HAND_SPECIALISE_GPARITY(IMPL) \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid); nmu = 0; \ HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ From cc220abd1dda3e31b7cfe685e2b38fc792eae2ce Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:35:38 +0100 Subject: [PATCH 08/18] inline for HIP --- .../implementation/WilsonKernelsHandImplementation.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index f7b018fa..89ae5668 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -495,7 +495,7 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -template void +template accelerator_inline void WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -519,7 +519,7 @@ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site HAND_RESULT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -542,7 +542,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView HAND_RESULT(ss); } -template void +template accelerator_inline void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -566,7 +566,7 @@ WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si HAND_RESULT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -589,7 +589,7 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi HAND_RESULT(ss); } -template void +template accelerator_inline void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -614,7 +614,7 @@ WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si HAND_RESULT_EXT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { From 2859955a03b5a95571f680965d296a9308c69443 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:36:13 +0100 Subject: [PATCH 09/18] HIP requires "inline" --- .../implementation/WilsonKernelsImplementation.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index c2b62416..c5f50bbb 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -140,7 +140,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV coalescedWrite(out[sF],result,lane); }; -template +template accelerator_inline void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -169,7 +169,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView //////////////////////////////////////////////////////////////////// // Interior kernels //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -197,7 +197,7 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi coalescedWrite(out[sF], result,lane); }; -template +template accelerator_inline void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -227,7 +227,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField //////////////////////////////////////////////////////////////////// // Exterior kernels //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -258,7 +258,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi } }; -template +template accelerator_inline void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -290,7 +290,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField }; #define DhopDirMacro(Dir,spProj,spRecon) \ - template \ + template accelerator_inline \ void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ { \ @@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm); DhopDirMacro(Zm,spProjZm,spReconZm); DhopDirMacro(Tm,spProjTm,spReconTm); -template +template accelerator_inline void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { From dacbbdd0516235063a9907be788c90e90b05424d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 00:37:02 +0100 Subject: [PATCH 10/18] Hip Happy Birthday --- Grid/threads/Accelerator.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 74a3ea22..25d48c9e 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -286,17 +286,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) inline void *acceleratorAllocShared(size_t bytes) { -#if 0 void *ptr=NULL; auto err = hipMallocManaged((void **)&ptr,bytes); if( err != hipSuccess ) { ptr = (void *) NULL; - printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err)); + printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); } return ptr; -#else - return malloc(bytes); -#endif }; inline void *acceleratorAllocDevice(size_t bytes) @@ -305,7 +301,7 @@ inline void *acceleratorAllocDevice(size_t bytes) auto err = hipMalloc((void **)&ptr,bytes); if( err != hipSuccess ) { ptr = (void *) NULL; - printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err)); + printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); } return ptr; }; From 1c881ce23c2119e5567608d57161fc538d549e82 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Sep 2020 02:28:33 +0100 Subject: [PATCH 11/18] HIP does not like half2 visible members x and y so must define own Half2 --- Grid/simd/Grid_gpu_vec.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index b9c6a81b..8b17f75a 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -41,6 +41,11 @@ Author: Peter Boyle namespace Grid { +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +typedef struct { uint16_t x;} half; +#endif +typedef struct Half2_t { half x; half y; } Half2; + #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) template @@ -125,14 +130,14 @@ inline accelerator GpuVector operator/(const GpuVector l,const } constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half); -constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2); +constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2); constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float); constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2); constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double); constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2); constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer); -typedef GpuComplex GpuComplexH; +typedef GpuComplex GpuComplexH; typedef GpuComplex GpuComplexF; typedef GpuComplex GpuComplexD; @@ -147,11 +152,9 @@ typedef GpuVector GpuVectorI; accelerator_inline float half2float(half h) { float f; -#ifdef GRID_SIMT +#if defined(GRID_CUDA) || defined(GRID_HIP) f = __half2float(h); #else - //f = __half2float(h); - __half_raw hr(h); Grid_half hh; hh.x = hr.x; f= sfw_half_to_float(hh); @@ -161,13 +164,11 @@ accelerator_inline float half2float(half h) accelerator_inline half float2half(float f) { half h; -#ifdef GRID_SIMT +#if defined(GRID_CUDA) || defined(GRID_HIP) h = __float2half(f); #else Grid_half hh = sfw_float_to_half(f); - __half_raw hr; - hr.x = hh.x; - h = __half(hr); + h.x = hh.x; #endif return h; } @@ -523,7 +524,7 @@ namespace Optimization { //////////////////////////////////////////////////////////////////////////////////// // Single / Half //////////////////////////////////////////////////////////////////////////////////// - static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { + static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { int N = GpuVectorCF::N; GpuVectorCH h; for(int i=0;i Date: Wed, 16 Sep 2020 03:35:03 +0100 Subject: [PATCH 12/18] HIP runs sensible --- Grid/lattice/Lattice_arith.h | 6 ++--- Grid/threads/Accelerator.cc | 2 +- benchmarks/Benchmark_su3.cc | 42 +++++++++++++++++++++++++++++---- benchmarks/Benchmark_su3_gpu.cc | 3 ++- 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index a3ae1f28..3c269c58 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -60,9 +60,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ autoView( lhs_v , lhs, AcceleratorRead); autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); auto rhs_t=rhs_v(ss); + auto tmp =ret_v(ss); mac(&tmp,&lhs_t,&rhs_t); coalescedWrite(ret_v[ss],tmp); }); @@ -124,7 +124,7 @@ void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ autoView( ret_v , ret, AcceleratorWrite); autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; + auto tmp =ret_v(ss); auto lhs_t=lhs_v(ss); mac(&tmp,&lhs_t,&rhs); coalescedWrite(ret_v[ss],tmp); @@ -182,7 +182,7 @@ void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ autoView( ret_v , ret, AcceleratorWrite); autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; + auto tmp =ret_v(ss); auto rhs_t=rhs_v(ss); mac(&tmp,&lhs,&rhs_t); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 35e10d31..2134d158 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -127,7 +127,7 @@ void acceleratorInit(void) printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); - GPU_PROP_FMT(totalGlobalMem,"%lld"); + GPU_PROP_FMT(totalGlobalMem,"%lu"); // GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard); GPU_PROP(warpSize); diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index d24a3e25..d094da0c 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -36,12 +36,12 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); -#define LMAX (48) +#define LMAX (40) #define LMIN (8) #define LADD (8) - int64_t Nwarm=50; - int64_t Nloop=500; + int64_t Nwarm=10; + int64_t Nloop=100; Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -118,6 +118,41 @@ int main (int argc, char ** argv) } + + std::cout<({45,12,81,9})); + + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); + + for(int64_t i=0;i Date: Thu, 17 Sep 2020 20:31:46 +0100 Subject: [PATCH 13/18] HIP IPC --- Grid/communicator/SharedMemoryMPI.cc | 49 +++++++++++++++++++--------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 8b27ab7a..0cbde9eb 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -32,6 +32,9 @@ Author: Peter Boyle #ifdef GRID_CUDA #include #endif +#ifdef GRID_HIP +#include +#endif NAMESPACE_BEGIN(Grid); #define header "SharedMemoryMpi: " @@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// -#ifdef GRID_CUDA +#if defined(GRID_CUDA) ||defined(GRID_HIP) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { void * ShmCommBuf ; @@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// -#ifndef GRID_MPI3_SHM_NONE - auto err = cudaMalloc(&ShmCommBuf, bytes); -#else - auto err = cudaMallocManaged(&ShmCommBuf, bytes); -#endif - if ( err != cudaSuccess) { - std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " < Date: Thu, 17 Sep 2020 20:47:32 +0100 Subject: [PATCH 14/18] MPI asynch call removal --- Grid/communicator/Communicator_base.h | 15 --------------- Grid/communicator/Communicator_none.cc | 23 ----------------------- 2 files changed, 38 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 436d75ef..bb06d43f 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -138,21 +138,6 @@ public: int recv_from_rank, int bytes); - void SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes); - - void SendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - - void SendToRecvFromComplete(std::vector &waitall); - double StencilSendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 81900371..b4f51ca2 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){} -void CartesianCommunicator::SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes) -{ - assert(0); -} - // Basic Halo comms primitive -- should never call in single node void CartesianCommunicator::SendToRecvFrom(void *xmit, @@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ - assert(0); -} - -void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) -{ - assert(0); -} void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { bcopy(in,out,bytes*words); From b71a081cba989be0916376ac0331401ba786738c Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Mon, 21 Sep 2020 09:33:23 +0100 Subject: [PATCH 15/18] Asynchronous calls removed - reflect this in Communicator_none.cc (Opportunistic doc update - OpenMP support on Mac OS) --- Grid/communicator/Communicator_none.cc | 7 ------- documentation/GridXcode/readme.md | 6 +++--- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index b4f51ca2..6cb431a2 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -114,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int recv_from_rank, int bytes, int dir) { - std::vector list; - // Discard the "dir" - SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); - SendToRecvFromComplete(list); return 2.0*bytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, @@ -127,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) { - SendToRecvFromComplete(waitall); } void CartesianCommunicator::StencilBarrier(void){}; diff --git a/documentation/GridXcode/readme.md b/documentation/GridXcode/readme.md index 11deb6fa..b8342828 100644 --- a/documentation/GridXcode/readme.md +++ b/documentation/GridXcode/readme.md @@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging: - ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none + ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug #### 2. `Release` Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation): - ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none + ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release #### 3. `MPIDebug` Debug configuration with MPI: - ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx + ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug ### 5.3 Build Grid From 049dd257856a075b52a2b45e68ad5e0eb139ce04 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Sep 2020 04:13:50 -0400 Subject: [PATCH 16/18] Revert accidental commit thanks michael --- configure.ac | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac index 96de5b07..f13f06fc 100644 --- a/configure.ac +++ b/configure.ac @@ -53,11 +53,11 @@ AC_TYPE_UINT64_T ############### OpenMP AC_OPENMP ac_openmp=no -#if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then -# ac_openmp=yes -## AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" -# AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" -#fi +if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then + ac_openmp=yes + AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" + AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" +fi ############### Checks for header files From a3e2aeb603f297661b6d81165b9be141e023d8d5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 29 Sep 2020 06:50:10 -0700 Subject: [PATCH 17/18] dpcpp options happiness --- configure.ac | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configure.ac b/configure.ac index f13f06fc..90f66f44 100644 --- a/configure.ac +++ b/configure.ac @@ -337,6 +337,11 @@ case ${CXXTEST} in CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi ;; + dpcpp) + LDFLAGS="$LDFLAGS" + CXXFLAGS="$CXXFLAGS" + CXXLD=${CXX} + ;; *) CXXLD=${CXX} CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" From ace9cd64bb7f2e471a1728e421b6a271576cdbc9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 29 Sep 2020 08:03:46 -0700 Subject: [PATCH 18/18] dpcpp happy --- Grid/qcd/action/fermion/StaggeredKernels.h | 19 +++++++++++++------ .../CayleyFermion5DImplementation.h | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index 30deee06..d67105bb 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -63,17 +63,20 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Generic Nc kernels /////////////////////////////////////////////////////////////////////////////////////// - template accelerator_inline + template + static accelerator_inline void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, @@ -82,17 +85,20 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - template accelerator_inline + + template static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, @@ -101,6 +107,7 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Asm Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// + void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index e79b64dc..b3fbe096 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -799,7 +799,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, PropagatorField tmp(UGrid); PropagatorField Utmp(UGrid); - LatticeInteger zz (UGrid); zz=0.0; + PropagatorField zz (UGrid); zz=0.0; LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); for (int s=0;s::SeqConservedCurrent(PropagatorField &q_in, PropagatorField tmp(UGrid); PropagatorField Utmp(UGrid); - LatticeInteger zz (UGrid); zz=0.0; + PropagatorField zz (UGrid); zz=0.0; LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); for(int s=0;s