From 28a1fcaaffa1c644d75208aa4fa435b454f99a29 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 5 May 2020 11:13:27 -0700
Subject: [PATCH] First compile against SYCL

---
 Grid/Grid_Eigen_Dense.h                            |  6 +-
 Grid/allocator/AlignedAllocator.cc                 |  2 +-
 Grid/allocator/AlignedAllocator.h                  | 32 ++++-----
 Grid/communicator/SharedMemoryMPI.cc               | 15 ++--
 Grid/lattice/Lattice_base.h                        |  4 +-
 Grid/lattice/Lattice_reduction.h                   |  8 +--
 Grid/perfmon/PerfCount.h                           |  3 +-
 Grid/pugixml/pugixml.cc                            |  2 +-
 Grid/qcd/action/fermion/Fermion.h                  |  2 +-
 Grid/qcd/action/fermion/GparityWilsonImpl.h        |  2 +-
 .../CayleyFermion5DImplementation.h                |  6 +-
 .../WilsonKernelsImplementation.h                  | 41 ++++++-----
 Grid/serialisation/Serialisation.h                 |  2 +-
 Grid/simd/Grid_gpu_vec.h                           |  4 +-
 Grid/simd/Simd.h                                   |  4 +-
 Grid/stencil/Stencil.h                             |  2 +-
 Grid/tensors/Tensor_SIMT.h                         | 12 +++-
 Grid/tensors/Tensor_exp.h                          |  2 +-
 Grid/threads/Pragmas.h                             | 53 ++++++++++++--
 Grid/util/Init.cc                                  | 71 +++++++++++++++++--
 benchmarks/Benchmark_dwf.cc                        |  2 +-
 benchmarks/Benchmark_su3_gpu.cc                    |  2 +-
 configure.ac                                       | 13 ++++
 ..._WilsonMixedRepresentationsFermionGauge.cc      |  2 +-
 24 files changed, 205 insertions(+), 87 deletions(-)

diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h
index 9d779e05..d7119358 100644
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -15,12 +15,12 @@
 #ifdef __NVCC__
 #pragma push
 #pragma diag_suppress code_is_unreachable
-#pragma push_macro("__CUDA_ARCH__")
+#pragma push_macro("GRID_SIMT")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
 #undef __NVCC__
 #undef __CUDACC__
-#undef __CUDA_ARCH__
+#undef GRID_SIMT
 #define __NVCC__REDEFINE__
 #endif
@@ -41,7 +41,7 @@
 #ifdef __NVCC__REDEFINE__
 #pragma pop_macro("__CUDACC__")
 #pragma pop_macro("__NVCC__")
-#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("GRID_SIMT")
 #pragma pop
 #endif

diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index d53c4dc2..18854c95 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,7 +6,7 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #define SMALL_LIMIT (0)
 #else
 #define SMALL_LIMIT (4096)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 8c189be8..56f937f5 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -51,11 +51,8 @@ class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC
+
   static const int Ncache=128;
-#else
-  static const int Ncache=8;
-#endif
   static int victim;
 
   typedef struct {
@@ -169,7 +166,7 @@ public:
     pointer ptr = nullptr;
 #endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
     ////////////////////////////////////
     // Unified (managed) memory
     ////////////////////////////////////
@@ -183,7 +180,13 @@ public:
       }
     }
     assert( ptr != (_Tp *)NULL);
-#else
+#endif
+
+#ifdef GRID_SYCL
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) malloc_shared(bytes,*theGridAccelerator);
+#endif
+
+#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
     //////////////////////////////////////////////////////////////////////////////////////////
     // 2MB align; could make option probably doesn't need configurability
     //////////////////////////////////////////////////////////////////////////////////////////
@@ -193,14 +196,6 @@ public:
     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
     assert( ptr != (_Tp *)NULL);
-
-    //////////////////////////////////////////////////
-    // First touch optimise in threaded loop
-    //////////////////////////////////////////////////
-    uint64_t *cp = (uint64_t *)ptr;
-    thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
-      cp[n]=0;
-    });
 #endif
     return ptr;
   }
@@ -216,9 +211,14 @@ public:
     pointer __freeme = __p;
 #endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
     if ( __freeme ) cudaFree((void *)__freeme);
-#else
+#endif
+#ifdef GRID_SYCL
+    if ( __freeme ) free((void *)__freeme,*theGridAccelerator);
+#endif
+
+#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
 #ifdef HAVE_MM_MALLOC_H
     if ( __freeme ) _mm_free((void *)__freeme);
 #else
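The allocator hunks above are the heart of the port, so a standalone sketch of the idiom may help. This sketch is ours, not part of the patch; it assumes a oneAPI-era DPC++ compiler where malloc_shared/free live in the cl::sycl namespace, and a locally created queue stands in for Grid's global theGridAccelerator:

    #include <CL/sycl.hpp>

    int main(void)
    {
      cl::sycl::queue q(cl::sycl::gpu_selector{});
      // Unified shared memory: one pointer valid on host and device,
      // mirroring the managed-memory CUDA branch in the hunk above.
      double *p = (double *) cl::sycl::malloc_shared(1024*sizeof(double), q);
      p[0] = 1.0;                                // host touches the allocation
      q.submit([&](cl::sycl::handler &cgh) {
        cgh.single_task<class touch>([=] () {    // device touches it too
          p[1] = 2.0*p[0];
        });
      });
      q.wait();
      cl::sycl::free(p, q);
      return 0;
    }

Note the asymmetry the patch introduces: CUDA frees with cudaFree, SYCL with the two-argument free(ptr, queue) overload, and the non-GPU path keeps _mm_free/free.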
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index ed465252..1d62b8b7 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <pwd.h>
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
@@ -433,13 +433,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
 
-#ifdef GRID_IBM_SUMMIT
-  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
-#else
-  std::cout << "setting device to WorldShmRank"<<std::endl;
-  cudaSetDevice(WorldShmRank);
-#endif

diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -91,7 +91,7 @@ public:
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
 #else
   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
@@ -211,7 +211,7 @@ public:
     LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
     return accessor;
   }
-  
+
   ~Lattice() {
     if ( this->_odata_size ) {
       dealloc();

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 3c5b03e5..1f06ac66 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -24,7 +24,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid_Eigen_Dense.h>
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
@@ -67,7 +67,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   return sum_gpu(arg,osites);
 #else
   return sum_cpu(arg,osites);
@@ -108,7 +108,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   // GPU - SIMT lane compliance...
   typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
   Vector<inner_t> inner_tmp(sites);
@@ -174,7 +174,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   // GPU
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
   Vector<inner_t> inner_tmp(sites);
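A note on the reduction hunks just above: on the GPU path the inner product is split into a per-site partial pass plus a separate sum, because a direct accumulation does not map onto SIMT lanes. A plain-C++ stand-in for that two-stage shape (ours, for illustration; Grid's real kernel writes the partials with accelerator_for and hands the final reduction to sum_gpu):

    #include <complex>
    #include <vector>

    std::complex<double> innerProductSketch(const std::vector<std::complex<double>> &l,
                                            const std::vector<std::complex<double>> &r)
    {
      // Stage 1: one partial per site; on the device this loop is the kernel.
      std::vector<std::complex<double>> inner_tmp(l.size());
      for (size_t ss = 0; ss < l.size(); ss++)
        inner_tmp[ss] = std::conj(l[ss]) * r[ss];
      // Stage 2: reduce the partials; Grid's sum()/sum_gpu() does this step.
      std::complex<double> nrm = 0;
      for (auto &p : inner_tmp) nrm += p;
      return nrm;
    }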
diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h
index 1e2a9528..dd25b41e 100644
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <syscall.h>
 #endif
 #ifdef __x86_64__
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 accelerator_inline uint64_t __rdtsc(void) {  return 0; }
 accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
 #else
@@ -112,7 +112,6 @@ class PerformanceCounter {
 private:
 
   typedef struct {
-  public:
     uint32_t type;
     uint64_t config;
     const char *name;

diff --git a/Grid/pugixml/pugixml.cc b/Grid/pugixml/pugixml.cc
index e7b395ad..45e6496a 100644
--- a/Grid/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@@ -12773,7 +12773,7 @@ namespace pugi
 #undef PUGI__THROW_ERROR
 #undef PUGI__CHECK_ERROR
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #pragma pop
 #endif

diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h
index fb6f18bb..af5bebcc 100644
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -286,7 +286,7 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;

diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h
index 0b147b3f..77381ff9 100644
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -96,7 +96,7 @@ public:
       int sl = St._simd_layout[direction];
       Coordinate icoor;
 
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
       _Spinor tmp;
 
       const int Nsimd =SiteDoubledGaugeField::Nsimd();

diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index c80d2425..082e4b73 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
     std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
     std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls << std::endl;
     std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
     RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
@@ -644,7 +644,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
                                                       Current curr_type,
                                                       unsigned int mu)
 {
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
   Gamma::Algebra Gmu [] = {
     Gamma::Algebra::GammaX,
     Gamma::Algebra::GammaY,
@@ -828,7 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 }
 #endif
 
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 ////////////////////////////////////////////////
 // GENERAL CAYLEY CASE
 ////////////////////////////////////////////////
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index 1fff4f5a..c164a1f2 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
 // Generic implementation; move to different file?
 ////////////////////////////////////////////
 
+/*
 accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 {
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
   uint4 * mem_pun  = (uint4 *)mem; // force 128 bit loads
   uint4 * chip_pun = (uint4 *)&chip;
@@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 #endif
   return;
 }
-
+*/
+
 #define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
   SE = st.GetEntry(ptype, Dir, sF);				\
   if (SE->_is_local) {						\
@@ -358,18 +360,18 @@ void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,Site
   auto out_Yp = out[5].View();
   auto out_Zp = out[6].View();
   auto out_Tp = out[7].View();
-
+  auto CBp=st.CommBuf();
   accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
       int sU=sss/Ls;
       int sF =sss;
-      DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
-      DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
-      DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
-      DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
-      DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
-      DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
-      DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
-      DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
+      DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
+      DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
+      DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
+      DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
+      DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
+      DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
+      DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
+      DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
     });
 }
@@ -385,13 +387,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   auto in_v  = in.View();
   auto out_v = out.View();
   auto st_v  = st.View();
+  auto CBp=st.CommBuf();
 #define LoopBody(Dir)				\
-  case Dir :					\
+  case Dir :					\
     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{	\
       for(int s=0;s<Ls;s++){			\
 	int sU=ss;				\
 	int sF = s+Ls*sU;			\
-	DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
+	DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
       }						\
     });						\
   break;
@@ -434,19 +437,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); printf(".");  return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt); printf("-");  return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); printf("+");  return;}
 #endif
@@ -473,19 +476,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);  return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);  return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);  return;}
 #endif

diff --git a/Grid/serialisation/Serialisation.h b/Grid/serialisation/Serialisation.h
index c95226b1..177a65f9 100644
--- a/Grid/serialisation/Serialisation.h
+++ b/Grid/serialisation/Serialisation.h
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 #include "JSON_IO.h"
 #endif

diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h
index 4584fb36..0bff4c2f 100644
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -142,7 +142,7 @@ typedef GpuVector<NSIMD_Integer,Integer> GpuVectorI;
 accelerator_inline float half2float(half h)
 {
   float f;
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   f = __half2float(h);
 #else
   //f = __half2float(h);
@@ -156,7 +156,7 @@ accelerator_inline half float2half(float f)
 {
   half h;
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   h = __float2half(f);
 #else
   Grid_half hh = sfw_float_to_half(f);

diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h
index bc8cd2fd..80f7c2e7 100644
--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -31,7 +31,7 @@ directory
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <thrust/complex.h>
 #endif
@@ -65,7 +65,7 @@ typedef RealD Real;
 typedef RealF  Real;
 #endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 typedef thrust::complex<RealF> ComplexF;
 typedef thrust::complex<RealD> ComplexD;
 typedef thrust::complex<Real>  Complex;

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 37b866cb..7a200ba6 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -107,7 +107,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
 }
 
 struct StencilEntry {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   uint64_t _byte_offset; // 8 bytes
   uint32_t _offset;      // 4 bytes
 #else
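The renaming running through all of these hunks deserves one explicit illustration before the Tensor_SIMT.h diff below. A sketch of the pattern (ours, not in the patch): GRID_SIMT marks the device compilation pass whatever the vendor, so tensor code branches once instead of once per compiler. SIMTlane is supplied by the Tensor_SIMT.h hunk that follows, and the GRID_SIMT definitions themselves by the later Pragmas.h hunk:

    // Under nvcc, GRID_SIMT is derived from __CUDA_ARCH__; under SYCL
    // device compilation, from __SYCL_DEVICE_ONLY__ (see Pragmas.h below).
    accelerator_inline int laneOrZero(int Nsimd)
    {
    #ifdef GRID_SIMT
      return SIMTlane(Nsimd);   // device pass: genuine SIMT lane index
    #else
      return 0;                 // host pass: scalar fallback
    #endif
    }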
diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h
index c18b8484..c92edc82 100644
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -34,14 +34,16 @@ NAMESPACE_BEGIN(Grid);
 //accelerator_inline void SIMTsynchronise(void)
 accelerator_inline void synchronise(void)
 {
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
+#ifdef GRID_CUDA
   //  __syncthreads();
   __syncwarp();
+#endif
 #endif
   return;
 }
 
-#ifndef __CUDA_ARCH__
+#ifndef GRID_SIMT
 //////////////////////////////////////////
 // Trivial mapping of vectors on host
 //////////////////////////////////////////
@@ -75,7 +77,13 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
   vstream(vec, extracted);
 }
 #else
+#ifdef GRID_CUDA
 accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
+#endif
+#ifdef GRID_SYCL
+//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
+accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
+#endif
 
 //////////////////////////////////////////
 // Extract and insert slices on the GPU

diff --git a/Grid/tensors/Tensor_exp.h b/Grid/tensors/Tensor_exp.h
index 11d37f9c..0a1d6389 100644
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@@ -55,7 +55,7 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> Exponentiate(c
 
 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
 accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )

diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h
index d05f8ee9..45eca253 100644
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -68,16 +68,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 //////////////////////////////////////////////////////////////////////////////////
-// Accelerator primitives; fall back to threading
+// Accelerator primitives; fall back to threading if not CUDA or SYCL
 //////////////////////////////////////////////////////////////////////////////////
 
-#ifdef __NVCC__
-#define GRID_NVCC
-#endif
-
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 
 extern uint32_t gpu_threads;
 
+#ifdef __CUDA_ARCH__
+#define GRID_SIMT
+#endif
+
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline
@@ -123,7 +124,47 @@ void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
   accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
   accelerator_barrier(dummy);
 
-#else
+#endif
+
+#ifdef GRID_SYCL
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define GRID_SIMT
+#endif
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
+
+extern cl::sycl::queue *theGridAccelerator;
+
+extern uint32_t gpu_threads;
+
+#define accelerator
+#define accelerator_inline strong_inline
+
+#define accelerator_forNB(iterator,num,nsimd, ... )		\
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {	\
+      cl::sycl::range<3> local {gpu_threads,1,nsimd};		\
+      cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
+      cgh.parallel_for<class dslash>(				\
+      cl::sycl::nd_range<3>(global,local),			\
+      [=] (cl::sycl::nd_item<3> item) mutable {			\
+	auto iterator = item.get_global_id(0);			\
+	auto lane = item.get_global_id(2);			\
+	{ __VA_ARGS__ };					\
+      });							\
+    });
+
+#define accelerator_barrier(dummy) theGridAccelerator->wait();
+
+#define accelerator_for( iterator, num, nsimd, ... )		\
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy);
+
+#endif
+
+#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
 
 #define accelerator
 #define accelerator_inline strong_inline
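With the Pragmas.h definitions above in place, a hypothetical end-to-end use (ours, not in the patch): allocate unified memory through the new allocator path and fill it from a SYCL kernel via accelerator_for. It assumes a GRID_SYCL build in which num is a multiple of gpu_threads, since the macro launches an nd_range whose local size is {gpu_threads,1,nsimd}:

    uint64_t N = 1024;
    double *d = (double *) malloc_shared(N*sizeof(double), *theGridAccelerator);
    accelerator_for(i, N, 1, {
      d[i] = 2.0*i;   // body becomes the parallel_for lambda; the barrier is a queue wait
    });

One design note: accelerator_forNB is asynchronous (NB = non-blocking) on both CUDA and SYCL; accelerator_for is simply accelerator_forNB followed by accelerator_barrier, which for SYCL waits on the whole queue.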
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 570f4234..442c51d3 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -74,6 +74,10 @@ feenableexcept (unsigned int excepts)
 #endif
 
 uint32_t gpu_threads=8;
+#ifdef GRID_SYCL
+cl::sycl::queue *theGridAccelerator;
+#endif
+
 
 NAMESPACE_BEGIN(Grid);
 
@@ -194,7 +198,7 @@ void GridParseLayout(char **argv,int argc,
   }
   if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
     std::vector<int> gputhreads(0);
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
               << " not compiled with GPU support" << std::endl;
 #endif
@@ -281,12 +285,10 @@ void GridBanner(void)
     printed=1;
   }
 }
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
-#endif
 void GridGpuInit(void)
 {
-#ifdef GRID_NVCC
   int nDevices = 1;
   cudaGetDeviceCount(&nDevices);
   gpu_props = new cudaDeviceProp[nDevices];
@@ -335,11 +337,70 @@ void GridGpuInit(void)
       // GPU_PROP(singleToDoublePrecisionPerfRatio);
     }
   }
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+#else
+  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
+  cudaSetDevice(rank);
+#endif
+  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
+}
+#endif
+#ifdef GRID_SYCL
+void GridGpuInit(void)
+{
+  int nDevices = 1;
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theGridAccelerator = new sycl::queue (selectedDevice);
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  if ( world_rank == 0 ) {
+    GridBanner();
+  }
+  /*
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number    : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+    }
+  }
+  */
   if ( world_rank == 0 ) {
     printf("GpuInit: ================================================\n");
   }
-#endif
 }
+#endif
+#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
+void GridGpuInit(void){}
+#endif
 
 void Grid_init(int *argc,char ***argv)
 {
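One observation on GridGpuInit: the CUDA branch now binds the device to the node-local MPI rank (the logic previously removed from SharedMemoryMPI.cc), while the SYCL branch only parses the rank and takes whatever device gpu_selector enumerates first. A reduced sketch of that binding logic (ours; the environment variable names are those in the patch, the modulo policy is our assumption):

    #include <cstdlib>

    static int localRankFromEnv(void)
    {
      const char *s = getenv("OMPI_COMM_WORLD_LOCAL_RANK");   // OpenMPI convention
      if (!s)   s = getenv("MV2_COMM_WORLD_LOCAL_RANK");      // MVAPICH2 convention
      return s ? atoi(s) : 0;
    }

    // CUDA path equivalent: cudaSetDevice(localRankFromEnv() % nDevices);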
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 4d6b026f..a3146cbc 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -21,7 +21,7 @@
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #define CUDA_PROFILE
 #endif

diff --git a/benchmarks/Benchmark_su3_gpu.cc b/benchmarks/Benchmark_su3_gpu.cc
index c87f501e..181d5894 100644
--- a/benchmarks/Benchmark_su3_gpu.cc
+++ b/benchmarks/Benchmark_su3_gpu.cc
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
 #define LADD (8)
 
   int64_t Nwarm=20;
-  int64_t Nloop=500;
+  int64_t Nloop=50;
 
   Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   Coordinate mpi_layout  = GridDefaultMpi();

diff --git a/configure.ac b/configure.ac
index 93b32002..cf5ca85b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -147,6 +147,19 @@ case ${ac_SUMMIT} in
     AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac
 
+############### SYCL
+AC_ARG_ENABLE([sycl],
+    [AC_HELP_STRING([--enable-sycl=yes|no], [enable SYCL])],
+    [ac_SYCL=${enable_sycl}], [ac_SYCL=no])
+case ${ac_SYCL} in
+    no);;
+    yes)
+    AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
+    *)
+    AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
+esac
+
+
 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],

diff --git a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc
index 6fa90f32..3b8cdda6 100644
--- a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc
@@ -35,7 +35,7 @@ directory
 int main(int argc, char **argv)
 {
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
   using namespace Grid;