Mirror of https://github.com/paboyle/Grid.git, synced 2025-04-03 18:55:56 +01:00

First compile against SYCL

Peter Boyle 2020-05-05 11:13:27 -07:00
parent 04927d2e40
commit 28a1fcaaff
24 changed files with 205 additions and 87 deletions

View File

@@ -15,12 +15,12 @@
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("GRID_SIMT")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#undef GRID_SIMT
#define __NVCC__REDEFINE__
#endif
@@ -41,7 +41,7 @@
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("GRID_SIMT")
#pragma pop
#endif
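
The hunk above saves the CUDA compiler-identification macros with push_macro, undefines them around a third-party include, and restores them afterwards. A minimal self-contained sketch of the pattern (MY_FLAG is illustrative, standing in for __CUDA_ARCH__ or GRID_SIMT):

#include <cstdio>

#define MY_FLAG 1                  // stands in for __CUDA_ARCH__ / GRID_SIMT
#pragma push_macro("MY_FLAG")
#undef MY_FLAG
// ... a header included here would not see MY_FLAG defined ...
#pragma pop_macro("MY_FLAG")       // restore the saved definition

int main() {
#ifdef MY_FLAG
  std::puts("MY_FLAG restored");   // prints: pop_macro restored the define
#endif
  return 0;
}
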

View File

@@ -6,7 +6,7 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)

View File

@@ -51,11 +51,8 @@ class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef GRID_NVCC
static const int Ncache=128;
#else
static const int Ncache=8;
#endif
static int victim;
typedef struct {
@@ -169,7 +166,7 @@ public:
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
#ifdef GRID_CUDA
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
@@ -183,7 +180,13 @@ public:
}
}
assert( ptr != (_Tp *)NULL);
#else
#endif
#ifdef GRID_SYCL
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) malloc_shared(bytes,*theGridAccelerator);
#endif
#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could be made an option, but probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
@@ -193,14 +196,6 @@ public:
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr;
}
@@ -216,9 +211,14 @@ public:
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
#ifdef GRID_CUDA
if ( __freeme ) cudaFree((void *)__freeme);
#else
#endif
#ifdef GRID_SYCL
if ( __freeme ) free((void *)__freeme,*theGridAccelerator);
#endif
#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
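
The SYCL branches added here use unified shared memory where the CUDA path uses cudaMallocManaged/cudaFree; note that a USM free must name the owning queue. A standalone sketch, assuming a 2020-era oneAPI toolchain and the pre-SYCL-2020 cl::sycl namespace this commit uses:

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue q{cl::sycl::gpu_selector{}}; // plays the role of *theGridAccelerator
  const size_t bytes = 1 << 20;
  // Shared (host- and device-visible) allocation, like cudaMallocManaged:
  double *ptr = (double *) cl::sycl::malloc_shared(bytes, q);
  // ... ptr can be touched on the host or inside kernels submitted to q ...
  cl::sycl::free(ptr, q);                      // USM free is queue/context-scoped
  return 0;
}
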

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <pwd.h>
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
#ifdef GRID_CUDA
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@@ -433,13 +433,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM jsrun makes CUDA device numbering screwy, so it does not match the rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank);
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate its own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -677,7 +670,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_NVCC
#ifdef GRID_CUDA
cudaMemset(dest,0,bytes);
#else
bzero(dest,bytes);
@@ -685,7 +678,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
#ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else
bcopy(src,dest,bytes);
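
Both helpers dispatch on the backend at compile time because the shared-memory window may be device-resident under CUDA. A simplified sketch of the same pattern (names are illustrative; the legacy bzero/bcopy calls are shown with their standard equivalents memset/memcpy):

#include <cstddef>
#include <cstring>
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif

void my_shm_zero(void *dest, size_t bytes) {
#ifdef GRID_CUDA
  cudaMemset(dest, 0, bytes);                      // buffer may be device-resident
#else
  memset(dest, 0, bytes);                          // plain host memory
#endif
}

void my_shm_copy(void *dest, const void *src, size_t bytes) {
#ifdef GRID_CUDA
  cudaMemcpy(dest, src, bytes, cudaMemcpyDefault); // direction inferred via UVA
#else
  memcpy(dest, src, bytes);
#endif
}
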

View File

@@ -89,7 +89,7 @@ public:
// Rvalue
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
@@ -211,7 +211,7 @@ public:
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
~Lattice() {
if ( this->_odata_size ) {
dealloc();

View File

@@ -24,7 +24,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
@@ -67,7 +67,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifdef GRID_NVCC
#ifdef GRID_CUDA
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@@ -108,7 +108,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
#ifdef GRID_CUDA
// GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
@@ -174,7 +174,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
#ifdef GRID_CUDA
// GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);

View File

@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h>
#endif
#ifdef __x86_64__
#ifdef GRID_NVCC
#ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
@@ -112,7 +112,6 @@ class PerformanceCounter {
private:
typedef struct {
public:
uint32_t type;
uint64_t config;
const char *name;

View File

@@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#pragma pop
#endif

View File

@@ -286,7 +286,7 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_NVCC
#ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;

View File

@@ -96,7 +96,7 @@ public:
int sl = St._simd_layout[direction];
Coordinate icoor;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd();

View File

@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_NVCC
#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@@ -644,7 +644,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
#ifndef GRID_NVCC
#ifndef GRID_CUDA
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@@ -828,7 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif
#ifndef GRID_NVCC
#ifndef GRID_CUDA
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////

View File

@@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
// Generic implementation; move to different file?
////////////////////////////////////////////
/*
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
uint4 * chip_pun = (uint4 *)&chip;
@@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
#endif
return;
}
*/
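
The get_stencil above, commented out by this commit, forces a single 128-bit load by punning the 16-byte StencilEntry through CUDA's uint4. A host-only sketch of the same trick, with hypothetical stand-in types since uint4 is CUDA-specific:

#include <cstdint>
#include <cstdio>

struct MyUint4 { uint32_t x, y, z, w; };                  // stand-in for CUDA uint4
struct MyEntry { uint64_t byte_off; uint32_t off, pad; }; // 16 bytes, like StencilEntry
static_assert(sizeof(MyEntry) == sizeof(MyUint4), "Unexpected entry size");

inline void my_get_stencil(MyEntry *mem, MyEntry &chip) {
  MyUint4 *mem_pun  = (MyUint4 *)mem;  // same deliberate type-pun as the original
  MyUint4 *chip_pun = (MyUint4 *)&chip;
  *chip_pun = *mem_pun;                // compiler emits one 16-byte vector load
}

int main() {
  MyEntry e{42, 7, 0}, local{};
  my_get_stencil(&e, local);
  std::printf("%llu %u\n", (unsigned long long)local.byte_off, local.off);
  return 0;
}
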
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
@@ -358,18 +360,18 @@ void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,Site
auto out_Yp = out[5].View();
auto out_Zp = out[6].View();
auto out_Tp = out[7].View();
auto CBp=st.CommBuf();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
int sF =sss;
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
});
}
@@ -385,13 +387,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
auto CBp=st.CommBuf();
#define LoopBody(Dir) \
case Dir : \
case Dir : \
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
@@ -442,19 +445,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
#endif
@@ -473,19 +476,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif

View File

@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h"
#include "TextIO.h"
#include "XmlIO.h"
#ifndef GRID_NVCC
#ifndef GRID_CUDA
#include "JSON_IO.h"
#endif

View File

@@ -142,7 +142,7 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h)
{
float f;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
f = __half2float(h);
#else
//f = __half2float(h);
@@ -156,7 +156,7 @@ accelerator_inline half float2half(float f)
accelerator_inline half float2half(float f)
{
half h;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
h = __float2half(f);
#else
Grid_half hh = sfw_float_to_half(f);

View File

@@ -31,7 +31,7 @@ directory
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#include <thrust/complex.h>
#endif
@@ -65,7 +65,7 @@ typedef RealD Real;
typedef RealF Real;
#endif
#ifdef GRID_NVCC
#ifdef GRID_CUDA
typedef thrust::complex<RealF> ComplexF;
typedef thrust::complex<RealD> ComplexD;
typedef thrust::complex<Real> Complex;

View File

@@ -107,7 +107,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
}
struct StencilEntry {
#ifdef GRID_NVCC
#ifdef GRID_CUDA
uint64_t _byte_offset; // 8 bytes
uint32_t _offset; // 4 bytes
#else

View File

@@ -34,14 +34,16 @@ NAMESPACE_BEGIN(Grid);
//accelerator_inline void SIMTsynchronise(void)
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
#ifdef GRID_CUDA
// __syncthreads();
__syncwarp();
#endif
#endif
return;
}
#ifndef __CUDA_ARCH__
#ifndef GRID_SIMT
//////////////////////////////////////////
// Trivial mapping of vectors on host
//////////////////////////////////////////
@@ -75,7 +77,13 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
vstream(vec, extracted);
}
#else
#ifdef GRID_CUDA
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
#endif
#ifdef GRID_SYCL
//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
#endif
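
SIMTlane maps one accelerator thread to one SIMD lane of a vectorised object: threadIdx.y on CUDA, the innermost local id on SYCL. A self-contained host-side sketch of the resulting access pattern (MyVec and my_SIMTlane are illustrative stand-ins, not Grid's types):

#include <array>
#include <cstdio>

template <int N> struct MyVec {
  std::array<float, N> lanes;                  // one value per SIMD lane
  static constexpr int Nsimd() { return N; }
};

// Host stand-in for SIMTlane(Nsimd): pretend thread `tid` owns lane `tid`.
inline int my_SIMTlane(int tid) { return tid; }

template <int N>
float my_coalescedRead(const MyVec<N> &v, int tid) {
  int lane = my_SIMTlane(tid);  // device: threadIdx.y (CUDA) or local id[2] (SYCL)
  return v.lanes[lane];         // each thread reads one scalar element
}

int main() {
  MyVec<4> v{{{0.f, 1.f, 2.f, 3.f}}};
  for (int tid = 0; tid < 4; tid++)
    std::printf("lane %d -> %g\n", tid, my_coalescedRead(v, tid));
  return 0;
}
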
//////////////////////////////////////////
// Extract and insert slices on the GPU

View File

@@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
// Specialisation: Cayley-Hamilton exponential for SU(3)
#ifndef GRID_NVCC
#ifndef GRID_CUDA
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
{

View File

@@ -68,16 +68,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading
// Accelerator primitives; fall back to threading if not CUDA or SYCL
//////////////////////////////////////////////////////////////////////////////////
#ifdef __NVCC__
#define GRID_NVCC
#endif
#ifdef GRID_NVCC
#ifdef GRID_CUDA
extern uint32_t gpu_threads;
#ifdef __CUDA_ARCH__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
@@ -123,7 +124,47 @@ void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#else
#endif
#ifdef GRID_SYCL
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
#endif
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
extern cl::sycl::queue *theGridAccelerator;
extern uint32_t gpu_threads;
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_forNB(iterator,num,nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
cl::sycl::range<3> local {gpu_threads,1,nsimd}; \
cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
cgh.parallel_for<class dslash>( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) mutable { \
auto iterator = item.get_global_id(0); \
auto lane = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
#define accelerator_barrier(dummy) theGridAccelerator->wait();
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#endif
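
The SYCL accelerator_forNB above launches a three-dimensional nd_range in which dimension 0 indexes sites and dimension 2 indexes SIMD lanes, matching the SIMTlane definition that reads local id 2. A standalone sketch of the launch the macro expands to (same oneAPI-era assumptions; num must be divisible by gpu_threads for the nd_range to be valid):

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue q{cl::sycl::gpu_selector{}};
  const unsigned long num = 64, nsimd = 8, gpu_threads = 8;
  int *out = (int *) cl::sycl::malloc_shared(num * nsimd * sizeof(int), q);
  q.submit([&](cl::sycl::handler &cgh) {
    cl::sycl::range<3> local {gpu_threads, 1, nsimd};   // work-group shape
    cl::sycl::range<3> global{num,         1, nsimd};   // whole iteration space
    cgh.parallel_for<class my_sketch>(
      cl::sycl::nd_range<3>(global, local),
      [=](cl::sycl::nd_item<3> item) {
        auto ss   = item.get_global_id(0);              // site index ("iterator")
        auto lane = item.get_global_id(2);              // SIMD lane
        out[ss * nsimd + lane] = (int)(ss + lane);
      });
  });
  q.wait();                                             // accelerator_barrier
  cl::sycl::free(out, q);
  return 0;
}
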
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
#define accelerator
#define accelerator_inline strong_inline

View File

@@ -74,6 +74,10 @@ feenableexcept (unsigned int excepts)
#endif
uint32_t gpu_threads=8;
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
#endif
NAMESPACE_BEGIN(Grid);
@@ -194,7 +198,7 @@ void GridParseLayout(char **argv,int argc,
}
if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
std::vector<int> gputhreads(0);
#ifndef GRID_NVCC
#ifndef GRID_CUDA
std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
<< " not compiled with GPU support" << std::endl;
#endif
@@ -281,12 +285,10 @@ void GridBanner(void)
printed=1;
}
}
#ifdef GRID_NVCC
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
#endif
void GridGpuInit(void)
{
#ifdef GRID_NVCC
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
@@ -335,11 +337,70 @@ void GridGpuInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#ifdef GRID_IBM_SUMMIT
// IBM jsrun makes CUDA device numbering screwy, so it does not match the rank
if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
}
#endif
#ifdef GRID_SYCL
void GridGpuInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ( world_rank == 0 ) {
GridBanner();
}
/*
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
}
}
*/
if ( world_rank == 0 ) {
printf("GpuInit: ================================================\n");
}
#endif
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
void GridGpuInit(void){}
#endif
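
The SYCL GridGpuInit mirrors the CUDA one: pick a GPU, bind the global queue, then recover the MPI rank from the launcher's environment. A trimmed, self-contained sketch of the device-selection step (assuming the same cl::sycl toolchain; theGridAccelerator mirrors the global declared earlier in this commit):

#include <CL/sycl.hpp>
#include <cstdio>

cl::sycl::queue *theGridAccelerator; // global queue, as in the commit

int main() {
  cl::sycl::gpu_selector selector;
  cl::sycl::device dev{selector};                 // pick the first available GPU
  theGridAccelerator = new cl::sycl::queue(dev);  // all kernels submit here
  std::printf("GpuInit: device %s\n",
              dev.get_info<cl::sycl::info::device::name>().c_str());
  delete theGridAccelerator;
  return 0;
}
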
void Grid_init(int *argc,char ***argv)
{

View File

@@ -21,7 +21,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif

View File

@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
#define LADD (8)
int64_t Nwarm=20;
int64_t Nloop=500;
int64_t Nloop=50;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();

View File

@@ -147,6 +147,19 @@ case ${ac_SUMMIT} in
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
esac
############### SYCL
AC_ARG_ENABLE([sycl],
[AC_HELP_STRING([--enable-sycl=yes|no], [enable SYCL])],
[ac_SYCL=${enable_sycl}], [ac_SYCL=no])
case ${ac_SYCL} in
no);;
yes)
AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
*)
AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
esac
############### Intel libraries
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],

View File

@@ -35,7 +35,7 @@ directory
int main(int argc, char **argv) {
#ifndef GRID_NVCC
#ifndef GRID_CUDA
using namespace Grid;