mirror of https://github.com/paboyle/Grid.git, synced 2025-04-03 18:55:56 +01:00

First compile against SYCL

This commit is contained in:
parent 04927d2e40
commit 28a1fcaaff
@@ -15,12 +15,12 @@
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("GRID_SIMT")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#undef GRID_SIMT
#define __NVCC__REDEFINE__
#endif
@@ -41,7 +41,7 @@
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("GRID_SIMT")
#pragma pop
#endif
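The two hunks above bracket a third-party include (Eigen) so that it never sees the CUDA toolchain's identifying macros, then restore them afterwards. A minimal sketch of the push/pop idiom, assuming a hypothetical header that misbehaves when __CUDACC__ is visible:

// Sketch: hide a compiler-identifying macro around an include, then restore it.
#pragma push_macro("__CUDACC__")   // save the current definition
#undef __CUDACC__                  // the header now sees a plain host compiler
#include "fussy_header.h"          // hypothetical header sensitive to __CUDACC__
#pragma pop_macro("__CUDACC__")    // restore the saved definition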
@@ -6,7 +6,7 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)
@@ -51,11 +51,8 @@ class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef GRID_NVCC
static const int Ncache=128;
#else
static const int Ncache=8;
#endif
static int victim;

typedef struct {
@@ -169,7 +166,7 @@ public:
pointer ptr = nullptr;
#endif

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
@@ -183,7 +180,13 @@ public:
}
}
assert( ptr != (_Tp *)NULL);
-#else
#endif

+#ifdef GRID_SYCL
+if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) malloc_shared(bytes,*theGridAccelerator);
+#endif

+#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
@@ -193,14 +196,6 @@ public:
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);

-//////////////////////////////////////////////////
-// First touch optimise in threaded loop
-//////////////////////////////////////////////////
-uint64_t *cp = (uint64_t *)ptr;
-thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
-cp[n]=0;
-});
#endif
return ptr;
}
@@ -216,9 +211,14 @@ public:
pointer __freeme = __p;
#endif

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
if ( __freeme ) cudaFree((void *)__freeme);
-#else
#endif
+#ifdef GRID_SYCL
+if ( __freeme ) free((void *)__freeme,*theGridAccelerator);
+#endif

+#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
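On the SYCL path the allocator and deallocator pair unified-shared-memory calls against the global queue. A standalone sketch of the same pairing, with illustrative names (not Grid's):

#include <CL/sycl.hpp>

int main(void) {
  cl::sycl::queue q;  // plays the role of *theGridAccelerator
  // Shared (USM) allocation: host and device may both dereference p.
  double *p = (double *) cl::sycl::malloc_shared(1024 * sizeof(double), q);
  // ... use p on the host or inside kernels submitted to q ...
  cl::sycl::free(p, q);  // must be freed against the same queue/context
  return 0;
}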
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <pwd.h>

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@@ -433,13 +433,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);

-#ifdef GRID_IBM_SUMMIT
-// IBM Jsrun makes cuda Device numbering screwy and not match rank
-std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
-#else
-std::cout << "setting device to WorldShmRank"<<std::endl;
-cudaSetDevice(WorldShmRank);
-#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -677,7 +670,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
cudaMemset(dest,0,bytes);
#else
bzero(dest,bytes);
@@ -685,7 +678,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else
bcopy(src,dest,bytes);
@@ -89,7 +89,7 @@ public:

// Rvalue
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
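This accessor pair is the core of the SIMT portability model: compiled for the device (GRID_SIMT), each thread reads only its own SIMD lane via coalescedRead and gets a scalar_object back; compiled for the host, the caller gets the full SIMD vector. A simplified sketch of the dispatch, with a hypothetical extractLane helper standing in for Grid's extract machinery:

#ifdef GRID_SIMT
// Device: each thread owns one lane of the vectorised site data.
template<class vobj>
accelerator_inline typename vobj::scalar_object read(const vobj &v) {
  return extractLane(SIMTlane(vobj::Nsimd()), v);  // hypothetical helper
}
#else
// Host: scalar code sees the whole SIMD vector at once.
template<class vobj>
inline const vobj & read(const vobj &v) { return v; }
#endif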
@@ -211,7 +211,7 @@ public:
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}

~Lattice() {
if ( this->_odata_size ) {
dealloc();
@@ -24,7 +24,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid_Eigen_Dense.h>

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
@@ -67,7 +67,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@@ -108,7 +108,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
// GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
@@ -174,7 +174,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
// GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h>
#endif
#ifdef __x86_64__
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
@@ -112,7 +112,6 @@ class PerformanceCounter {
private:

typedef struct {
public:
uint32_t type;
uint64_t config;
const char *name;
@@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
#pragma pop
#endif
@@ -286,7 +286,7 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
@@ -96,7 +96,7 @@ public:
int sl = St._simd_layout[direction];
Coordinate icoor;

-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
_Spinor tmp;

const int Nsimd =SiteDoubledGaugeField::Nsimd();
@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@@ -644,7 +644,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@@ -828,7 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif

-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////
@@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
// Generic implementation; move to different file?
////////////////////////////////////////////

+/*
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
uint4 * chip_pun = (uint4 *)&chip;
@@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
#endif
return;
}

+*/

#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
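The get_stencil routine being commented out here punned StencilEntry through uint4 to force a single 128-bit load on the GPU. The trick in isolation, under the assumption of a 16-byte POD (CUDA sketch):

// Sketch: copy a 16-byte POD with one vector load/store via uint4 punning.
struct Entry { uint64_t off; uint32_t a, b; };   // 16 bytes, like StencilEntry
__device__ void fetch(Entry *mem, Entry &chip) {
  static_assert(sizeof(Entry) == sizeof(uint4), "Unexpected Entry size");
  uint4 *src = (uint4 *)mem;    // force a 128-bit transaction
  uint4 *dst = (uint4 *)&chip;
  *dst = *src;
}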
@@ -358,18 +360,18 @@ void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,Site
auto out_Yp = out[5].View();
auto out_Zp = out[6].View();
auto out_Tp = out[7].View();

+auto CBp=st.CommBuf();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
int sF =sss;
-DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
-DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
-DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
-DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
-DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
-DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
-DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
-DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
+DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
+DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
+DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
+DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
+DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
+DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
+DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
+DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
});
}
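Hoisting st.CommBuf() into the local CBp before the accelerator_forNB body matters for offload: the device lambda then captures a plain pointer by value instead of invoking a member function on the host-side stencil object from device code. The same pattern in miniature, with a toy stand-in for the stencil:

// Sketch: capture plain-old-data by value before entering a device lambda.
struct ToyStencil { double *buf; double *CommBuf() { return buf; } };

void zeroCommBuf(ToyStencil &st, int N) {
  double *CBp = st.CommBuf();               // evaluated once, on the host
  auto lam = [=](int i) { CBp[i] = 0.0; };  // captures the raw pointer, not st
  for (int i = 0; i < N; i++) lam(i);       // stand-in for the offloaded loop
}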
@@ -385,13 +387,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
+auto CBp=st.CommBuf();
#define LoopBody(Dir) \
case Dir : \
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
-DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
+DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
@@ -442,19 +445,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField

if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
#endif
@@ -473,19 +476,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField

if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h"
#include "TextIO.h"
#include "XmlIO.h"
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
#include "JSON_IO.h"
#endif
@@ -142,7 +142,7 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h)
{
float f;
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
f = __half2float(h);
#else
//f = __half2float(h);
|
||||
accelerator_inline half float2half(float f)
|
||||
{
|
||||
half h;
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef GRID_SIMT
|
||||
h = __float2half(f);
|
||||
#else
|
||||
Grid_half hh = sfw_float_to_half(f);
|
||||
|
@@ -31,7 +31,7 @@ directory
#ifndef GRID_SIMD_H
#define GRID_SIMD_H

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
#include <thrust/complex.h>
#endif
@@ -65,7 +65,7 @@ typedef RealD Real;
typedef RealF Real;
#endif

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
typedef thrust::complex<RealF> ComplexF;
typedef thrust::complex<RealD> ComplexD;
typedef thrust::complex<Real> Complex;
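thrust::complex is chosen on the CUDA path because its arithmetic is marked __host__ __device__, whereas std::complex cannot be used in device code. In isolation (CUDA sketch):

#include <thrust/complex.h>
// Legal in device code: thrust::complex operators are __host__ __device__.
__device__ thrust::complex<double> mul(thrust::complex<double> a,
                                       thrust::complex<double> b) {
  return a * b;
}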
@@ -107,7 +107,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
}

struct StencilEntry {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
uint64_t _byte_offset; // 8 bytes
uint32_t _offset; // 4 bytes
#else
@@ -34,14 +34,16 @@ NAMESPACE_BEGIN(Grid);
//accelerator_inline void SIMTsynchronise(void)
accelerator_inline void synchronise(void)
{
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
+#ifdef GRID_CUDA
// __syncthreads();
__syncwarp();
+#endif
#endif
return;
}

-#ifndef __CUDA_ARCH__
+#ifndef GRID_SIMT
//////////////////////////////////////////
// Trivial mapping of vectors on host
//////////////////////////////////////////
@@ -75,7 +77,13 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
vstream(vec, extracted);
}
#else
+#ifdef GRID_CUDA
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
+#endif
+#ifdef GRID_SYCL
+//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
+accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
+#endif

//////////////////////////////////////////
// Extract and insert slices on the GPU
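SIMTlane maps a hardware thread onto a SIMD lane of the vectorised data: threadIdx.y under CUDA, and the third global-invocation index under SYCL (spelled via __spirv here because a portable query was not yet settled). A hedged sketch of how a kernel consumes it:

// Sketch: each device thread reads one lane of an Nsimd-wide vector.
template<int Nsimd>
accelerator_inline float laneRead(const float *vec /* Nsimd entries */) {
  int lane = SIMTlane(Nsimd);  // CUDA: threadIdx.y; SYCL: invocation id [2]
  return vec[lane];
}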
@@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c

// Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
{
@@ -68,16 +68,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

//////////////////////////////////////////////////////////////////////////////////
-// Accelerator primitives; fall back to threading
+// Accelerator primitives; fall back to threading if not CUDA or SYCL
//////////////////////////////////////////////////////////////////////////////////
#ifdef __NVCC__
#define GRID_NVCC
#endif

-#ifdef GRID_NVCC
+#ifdef GRID_CUDA

extern uint32_t gpu_threads;

#ifdef __CUDA_ARCH__
#define GRID_SIMT
#endif

#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
@@ -123,7 +124,47 @@
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);

-#else
#endif

+#ifdef GRID_SYCL
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define GRID_SIMT
+#endif
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
+
+extern cl::sycl::queue *theGridAccelerator;
+
+extern uint32_t gpu_threads;
+
+#define accelerator
+#define accelerator_inline strong_inline
+
+#define accelerator_forNB(iterator,num,nsimd, ... ) \
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
+    cl::sycl::range<3> local {gpu_threads,1,nsimd}; \
+    cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
+    cgh.parallel_for<class dslash>( \
+      cl::sycl::nd_range<3>(global,local), \
+      [=] (cl::sycl::nd_item<3> item) mutable { \
+        auto iterator = item.get_global_id(0); \
+        auto lane = item.get_global_id(2); \
+        { __VA_ARGS__ }; \
+      }); \
+  });
+
+#define accelerator_barrier(dummy) theGridAccelerator->wait();
+
+#define accelerator_for( iterator, num, nsimd, ... ) \
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
+  accelerator_barrier(dummy);
+
+#endif

+#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )

#define accelerator
#define accelerator_inline strong_inline
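The SYCL accelerator_forNB above launches an nd_range kernel with the outer site index on dimension 0 and the SIMD lane on dimension 2; the macro binds both iterator and lane for use inside the body. A usage sketch, assuming num is a multiple of gpu_threads so the nd_range divides evenly:

// Sketch: zero a field of num sites x nsimd lanes with the macros above.
uint64_t num = 1024, nsimd = 8;
float *data = (float *) cl::sycl::malloc_shared(num * nsimd * sizeof(float),
                                                *theGridAccelerator);
accelerator_for(ss, num, nsimd, {
  data[ss * nsimd + lane] = 0.f;   // `lane` is bound by the macro expansion
});

Note that the macro hard-codes the kernel name (class dslash), so two distinct loop bodies in one translation unit would presumably collide; that reads like a first-compile simplification.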
@@ -74,6 +74,10 @@ feenableexcept (unsigned int excepts)
#endif

+uint32_t gpu_threads=8;
+#ifdef GRID_SYCL
+cl::sycl::queue *theGridAccelerator;
+#endif

NAMESPACE_BEGIN(Grid);
@@ -194,7 +198,7 @@ void GridParseLayout(char **argv,int argc,
}
if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
std::vector<int> gputhreads(0);
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
<< " not compiled with GPU support" << std::endl;
#endif
@@ -281,12 +285,10 @@ void GridBanner(void)
printed=1;
}
}
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
-#endif
void GridGpuInit(void)
{
-#ifdef GRID_NVCC
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
@@ -335,11 +337,70 @@ void GridGpuInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
}
+#endif
+#ifdef GRID_SYCL
+void GridGpuInit(void)
+{
+  int nDevices = 1;
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theGridAccelerator = new sycl::queue (selectedDevice);
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  if ( world_rank == 0 ) {
+    GridBanner();
+  }
+  /*
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+    }
+  }
+  */
+  if ( world_rank == 0 ) {
+    printf("GpuInit: ================================================\n");
+  }
+#endif
+}
+#endif
+#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
+void GridGpuInit(void){}
+#endif

void Grid_init(int *argc,char ***argv)
{
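The SYCL flavour of GridGpuInit stands up the global queue from a gpu_selector and recovers MPI ranks from the OpenMPI/MVAPICH environment variables, mirroring the CUDA path. The queue bring-up in isolation, with an extra device-name query added here for illustration:

#include <CL/sycl.hpp>
#include <cstdio>

cl::sycl::queue *theGridAccelerator;

void GpuInitSketch(void) {
  cl::sycl::gpu_selector selector;           // throws if no GPU is present
  cl::sycl::device dev{selector};
  theGridAccelerator = new cl::sycl::queue(dev);
  std::printf("GpuInit: device is %s\n",
              dev.get_info<cl::sycl::info::device::name>().c_str());
}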
@@ -21,7 +21,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
#define LADD (8)

int64_t Nwarm=20;
-int64_t Nloop=500;
+int64_t Nloop=50;

Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
13 configure.ac
@@ -147,6 +147,19 @@ case ${ac_SUMMIT} in
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
esac

+############### SYCL
+AC_ARG_ENABLE([sycl],
+   [AC_HELP_STRING([--enable-sycl=yes|no], [enable SYCL])],
+   [ac_SYCL=${enable_sycl}], [ac_SYCL=no])
+case ${ac_SYCL} in
+  no);;
+  yes)
+    AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
+  *)
+    AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
+esac

############### Intel libraries
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
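With the new switch in place, a SYCL build would be configured along these lines (the compiler and flags are illustrative; the commit itself documents no recipe):

../configure --enable-sycl=yes CXX=dpcpp CXXFLAGS="-fsycl"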
@@ -35,7 +35,7 @@ directory

int main(int argc, char **argv) {
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
using namespace Grid;