Mirror of https://github.com/paboyle/Grid.git, synced 2025-04-04 19:25:56 +01:00

Commit 28a1fcaaff (parent 04927d2e40)
First compile against SYCL
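In outline, the hunks below rename the build-wide guard GRID_NVCC to GRID_CUDA, retire direct tests on __CUDA_ARCH__ in favour of a new device-pass macro GRID_SIMT, and add a first GRID_SYCL offload path: unified-shared-memory allocation, a SYCL accelerator_for, queue setup in GridGpuInit, and a --enable-sycl configure switch. A condensed, illustrative sketch of the macro scheme, distilled from the threading hunks below rather than quoted from any one file:

// Illustrative condensation of the platform macros after this commit.
#ifdef GRID_CUDA              // defined for the whole nvcc build (host + device)
#ifdef __CUDA_ARCH__          // true only in the nvcc device pass
#define GRID_SIMT
#endif
#endif
#ifdef GRID_SYCL              // defined when configured with --enable-sycl
#ifdef __SYCL_DEVICE_ONLY__   // true only in the SYCL device pass
#define GRID_SIMT
#endif
#endif
// Device-only code now tests GRID_SIMT instead of __CUDA_ARCH__.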
@@ -15,12 +15,12 @@
 #ifdef __NVCC__
 #pragma push
 #pragma diag_suppress code_is_unreachable
-#pragma push_macro("__CUDA_ARCH__")
+#pragma push_macro("GRID_SIMT")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
 #undef __NVCC__
 #undef __CUDACC__
-#undef __CUDA_ARCH__
+#undef GRID_SIMT
 #define __NVCC__REDEFINE__
 #endif
 
@@ -41,7 +41,7 @@
 #ifdef __NVCC__REDEFINE__
 #pragma pop_macro("__CUDACC__")
 #pragma pop_macro("__NVCC__")
-#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("GRID_SIMT")
 #pragma pop
 #endif
 
@@ -6,7 +6,7 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool MemoryProfiler::debug = false;
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #define SMALL_LIMIT (0)
 #else
 #define SMALL_LIMIT (4096)
 
@@ -51,11 +51,8 @@ class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC
   static const int Ncache=128;
-#else
-  static const int Ncache=8;
-#endif
   static int victim;
 
   typedef struct {
 
@@ -169,7 +166,7 @@ public:
   pointer ptr = nullptr;
 #endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   ////////////////////////////////////
   // Unified (managed) memory
   ////////////////////////////////////
@@ -183,7 +180,13 @@ public:
     }
   }
   assert( ptr != (_Tp *)NULL);
-#else
+#endif
+
+#ifdef GRID_SYCL
+  if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) malloc_shared(bytes,*theGridAccelerator);
+#endif
+
+#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
   //////////////////////////////////////////////////////////////////////////////////////////
   // 2MB align; could make option probably doesn't need configurability
   //////////////////////////////////////////////////////////////////////////////////////////
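For reference, a minimal standalone sketch of the SYCL unified-shared-memory pattern the new branch relies on; Grid routes this through its global theGridAccelerator queue, so the local queue and sizes here are placeholders, assuming the oneAPI-era USM free functions:

#include <CL/sycl.hpp>

int main(void) {
  cl::sycl::gpu_selector sel;
  cl::sycl::queue q(sel);                        // Grid keeps one such queue in theGridAccelerator
  size_t bytes = 64 * sizeof(double);
  void *p = cl::sycl::malloc_shared(bytes, q);   // USM: pointer valid on host and device
  static_cast<double *>(p)[0] = 1.0;             // touch on the host side
  cl::sycl::free(p, q);                          // mirrors the deallocate hunk further down
  return 0;
}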
@@ -193,14 +196,6 @@ public:
   if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
   assert( ptr != (_Tp *)NULL);
 
-  //////////////////////////////////////////////////
-  // First touch optimise in threaded loop
-  //////////////////////////////////////////////////
-  uint64_t *cp = (uint64_t *)ptr;
-  thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
-    cp[n]=0;
-  });
 #endif
   return ptr;
 }
 
@@ -216,9 +211,14 @@ public:
   pointer __freeme = __p;
 #endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   if ( __freeme ) cudaFree((void *)__freeme);
-#else
+#endif
+
+#ifdef GRID_SYCL
+  if ( __freeme ) free((void *)__freeme,*theGridAccelerator);
+#endif
+
+#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
 #ifdef HAVE_MM_MALLOC_H
   if ( __freeme ) _mm_free((void *)__freeme);
 #else
 
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <pwd.h>
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
 
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
 
@@ -433,13 +433,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
 
-#ifdef GRID_IBM_SUMMIT
-  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
-#else
-  std::cout << "setting device to WorldShmRank"<<std::endl;
-  cudaSetDevice(WorldShmRank);
-#endif
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Each MPI rank should allocate our own buffer
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -677,7 +670,7 @@
 /////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   cudaMemset(dest,0,bytes);
 #else
   bzero(dest,bytes);
 
@@ -685,7 +678,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 }
 void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
 {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
 #else
   bcopy(src,dest,bytes);
 
@@ -89,7 +89,7 @@ public:
 
 
   // Rvalue
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
 #else
   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 
@@ -211,7 +211,7 @@ public:
   LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
   return accessor;
 }
 
 ~Lattice() {
   if ( this->_odata_size ) {
     dealloc();
 
@@ -24,7 +24,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid_Eigen_Dense.h>
 
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
 
@@ -67,7 +67,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   return sum_gpu(arg,osites);
 #else
   return sum_cpu(arg,osites);
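Note the dispatch that results: only a GRID_CUDA build takes sum_gpu, while a SYCL build still compiles the #else branch and reduces on the host. An annotated restatement of the function as it stands after this hunk:

template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifdef GRID_CUDA
  return sum_gpu(arg,osites);   // CUDA builds only
#else
  return sum_cpu(arg,osites);   // CPU builds, and for now SYCL builds too
#endif
}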
@@ -108,7 +108,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   // GPU - SIMT lane compliance...
   typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
   Vector<inner_t> inner_tmp(sites);
 
@@ -174,7 +174,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   // GPU
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
   Vector<inner_t> inner_tmp(sites);
 
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 accelerator_inline uint64_t __rdtsc(void) { return 0; }
 accelerator_inline uint64_t __rdpmc(int ) { return 0; }
 #else
 
@@ -112,7 +112,6 @@ class PerformanceCounter {
 private:
 
   typedef struct {
-  public:
     uint32_t type;
     uint64_t config;
     const char *name;
 
@@ -12773,7 +12773,7 @@ namespace pugi
 #undef PUGI__THROW_ERROR
 #undef PUGI__CHECK_ERROR
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #pragma pop
 #endif
 
@@ -286,7 +286,7 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
 
@@ -96,7 +96,7 @@ public:
   int sl = St._simd_layout[direction];
   Coordinate icoor;
 
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   _Spinor tmp;
 
   const int Nsimd =SiteDoubledGaugeField::Nsimd();
 
@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
   std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
   std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
   std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
   std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
 
@@ -644,7 +644,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
                                                       Current curr_type,
                                                       unsigned int mu)
 {
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
   Gamma::Algebra Gmu [] = {
     Gamma::Algebra::GammaX,
     Gamma::Algebra::GammaY,
 
@@ -828,7 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
   }
 #endif
 
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
   ////////////////////////////////////////////////
   // GENERAL CAYLEY CASE
   ////////////////////////////////////////////////
 
@@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
 // Generic implementation; move to different file?
 ////////////////////////////////////////////
 
+/*
 accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 {
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
   uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
   uint4 * chip_pun = (uint4 *)&chip;
 
@@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 #endif
   return;
 }
+*/
 
 #define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
   SE = st.GetEntry(ptype, Dir, sF); \
   if (SE->_is_local) { \
 
@@ -358,18 +360,18 @@ void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,Site
   auto out_Yp = out[5].View();
   auto out_Zp = out[6].View();
   auto out_Tp = out[7].View();
+  auto CBp=st.CommBuf();
   accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
     int sU=sss/Ls;
     int sF =sss;
-    DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
-    DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
-    DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
-    DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
-    DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
-    DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
-    DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
-    DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
+    DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
+    DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
+    DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
+    DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
+    DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
+    DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
+    DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
+    DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
   });
 }
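A plausible reading of the CBp change above (repeated in the DhopDirKernel hunk that follows): the body of accelerator_forNB becomes a device lambda, so the communications buffer pointer is fetched once on the host and captured by value, rather than invoking the host-side st.CommBuf() accessor from inside device code. The pattern, using the names from the hunk:

auto CBp = st.CommBuf();                         // host: grab the raw buffer pointer once
accelerator_forNB(sss, Nsite*Ls, Simd::Nsimd(), {
  int sU = sss/Ls;
  int sF = sss;
  DhopDirXm(st_v, U_v, CBp, sF, sU, in_v, out_Xm, 0);  // CBp is captured by value into the lambda
});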
@@ -385,13 +387,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   auto in_v = in.View();
   auto out_v = out.View();
   auto st_v = st.View();
+  auto CBp=st.CommBuf();
 #define LoopBody(Dir) \
   case Dir : \
     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
       for(int s=0;s<Ls;s++){ \
         int sU=ss; \
         int sF = s+Ls*sU; \
-        DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
+        DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
       } \
     }); \
     break;
 
@@ -442,19 +445,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
 #endif
 
@@ -473,19 +476,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
 
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 #include "JSON_IO.h"
 #endif
 
@@ -142,7 +142,7 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
 accelerator_inline float half2float(half h)
 {
   float f;
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   f = __half2float(h);
 #else
   //f = __half2float(h);
 
@@ -156,7 +156,7 @@ accelerator_inline float half2float(half h)
 accelerator_inline half float2half(float f)
 {
   half h;
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
   h = __float2half(f);
 #else
   Grid_half hh = sfw_float_to_half(f);
 
@@ -31,7 +31,7 @@ directory
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <thrust/complex.h>
 #endif
 
@@ -65,7 +65,7 @@ typedef RealD Real;
 typedef RealF Real;
 #endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 typedef thrust::complex<RealF> ComplexF;
 typedef thrust::complex<RealD> ComplexD;
 typedef thrust::complex<Real> Complex;
 
@@ -107,7 +107,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
 }
 
 struct StencilEntry {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
   uint64_t _byte_offset; // 8 bytes
   uint32_t _offset; // 4 bytes
 #else
 
@@ -34,14 +34,16 @@ NAMESPACE_BEGIN(Grid);
 //accelerator_inline void SIMTsynchronise(void)
 accelerator_inline void synchronise(void)
 {
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
+#ifdef GRID_CUDA
   // __syncthreads();
   __syncwarp();
+#endif
 #endif
   return;
 }
 
-#ifndef __CUDA_ARCH__
+#ifndef GRID_SIMT
 //////////////////////////////////////////
 // Trivial mapping of vectors on host
 //////////////////////////////////////////
 
@@ -75,7 +77,13 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
   vstream(vec, extracted);
 }
 #else
+#ifdef GRID_CUDA
 accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
+#endif
+#ifdef GRID_SYCL
+//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
+accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
+#endif
 
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
@@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
 
 
 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
 accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
 {
 
@@ -68,16 +68,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 
 //////////////////////////////////////////////////////////////////////////////////
-// Accelerator primitives; fall back to threading
+// Accelerator primitives; fall back to threading if not CUDA or SYCL
 //////////////////////////////////////////////////////////////////////////////////
-#ifdef __NVCC__
-#define GRID_NVCC
-#endif
 
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 
 extern uint32_t gpu_threads;
 
+#ifdef __CUDA_ARCH__
+#define GRID_SIMT
+#endif
+
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
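The block above makes the split explicit: GRID_CUDA marks the whole nvcc compilation, while GRID_SIMT marks only the device pass. A hedged illustration of how code can exploit it; myLane is a hypothetical helper, and SIMTlane is the per-platform function added elsewhere in this commit:

accelerator_inline int myLane(int Nsimd) {
#ifdef GRID_SIMT
  return SIMTlane(Nsimd);   // device pass: one SIMD lane per hardware thread/work-item
#else
  return 0;                 // host pass: scalar fallback
#endif
}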
@ -123,7 +124,47 @@ void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
|
|||||||
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||||
accelerator_barrier(dummy);
|
accelerator_barrier(dummy);
|
||||||
|
|
||||||
#else
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
|
||||||
|
#ifdef __SYCL_DEVICE_ONLY__
|
||||||
|
#define GRID_SIMT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <CL/sycl.hpp>
|
||||||
|
#include <CL/sycl/usm.hpp>
|
||||||
|
|
||||||
|
extern cl::sycl::queue *theGridAccelerator;
|
||||||
|
|
||||||
|
extern uint32_t gpu_threads;
|
||||||
|
|
||||||
|
#define accelerator
|
||||||
|
#define accelerator_inline strong_inline
|
||||||
|
|
||||||
|
#define accelerator_forNB(iterator,num,nsimd, ... ) \
|
||||||
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
|
||||||
|
cl::sycl::range<3> local {gpu_threads,1,nsimd}; \
|
||||||
|
cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
|
||||||
|
cgh.parallel_for<class dslash>( \
|
||||||
|
cl::sycl::nd_range<3>(global,local), \
|
||||||
|
[=] (cl::sycl::nd_item<3> item) mutable { \
|
||||||
|
auto iterator = item.get_global_id(0); \
|
||||||
|
auto lane = item.get_global_id(2); \
|
||||||
|
{ __VA_ARGS__ }; \
|
||||||
|
}); \
|
||||||
|
});
|
||||||
|
|
||||||
|
#define accelerator_barrier(dummy) theGridAccelerator->wait();
|
||||||
|
|
||||||
|
#define accelerator_for( iterator, num, nsimd, ... ) \
|
||||||
|
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||||
|
accelerator_barrier(dummy);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
|
||||||
|
|
||||||
#define accelerator
|
#define accelerator
|
||||||
#define accelerator_inline strong_inline
|
#define accelerator_inline strong_inline
|
||||||
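A minimal usage sketch of the SYCL accelerator_for defined above. The loop bounds are hypothetical placeholders; note that the macro also binds lane from item.get_global_id(2), so the body may use it directly:

uint64_t osites = 1024;   // hypothetical outer-site count
uint64_t nsimd = 8;       // hypothetical SIMD width
accelerator_for(ss, osites, nsimd, {
  // executes as one work-item per (site,lane) point of the 3-d nd_range;
  // 'ss' comes from item.get_global_id(0), 'lane' from item.get_global_id(2)
});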
@@ -74,6 +74,10 @@ feenableexcept (unsigned int excepts)
 #endif
 
 uint32_t gpu_threads=8;
+#ifdef GRID_SYCL
+cl::sycl::queue *theGridAccelerator;
+#endif
 
 
 NAMESPACE_BEGIN(Grid);
 
@@ -194,7 +198,7 @@ void GridParseLayout(char **argv,int argc,
   }
   if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
     std::vector<int> gputhreads(0);
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
               << " not compiled with GPU support" << std::endl;
 #endif
 
@@ -281,12 +285,10 @@ void GridBanner(void)
     printed=1;
   }
 }
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
-#endif
 void GridGpuInit(void)
 {
-#ifdef GRID_NVCC
   int nDevices = 1;
   cudaGetDeviceCount(&nDevices);
   gpu_props = new cudaDeviceProp[nDevices];
 
@@ -335,11 +337,70 @@ void GridGpuInit(void)
 // GPU_PROP(singleToDoublePrecisionPerfRatio);
     }
   }
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+#else
+  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
+  cudaSetDevice(rank);
+#endif
+  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
+}
+#endif
+#ifdef GRID_SYCL
+void GridGpuInit(void)
+{
+  int nDevices = 1;
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theGridAccelerator = new sycl::queue (selectedDevice);
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  if ( world_rank == 0 ) {
+    GridBanner();
+  }
+/*
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+    }
+  }
+*/
   if ( world_rank == 0 ) {
     printf("GpuInit: ================================================\n");
   }
-#endif
 }
+#endif
+#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
+void GridGpuInit(void){}
+#endif
 
 void Grid_init(int *argc,char ***argv)
 {
 
@@ -21,7 +21,7 @@
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid/Grid.h>
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #define CUDA_PROFILE
 #endif
 
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
 #define LADD (8)
 
   int64_t Nwarm=20;
-  int64_t Nloop=500;
+  int64_t Nloop=50;
 
   Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   Coordinate mpi_layout = GridDefaultMpi();
configure.ac (13 changed lines)

@@ -147,6 +147,19 @@ case ${ac_SUMMIT} in
   AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac
 
+############### SYCL
+AC_ARG_ENABLE([sycl],
+    [AC_HELP_STRING([--enable-sycl=yes|no], [enable SYCL])],
+    [ac_JSRUN=${enable_sycl}], [ac_SYCL=no])
+case ${ac_SYCL} in
+    no);;
+    yes)
+    AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
+    *)
+    AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
+esac
+
+
 ############### Intel libraries
 AC_ARG_ENABLE([mkl],
   [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -35,7 +35,7 @@ directory
 
 
 int main(int argc, char **argv) {
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
   using namespace Grid;
 
 