1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-24 10:42:03 +01:00

Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc

Aim to reduce the amount of cuda and other code variations floating around all over the place.

Will move GpuInit iinto Accelerator.cc from Init.cc
Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
This commit is contained in:
Peter Boyle
2020-05-08 06:23:55 -07:00
parent 28a1fcaaff
commit f8b8e00090
13 changed files with 557 additions and 718 deletions

345
Grid/threads/Accelerator.h Normal file
View File

@ -0,0 +1,345 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Accelerator.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading if not CUDA or SYCL
//////////////////////////////////////////////////////////////////////////////////
//
// Function attributes
//
// accelerator
// accelerator_inline
//
// Parallel looping
//
// accelerator_for
// accelerator_forNB
// uint32_t accelerator_barrier(); // device synchronise
//
// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
//
// uint32_t acceleratorThreads(void);
// void acceleratorThreads(uint32_t);
//
// Warp control and info:
//
// void acceleratorSynchronise(void); // synch warp etc..
// int acceleratorSIMTlane(int Nsimd);
//
// Memory management:
//
// void *acceleratorAllocShared(size_t bytes);
// void acceleratorFreeShared(void *ptr);
//
// void *acceleratorAllocDevice(size_t bytes);
// void acceleratorFreeDevice(void *ptr);
//
// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);
// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
//
//////////////////////////////////////////////////////////////////////////////////
uint32_t acceleratorThreads(void);
void acceleratorThreads(uint32_t);
//////////////////////////////////////////////
// CUDA acceleration
//////////////////////////////////////////////
#ifdef GRID_CUDA
#ifdef __CUDA_ARCH__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("Cuda error %s \n", cudaGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
#define accelerator_forNB( iterator, num, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(acceleratorThreads(),nsimd); \
dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
}
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
template<typename lambda> __global__
void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
uint64_t isite = threadIdx.y;
uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
if ( (osite <Osites) && (isite<Isites) ) {
Lambda(isite,osite);
}
}
#endif
//////////////////////////////////////////////
// SyCL acceleration
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
NAMESPACE_BEGIN(Grid);
extern cl::sycl::queue *theGridAccelerator;
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
#endif
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_forNB(iterator,num,nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; \
cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
cgh.parallel_for<class dslash>( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) mutable { \
auto iterator = item.get_global_id(0); \
auto lane = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
#define accelerator_barrier(dummy) theGridAccelerator->wait();
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
#endif
//////////////////////////////////////////////
// HIP acceleration
//////////////////////////////////////////////
#ifdef GRID_HIP
#ifdef __HIP_DEVICE_COMPILE__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
#define accelerator_barrier(dummy) \
{ \
hipDeviceSynchronize(); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
#define accelerator_forNB( iterator, num, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 hip_threads(acceleratorThreads(),nsimd); \
dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,simd,lambda);\
}
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (_Tp *) NULL;
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (_Tp *) NULL;
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
template<typename lambda> __global__
void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
uint64_t isite = hipThreadIdx_y;
uint64_t osite = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
if ( (osite <Osites) && (isite<Isites) ) {
Lambda(isite,osite);
}
}
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#undef GRID_SIMT
#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif // CPU target
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////
accelerator_inline void acceleratorSynchronise(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncwarp();
#endif
#ifdef GRID_SYCL
// No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}
////////////////////////////////////////////////////
// Address subvectors on accelerators
////////////////////////////////////////////////////
#ifdef GRID_SIMT
#ifdef GRID_CUDA
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
#endif
#ifdef GRID_SYCL
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
#endif
#ifdef GRID_HIP
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
#endif
#else
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
#endif
NAMESPACE_END(Grid);