mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-18 07:47:06 +01:00
Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of cuda and other code variations floating around all over the place. Will move GpuInit iinto Accelerator.cc from Init.cc Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
This commit is contained in:
10
Grid/threads/Accelerator.cc
Normal file
10
Grid/threads/Accelerator.cc
Normal file
@ -0,0 +1,10 @@
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
uint32_t accelerator_threads;
|
||||
uint32_t acceleratorThreads(void) {return accelerator_threads;};
|
||||
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
|
||||
#ifdef GRID_SYCL
|
||||
cl::sycl::queue *theGridAccelerator;
|
||||
#endif
|
||||
NAMESPACE_END(Grid);
|
345
Grid/threads/Accelerator.h
Normal file
345
Grid/threads/Accelerator.h
Normal file
@ -0,0 +1,345 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/Accelerator.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// Accelerator primitives; fall back to threading if not CUDA or SYCL
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Function attributes
|
||||
//
|
||||
// accelerator
|
||||
// accelerator_inline
|
||||
//
|
||||
// Parallel looping
|
||||
//
|
||||
// accelerator_for
|
||||
// accelerator_forNB
|
||||
// uint32_t accelerator_barrier(); // device synchronise
|
||||
//
|
||||
// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
|
||||
//
|
||||
// uint32_t acceleratorThreads(void);
|
||||
// void acceleratorThreads(uint32_t);
|
||||
//
|
||||
// Warp control and info:
|
||||
//
|
||||
// void acceleratorSynchronise(void); // synch warp etc..
|
||||
// int acceleratorSIMTlane(int Nsimd);
|
||||
//
|
||||
// Memory management:
|
||||
//
|
||||
// void *acceleratorAllocShared(size_t bytes);
|
||||
// void acceleratorFreeShared(void *ptr);
|
||||
//
|
||||
// void *acceleratorAllocDevice(size_t bytes);
|
||||
// void acceleratorFreeDevice(void *ptr);
|
||||
//
|
||||
// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);
|
||||
// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
uint32_t acceleratorThreads(void);
|
||||
void acceleratorThreads(uint32_t);
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// CUDA acceleration
|
||||
//////////////////////////////////////////////
|
||||
#ifdef GRID_CUDA
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define GRID_SIMT
|
||||
#endif
|
||||
|
||||
#define accelerator __host__ __device__
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
cudaDeviceSynchronize(); \
|
||||
cudaError err = cudaGetLastError(); \
|
||||
if ( cudaSuccess != err ) { \
|
||||
printf("Cuda error %s \n", cudaGetErrorString( err )); \
|
||||
puts(__FILE__); \
|
||||
printf("Line %d\n",__LINE__); \
|
||||
exit(0); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define accelerator_forNB( iterator, num, nsimd, ... ) \
|
||||
{ \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(acceleratorThreads(),nsimd); \
|
||||
dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
|
||||
LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
|
||||
}
|
||||
|
||||
#define accelerator_for( iterator, num, nsimd, ... ) \
|
||||
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
inline void *acceleratorAllocShared(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (_Tp *) NULL;
|
||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void *acceleratorAllocDevice(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (_Tp *) NULL;
|
||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
|
||||
template<typename lambda> __global__
|
||||
void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
|
||||
{
|
||||
uint64_t isite = threadIdx.y;
|
||||
uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
|
||||
if ( (osite <Osites) && (isite<Isites) ) {
|
||||
Lambda(isite,osite);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// SyCL acceleration
|
||||
//////////////////////////////////////////////
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
NAMESPACE_END(Grid);
|
||||
#include <CL/sycl.hpp>
|
||||
#include <CL/sycl/usm.hpp>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern cl::sycl::queue *theGridAccelerator;
|
||||
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
#define GRID_SIMT
|
||||
#endif
|
||||
|
||||
#define accelerator
|
||||
#define accelerator_inline strong_inline
|
||||
|
||||
#define accelerator_forNB(iterator,num,nsimd, ... ) \
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
|
||||
cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; \
|
||||
cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
|
||||
cgh.parallel_for<class dslash>( \
|
||||
cl::sycl::nd_range<3>(global,local), \
|
||||
[=] (cl::sycl::nd_item<3> item) mutable { \
|
||||
auto iterator = item.get_global_id(0); \
|
||||
auto lane = item.get_global_id(2); \
|
||||
{ __VA_ARGS__ }; \
|
||||
}); \
|
||||
});
|
||||
|
||||
#define accelerator_barrier(dummy) theGridAccelerator->wait();
|
||||
|
||||
#define accelerator_for( iterator, num, nsimd, ... ) \
|
||||
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
|
||||
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
|
||||
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// HIP acceleration
|
||||
//////////////////////////////////////////////
|
||||
#ifdef GRID_HIP
|
||||
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
#define GRID_SIMT
|
||||
#endif
|
||||
|
||||
#define accelerator __host__ __device__
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
hipDeviceSynchronize(); \
|
||||
auto err = hipGetLastError(); \
|
||||
if ( err != hipSuccess ) { \
|
||||
printf("HIP error %s \n", hipGetErrorString( err )); \
|
||||
puts(__FILE__); \
|
||||
printf("Line %d\n",__LINE__); \
|
||||
exit(0); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define accelerator_forNB( iterator, num, nsimd, ... ) \
|
||||
{ \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 hip_threads(acceleratorThreads(),nsimd); \
|
||||
dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
|
||||
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,simd,lambda);\
|
||||
}
|
||||
|
||||
#define accelerator_for( iterator, num, nsimd, ... ) \
|
||||
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
inline void *acceleratorAllocShared(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = hipMallocManaged((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (_Tp *) NULL;
|
||||
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void *acceleratorAllocDevice(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = hipMalloc((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (_Tp *) NULL;
|
||||
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
|
||||
|
||||
template<typename lambda> __global__
|
||||
void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
|
||||
{
|
||||
uint64_t isite = hipThreadIdx_y;
|
||||
uint64_t osite = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
|
||||
if ( (osite <Osites) && (isite<Isites) ) {
|
||||
Lambda(isite,osite);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// CPU Target - No accelerator just thread instead
|
||||
//////////////////////////////////////////////
|
||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
|
||||
|
||||
#undef GRID_SIMT
|
||||
|
||||
#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned
|
||||
|
||||
#define accelerator
|
||||
#define accelerator_inline strong_inline
|
||||
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
|
||||
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
|
||||
#define accelerator_barrier(dummy)
|
||||
|
||||
#ifdef HAVE_MALLOC_MALLOC_H
|
||||
#include <malloc/malloc.h>
|
||||
#endif
|
||||
#ifdef HAVE_MALLOC_H
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
#include <mm_malloc.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
|
||||
#else
|
||||
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
|
||||
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
|
||||
inline void acceleratorFreeShared(void *ptr){free(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
|
||||
#endif
|
||||
|
||||
|
||||
#endif // CPU target
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Synchronise across local threads for divergence resynch
|
||||
///////////////////////////////////////////////////
|
||||
accelerator_inline void acceleratorSynchronise(void)
|
||||
{
|
||||
#ifdef GRID_SIMT
|
||||
#ifdef GRID_CUDA
|
||||
__syncwarp();
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
// No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
__syncthreads();
|
||||
#endif
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// Address subvectors on accelerators
|
||||
////////////////////////////////////////////////////
|
||||
#ifdef GRID_SIMT
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -2,7 +2,7 @@
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/Threads.h
|
||||
Source file: ./lib/Pragmas.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
@ -28,148 +28,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
#define strong_inline __attribute__((always_inline)) inline
|
||||
#define UNROLL _Pragma("unroll")
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// New primitives; explicit host thread calls, and accelerator data parallel calls
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef _OPENMP
|
||||
#define GRID_OMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef GRID_OMP
|
||||
#define DO_PRAGMA_(x) _Pragma (#x)
|
||||
#define DO_PRAGMA(x) DO_PRAGMA_(x)
|
||||
#define thread_num(a) omp_get_thread_num()
|
||||
#define thread_max(a) omp_get_max_threads()
|
||||
#else
|
||||
#define DO_PRAGMA_(x)
|
||||
#define DO_PRAGMA(x)
|
||||
#define thread_num(a) (0)
|
||||
#define thread_max(a) (1)
|
||||
#endif
|
||||
|
||||
#define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_collapse( N , i, num, ... ) DO_PRAGMA(omp parallel for collapse ( N ) ) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_collapse_in_region( N , i, num, ... ) DO_PRAGMA(omp for collapse ( N )) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_region DO_PRAGMA(omp parallel)
|
||||
#define thread_critical DO_PRAGMA(omp critical)
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// Accelerator primitives; fall back to threading if not CUDA or SYCL
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
|
||||
extern uint32_t gpu_threads;
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define GRID_SIMT
|
||||
#endif
|
||||
|
||||
#define accelerator __host__ __device__
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
template<typename lambda> __global__
|
||||
void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
|
||||
{
|
||||
uint64_t isite = threadIdx.y;
|
||||
uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
|
||||
if ( (osite <Osites) && (isite<Isites) ) {
|
||||
Lambda(isite,osite);
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Internal only really... but need to call when
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
cudaDeviceSynchronize(); \
|
||||
cudaError err = cudaGetLastError(); \
|
||||
if ( cudaSuccess != err ) { \
|
||||
printf("Cuda error %s \n", cudaGetErrorString( err )); \
|
||||
puts(__FILE__); \
|
||||
printf("Line %d\n",__LINE__); \
|
||||
exit(0); \
|
||||
} \
|
||||
}
|
||||
|
||||
// Copy the for_each_n style ; Non-blocking variant
|
||||
#define accelerator_forNB( iterator, num, nsimd, ... ) \
|
||||
{ \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(gpu_threads,nsimd); \
|
||||
dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads); \
|
||||
LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
|
||||
}
|
||||
|
||||
// Copy the for_each_n style ; Non-blocking variant (default
|
||||
#define accelerator_for( iterator, num, nsimd, ... ) \
|
||||
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
#define GRID_SIMT
|
||||
#endif
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
#include <CL/sycl/usm.hpp>
|
||||
|
||||
extern cl::sycl::queue *theGridAccelerator;
|
||||
|
||||
extern uint32_t gpu_threads;
|
||||
|
||||
#define accelerator
|
||||
#define accelerator_inline strong_inline
|
||||
|
||||
#define accelerator_forNB(iterator,num,nsimd, ... ) \
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
|
||||
cl::sycl::range<3> local {gpu_threads,1,nsimd}; \
|
||||
cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
|
||||
cgh.parallel_for<class dslash>( \
|
||||
cl::sycl::nd_range<3>(global,local), \
|
||||
[=] (cl::sycl::nd_item<3> item) mutable { \
|
||||
auto iterator = item.get_global_id(0); \
|
||||
auto lane = item.get_global_id(2); \
|
||||
{ __VA_ARGS__ }; \
|
||||
}); \
|
||||
});
|
||||
|
||||
#define accelerator_barrier(dummy) theGridAccelerator->wait();
|
||||
|
||||
#define accelerator_for( iterator, num, nsimd, ... ) \
|
||||
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
|
||||
|
||||
#define accelerator
|
||||
#define accelerator_inline strong_inline
|
||||
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
|
||||
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
|
||||
#define accelerator_barrier(dummy)
|
||||
|
||||
#endif
|
||||
#include <Grid/threads/Threads.h>
|
||||
#include <Grid/threads/Accelerator.h>
|
||||
|
127
Grid/threads/ThreadReduction.h
Normal file
127
Grid/threads/ThreadReduction.h
Normal file
@ -0,0 +1,127 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/ThreadReduction.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
// Introduce a class to gain deterministic bit reproducible reduction.
|
||||
// make static; perhaps just a namespace is required.
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
class GridThread {
|
||||
public:
|
||||
static int _threads;
|
||||
static int _hyperthreads;
|
||||
static int _cores;
|
||||
|
||||
static void SetCores(int cr) {
|
||||
#ifdef GRID_OMP
|
||||
_cores = cr;
|
||||
#else
|
||||
_cores = 1;
|
||||
#endif
|
||||
}
|
||||
static void SetThreads(int thr) {
|
||||
#ifdef GRID_OMP
|
||||
_threads = MIN(thr,omp_get_max_threads()) ;
|
||||
omp_set_num_threads(_threads);
|
||||
#else
|
||||
_threads = 1;
|
||||
#endif
|
||||
};
|
||||
static void SetMaxThreads(void) {
|
||||
#ifdef GRID_OMP
|
||||
_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(_threads);
|
||||
#else
|
||||
_threads = 1;
|
||||
#endif
|
||||
};
|
||||
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
|
||||
static int GetCores(void) { return _cores; };
|
||||
static int GetThreads(void) { return _threads; };
|
||||
static int SumArraySize(void) {return _threads;};
|
||||
|
||||
static void GetWork(int nwork, int me, int & mywork, int & myoff){
|
||||
GetWork(nwork,me,mywork,myoff,_threads);
|
||||
}
|
||||
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
|
||||
int basework = nwork/units;
|
||||
int backfill = units-(nwork%units);
|
||||
if ( me >= units ) {
|
||||
mywork = myoff = 0;
|
||||
} else {
|
||||
mywork = (nwork+me)/units;
|
||||
myoff = basework * me;
|
||||
if ( me > backfill )
|
||||
myoff+= (me-backfill);
|
||||
}
|
||||
return;
|
||||
};
|
||||
|
||||
static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
|
||||
me = ThreadBarrier();
|
||||
GetWork(nwork,me,mywork,myoff);
|
||||
};
|
||||
|
||||
static int ThreadBarrier(void) {
|
||||
#ifdef GRID_OMP
|
||||
#pragma omp barrier
|
||||
return omp_get_thread_num();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
|
||||
sum_array[me] = val;
|
||||
val=Zero();
|
||||
ThreadBarrier();
|
||||
for(int i=0;i<_threads;i++) val+= sum_array[i];
|
||||
ThreadBarrier();
|
||||
}
|
||||
|
||||
static void bcopy(const void *src, void *dst, size_t len) {
|
||||
#ifdef GRID_OMP
|
||||
#pragma omp parallel
|
||||
{
|
||||
const char *c_src =(char *) src;
|
||||
char *c_dest=(char *) dst;
|
||||
int me,mywork,myoff;
|
||||
GridThread::GetWorkBarrier(len,me, mywork,myoff);
|
||||
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
|
||||
}
|
||||
#else
|
||||
bcopy(src,dst,len);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -28,101 +28,41 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
// Introduce a class to gain deterministic bit reproducible reduction.
|
||||
// make static; perhaps just a namespace is required.
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
#define strong_inline __attribute__((always_inline)) inline
|
||||
#define UNROLL _Pragma("unroll")
|
||||
|
||||
class GridThread {
|
||||
public:
|
||||
static int _threads;
|
||||
static int _hyperthreads;
|
||||
static int _cores;
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// New primitives; explicit host thread calls, and accelerator data parallel calls
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef _OPENMP
|
||||
#define GRID_OMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
static void SetCores(int cr) {
|
||||
#ifdef GRID_OMP
|
||||
_cores = cr;
|
||||
#define DO_PRAGMA_(x) _Pragma (#x)
|
||||
#define DO_PRAGMA(x) DO_PRAGMA_(x)
|
||||
#define thread_num(a) omp_get_thread_num()
|
||||
#define thread_max(a) omp_get_max_threads()
|
||||
#else
|
||||
_cores = 1;
|
||||
#define DO_PRAGMA_(x)
|
||||
#define DO_PRAGMA(x)
|
||||
#define thread_num(a) (0)
|
||||
#define thread_max(a) (1)
|
||||
#endif
|
||||
}
|
||||
static void SetThreads(int thr) {
|
||||
#ifdef GRID_OMP
|
||||
_threads = MIN(thr,omp_get_max_threads()) ;
|
||||
omp_set_num_threads(_threads);
|
||||
#else
|
||||
_threads = 1;
|
||||
#endif
|
||||
};
|
||||
static void SetMaxThreads(void) {
|
||||
#ifdef GRID_OMP
|
||||
_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(_threads);
|
||||
#else
|
||||
_threads = 1;
|
||||
#endif
|
||||
};
|
||||
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
|
||||
static int GetCores(void) { return _cores; };
|
||||
static int GetThreads(void) { return _threads; };
|
||||
static int SumArraySize(void) {return _threads;};
|
||||
|
||||
static void GetWork(int nwork, int me, int & mywork, int & myoff){
|
||||
GetWork(nwork,me,mywork,myoff,_threads);
|
||||
}
|
||||
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
|
||||
int basework = nwork/units;
|
||||
int backfill = units-(nwork%units);
|
||||
if ( me >= units ) {
|
||||
mywork = myoff = 0;
|
||||
} else {
|
||||
mywork = (nwork+me)/units;
|
||||
myoff = basework * me;
|
||||
if ( me > backfill )
|
||||
myoff+= (me-backfill);
|
||||
}
|
||||
return;
|
||||
};
|
||||
|
||||
static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
|
||||
me = ThreadBarrier();
|
||||
GetWork(nwork,me,mywork,myoff);
|
||||
};
|
||||
|
||||
static int ThreadBarrier(void) {
|
||||
#ifdef GRID_OMP
|
||||
#pragma omp barrier
|
||||
return omp_get_thread_num();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
|
||||
sum_array[me] = val;
|
||||
val=Zero();
|
||||
ThreadBarrier();
|
||||
for(int i=0;i<_threads;i++) val+= sum_array[i];
|
||||
ThreadBarrier();
|
||||
}
|
||||
|
||||
static void bcopy(const void *src, void *dst, size_t len) {
|
||||
#ifdef GRID_OMP
|
||||
#pragma omp parallel
|
||||
{
|
||||
const char *c_src =(char *) src;
|
||||
char *c_dest=(char *) dst;
|
||||
int me,mywork,myoff;
|
||||
GridThread::GetWorkBarrier(len,me, mywork,myoff);
|
||||
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
|
||||
}
|
||||
#else
|
||||
bcopy(src,dst,len);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
#define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_collapse( N , i, num, ... ) DO_PRAGMA(omp parallel for collapse ( N ) ) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_for_collapse_in_region( N , i, num, ... ) DO_PRAGMA(omp for collapse ( N )) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||
#define thread_region DO_PRAGMA(omp parallel)
|
||||
#define thread_critical DO_PRAGMA(omp critical)
|
||||
|
||||
|
Reference in New Issue
Block a user