/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Accelerator.h

    Copyright (C) 2015

Author: Peter Boyle
Author: paboyle

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software Foundation,
    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#pragma once

#include <string.h>
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif

#ifdef __APPLE__
// no memalign
inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); }
#endif

NAMESPACE_BEGIN(Grid);

//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading if not CUDA or SYCL
//////////////////////////////////////////////////////////////////////////////////
//
// Function attributes
//
//    accelerator
//    accelerator_inline
//
// Parallel looping
//
//    accelerator_for
//    accelerator_forNB
//    uint32_t accelerator_barrier();         // device synchronise
//
// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
//
//    uint32_t acceleratorThreads(void);
//    void     acceleratorThreads(uint32_t);
//
// Warp control and info:
//
//    acceleratorInit;
//    void acceleratorSynchronise(void);      // synch warp etc..
//    int  acceleratorSIMTlane(int Nsimd);
//
// Memory management:
//
//    int   acceleratorIsCommunicable(void *pointer);
//    void *acceleratorAllocShared(size_t bytes);
//    void  acceleratorFreeShared(void *ptr);
//
//    void *acceleratorAllocDevice(size_t bytes);
//    void  acceleratorFreeDevice(void *ptr);
//
//    void  acceleratorCopyToDevice(void *from,void *to,size_t bytes);
//    void  acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
//
//////////////////////////////////////////////////////////////////////////////////

uint32_t acceleratorThreads(void);
void     acceleratorThreads(uint32_t);
void     acceleratorInit(void);
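//////////////////////////////////////////////////////////////////////////////////
// Usage sketch (illustrative only, not compiled as part of this header).
// It shows the intended allocate / copy / launch / synchronise pattern using the
// primitives documented above; the buffer names and sizes are invented for the
// example.
//
//   size_t N     = 1024;
//   size_t bytes = N*sizeof(double);
//   double *host = (double *)malloc(bytes);
//   double *dev  = (double *)acceleratorAllocDevice(bytes);
//
//   acceleratorCopyToDevice(host,dev,bytes);      // host -> device
//   accelerator_forNB(i, N, 1, {                  // non-blocking launch, nsimd=1
//     dev[i] = 2.0*dev[i];
//   });
//   accelerator_barrier();                        // wait for the kernel
//   acceleratorCopyFromDevice(dev,host,bytes);    // device -> host
//
//   acceleratorFreeDevice(dev);
//   free(host);
//////////////////////////////////////////////////////////////////////////////////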
//////////////////////////////////////////////
// CUDA acceleration
//////////////////////////////////////////////
#ifdef GRID_CUDA
#include <cuda.h>

#ifdef __CUDA_ARCH__
#define GRID_SIMT
#endif

#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline

extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
extern cudaStream_t computeStream;

accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
  return threadIdx.x;
#else
  return 0;
#endif
} // CUDA specific

inline void cuda_mem(void)
{
  size_t free_t,total_t,used_t;
  cudaMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used " << used_t
            << " free " << free_t
            << " total " << total_t << std::endl;
}

#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
      __VA_ARGS__;							\
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
  }

#define accelerator_for6dNB(iter1, num1,				\
                            iter2, num2,				\
                            iter3, num3,				\
                            iter4, num4,				\
                            iter5, num5,				\
                            iter6, num6, ... )				\
  {									\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,					\
       Iterator iter3,Iterator iter4,					\
       Iterator iter5,Iterator iter6) mutable {				\
      __VA_ARGS__;							\
    };									\
    dim3 cu_blocks (num1,num2,num3);					\
    dim3 cu_threads(num4,num5,num6);					\
    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }

template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
  // Weird permute is to make lane coalesce for large blocks
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < num1) && (y < num2) && (z < num3) ) {
    Lambda(x,y,z);
  }
}

template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
                  uint64_t num4, uint64_t num5, uint64_t num6,
                  lambda Lambda)
{
  uint64_t iter1 = blockIdx.x;
  uint64_t iter2 = blockIdx.y;
  uint64_t iter3 = blockIdx.z;
  uint64_t iter4 = threadIdx.x;
  uint64_t iter5 = threadIdx.y;
  uint64_t iter6 = threadIdx.z;
  if ( (iter1 < num1) && (iter2 < num2) && (iter3 < num3) &&
       (iter4 < num4) && (iter5 < num5) && (iter6 < num6) ) {
    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
  }
}

#define accelerator_barrier(dummy)					\
  {									\
    cudaStreamSynchronize(computeStream);				\
    cudaError err = cudaGetLastError();					\
    if ( cudaSuccess != err ) {						\
      printf("accelerator_barrier(): Cuda error %s \n",			\
             cudaGetErrorString( err ));				\
      printf("File %s Line %d\n",__FILE__,__LINE__);			\
      fflush(stdout);							\
      if (acceleratorAbortOnGpuError) exit(1);				\
    }									\
  }

inline void *acceleratorAllocShared(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocManaged failed for %lu : %s \n",(unsigned long)bytes,cudaGetErrorString(err));
  }
  return ptr;
}
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMalloc failed for %lu : %s \n",(unsigned long)bytes,cudaGetErrorString(err));
  }
  return ptr;
}
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr); }
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes,cudaMemcpyHostToDevice); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes,cudaMemcpyDeviceToHost); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
  cudaMemcpyAsync(to,from,bytes,cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes); }
inline int  acceleratorIsCommunicable(void *ptr) { return 1; }

#endif // GRID_CUDA

//////////////////////////////////////////////
// SyCL acceleration
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#if 0
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <CL/sycl/backend/level_zero.hpp>
#else
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#endif
NAMESPACE_BEGIN(Grid);

extern cl::sycl::queue *theGridAccelerator;
extern cl::sycl::queue *theCopyAccelerator;

#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
#endif

#define accelerator
#define accelerator_inline strong_inline

accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
  return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
#else
  return 0;
#endif
} // SYCL specific

#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
      unsigned long nt=acceleratorThreads();				\
      unsigned long unum1 = num1;					\
      unsigned long unum2 = num2;					\
      if(nt < 8)nt=8;							\
      cl::sycl::range<3> local {nt,1,nsimd};				\
      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
      cgh.parallel_for(							\
        cl::sycl::nd_range<3>(global,local),				\
        [=] (cl::sycl::nd_item<3> item) /*mutable*/			\
        [[intel::reqd_sub_group_size(16)]]				\
        {								\
          auto iter1 = item.get_global_id(0);				\
          auto iter2 = item.get_global_id(1);				\
          auto lane  = item.get_global_id(2);				\
          { __VA_ARGS__ };						\
        });								\
    });

#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator); }
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator); }
inline void acceleratorFreeShared(void *ptr){ free(ptr,*theGridAccelerator); }
inline void acceleratorFreeDevice(void *ptr){ free(ptr,*theGridAccelerator); }

inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait(); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait(); }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait(); }

inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
  if ( uvm == cl::sycl::usm::alloc::shared ) return 1;
  else return 0;
#endif
  return 1;
}

#endif // GRID_SYCL
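//////////////////////////////////////////////////////////////////////////////////
// Lane-indexing sketch (illustrative only, not compiled as part of this header).
// The nsimd argument of accelerator_for sets how many SIMT lanes are launched per
// site; inside the body acceleratorSIMTlane(nsimd) returns this thread's lane on a
// GPU build (GRID_SIMT) and 0 on a host build. The flat "in"/"out" arrays and the
// Nsimd constant below are invented for the example; real Grid code consumes the
// lane index through its coalescedRead/coalescedWrite helpers, so that the same
// source handles both the one-thread-per-lane (GPU) and whole-vector (CPU) cases.
//
//   const int Nsimd = 8;
//   accelerator_for(ss, nsites, Nsimd, {
//     int lane = acceleratorSIMTlane(Nsimd);   // 0..Nsimd-1 on GPU, 0 on CPU
//     out[ss*Nsimd+lane] = in[ss*Nsimd+lane];
//   });
//////////////////////////////////////////////////////////////////////////////////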
//////////////////////////////////////////////
// HIP acceleration
//////////////////////////////////////////////
#ifdef GRID_HIP
NAMESPACE_END(Grid);
#include <hip/hip_runtime.h>
NAMESPACE_BEGIN(Grid);

#ifdef __HIP_DEVICE_COMPILE__
#define GRID_SIMT
#endif

#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline

extern hipStream_t copyStream;
extern hipStream_t computeStream;

/* These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
  return hipThreadIdx_x;
#else
  return 0;
#endif
} // HIP specific

#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane ) mutable {		\
      { __VA_ARGS__;}							\
    };									\
    int nt=acceleratorThreads();					\
    dim3 hip_threads(nsimd, nt, 1);					\
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1);				\
    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){		\
      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
                         0,computeStream,				\
                         num1,num2,nsimd, lambda);			\
    } else {								\
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
                         0,computeStream,				\
                         num1,num2,nsimd, lambda);			\
    }									\
  }

template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  // Following the same scheme as CUDA for now
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y < numy) && (z < numz) ) {
    Lambda(x,y,z);
  }
}

template<typename lambda> __global__
__launch_bounds__(1024,1)
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  // Following the same scheme as CUDA for now
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y < numy) && (z < numz) ) {
    Lambda(x,y,z);
  }
}

#define accelerator_barrier(dummy)					\
  {									\
    hipStreamSynchronize(computeStream);				\
    auto err = hipGetLastError();					\
    if ( err != hipSuccess ) {						\
      printf("accelerator_barrier(): HIP error %s \n",			\
             hipGetErrorString( err ));					\
      printf("File %s Line %d\n",__FILE__,__LINE__);			\
      fflush(stdout);							\
      exit(1);								\
    }									\
  }

inline void *acceleratorAllocShared(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    printf(" hipMallocManaged failed for %lu : %s \n",(unsigned long)bytes,hipGetErrorString(err));
  }
  return ptr;
}
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    printf(" hipMalloc failed for %lu : %s \n",(unsigned long)bytes,hipGetErrorString(err));
  }
  return ptr;
}
inline void acceleratorFreeShared(void *ptr){ hipFree(ptr); }
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes,hipMemcpyHostToDevice); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes,hipMemcpyDeviceToHost); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
  hipMemcpyAsync(to,from,bytes,hipMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes); }
inline int  acceleratorIsCommunicable(void *ptr) { return 1; }

#endif // GRID_HIP

//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)

#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );

#define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
  accelerator_barrier(dummy);

#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... )	\
  accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
  accelerator_barrier(dummy);

#endif

//////////////////////////////////////////////
// CPU target: no accelerator, fall back to threading
//////////////////////////////////////////////
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )

#undef GRID_SIMT

#define accelerator
#define accelerator_inline strong_inline

#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... )   thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
#define accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
#define accelerator_barrier(dummy)

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // no SIMT lanes on the host

inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes); }
inline void acceleratorCopySynchronise(void) {}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes); }
inline int  acceleratorIsCommunicable(void *ptr) { return 1; }

#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){ return _mm_malloc(bytes,GRID_ALLOC_ALIGN); }
inline void *acceleratorAllocDevice(size_t bytes){ return _mm_malloc(bytes,GRID_ALLOC_ALIGN); }
inline void  acceleratorFreeShared(void *ptr){ _mm_free(ptr); }
inline void  acceleratorFreeDevice(void *ptr){ _mm_free(ptr); }
#else
inline void *acceleratorAllocShared(size_t bytes){ return memalign(GRID_ALLOC_ALIGN,bytes); }
inline void *acceleratorAllocDevice(size_t bytes){ return memalign(GRID_ALLOC_ALIGN,bytes); }
inline void  acceleratorFreeShared(void *ptr){ free(ptr); }
inline void  acceleratorFreeDevice(void *ptr){ free(ptr); }
#endif

#endif // CPU target

#ifdef GRID_SYCL
inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
#else
// Ordering within a stream guaranteed on Nvidia & AMD
inline void acceleratorFenceComputeStream(void){ };
#endif

///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
  __syncwarp();
#endif
#endif
  return;
}
accelerator_inline void acceleratorSynchroniseAll(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
  __syncthreads();
#endif
#ifdef GRID_SYCL
  // No barrier call on SYCL??
  // Option: get __spir:: stuff to do warp barrier
#endif
#ifdef GRID_HIP
  __syncthreads();
#endif
#endif
  return;
}
accelerator_inline void acceleratorFence(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
  __threadfence();
#endif
#ifdef GRID_SYCL
  // FIXME
#endif
#ifdef GRID_HIP
  __threadfence();
#endif
#endif
  return;
}

NAMESPACE_END(Grid);
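//////////////////////////////////////////////////////////////////////////////////
// Asynchronous-copy sketch (illustrative only, not compiled as part of this
// header). Device-to-device copies are queued on the copy stream/queue while
// kernels run on the compute stream, so the two can overlap; the copy is only
// guaranteed complete after acceleratorCopySynchronise(). Buffer names and sizes
// are invented for the example.
//
//   void *a = acceleratorAllocDevice(bytes);
//   void *b = acceleratorAllocDevice(bytes);
//
//   acceleratorCopyDeviceToDeviceAsynch(a,b,bytes);   // enqueued on copyStream
//   accelerator_forNB(i, N, 1, { /* independent work on other data */ });
//
//   acceleratorCopySynchronise();                     // copy is now complete
//   accelerator_barrier();                            // kernel is now complete
//
//   acceleratorFreeDevice(a);
//   acceleratorFreeDevice(b);
//////////////////////////////////////////////////////////////////////////////////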