mirror of https://github.com/paboyle/Grid.git synced 2025-06-18 07:47:06 +01:00

removed print flags

This commit is contained in:
Mohammad Atif
2023-12-04 15:12:03 -05:00
parent 157368ed04
commit 867abeaf8e
9 changed files with 11566 additions and 95 deletions

View File

@@ -14,7 +14,15 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
#ifdef GRID_CUDA
#ifdef __CUDA_ARCH__
#warning "ifdef cuda arch"
#endif
// fold omptarget into device specific acceleratorInit()
#include <cuda_runtime_api.h>
//#if defined(GRID_CUDA) || (defined(GRID_OMPTARGET) && defined(__CUDA_ARCH__))
#if defined(GRID_OMPTARGET)
#warning "using cuda from opmtarget"
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t computeStream;
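For context, a minimal sketch (not part of the commit) of what the CUDA branch of acceleratorInit() roughly does with these globals; gpu_props, copyStream and computeStream come from the hunk above, while the helper name and rank-to-device mapping are assumptions.

#include <cuda_runtime_api.h>

cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t computeStream;

// Hypothetical helper: query devices, bind this rank to one, create the two streams.
void acceleratorInitSketch(int local_rank)
{
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  gpu_props = new cudaDeviceProp[ndev];
  for (int d = 0; d < ndev; d++) cudaGetDeviceProperties(&gpu_props[d], d);
  cudaSetDevice(local_rank % ndev);      // one device per local rank (assumed policy)
  cudaStreamCreate(&copyStream);         // stream used for asynchronous copies
  cudaStreamCreate(&computeStream);      // stream used for kernels
}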
@@ -113,7 +121,7 @@ void acceleratorInit(void)
}
#endif
#ifdef GRID_HIP
#if defined(GRID_HIP) || (defined(GRID_OMPTARGET) && defined(__HIP_DEVICE_COMPILE__))
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t computeStream;
@@ -198,7 +206,7 @@ void acceleratorInit(void)
#endif
#ifdef GRID_SYCL
#if defined(GRID_SYCL) || (defined(GRID_OMPTARGET) && defined(__SYCL_DEVICE_ONLY__))
cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
@@ -270,7 +278,7 @@ void acceleratorInit(void)
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP)) && (!defined(GRID_OMPTARGET))
void acceleratorInit(void){}
#endif
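Taken together, these hunks let GRID_OMPTARGET builds reuse the vendor-specific acceleratorInit() and keep the empty CPU fallback only when no backend is enabled. A minimal usage sketch (not part of the commit; Grid normally performs these calls during start-up, and the include path is an assumption):

#include <Grid/Grid.h>     // assumed umbrella header providing acceleratorInit()/acceleratorThreads()

int main(int argc, char **argv)
{
  acceleratorThreads(8);   // threads per accelerator work-group/team
  acceleratorInit();       // runs the CUDA/HIP/SYCL/OMPTARGET branch selected at configure time
  return 0;
}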

View File

@@ -477,16 +477,20 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
// OpenMP Target acceleration
//////////////////////////////////////////////
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#undef GRID_SIMT
//OpenMP Target Offloading
#ifdef OMPTARGET
#ifdef GRID_OMPTARGET
//TODO GRID_SIMT for OMPTARGET
#define GRID_ACCELERATED
#include<omp.h>
#ifdef __CUDA_ARCH__
#include <cuda_runtime_api.h>
#elif defined __HIP_DEVICE_COMPILE__
#include <hip/hip_runtime.h>
#elif defined __SYCL_DEVICE_ONLY__
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#endif
extern "C" void *llvm_omp_target_alloc_host (size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
@@ -543,8 +547,33 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
}
std::cout << "D->H copy from device end "<<std::endl;
};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { printf("TODO acceleratorCopyDeviceToDeviceAsynch");memcpy(to,from,bytes);}
inline void acceleratorCopySynchronise(void) {printf("TODO acceleratorCopySynchronize");};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
printf("TODO acceleratorCopyDeviceToDeviceAsynch");//memcpy(to,from,bytes);
#ifdef __CUDA_ARCH__
extern cudaStream_t copyStream;
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
#elif defined __SYCL_DEVICE_ONLY__
theCopyAccelerator->memcpy(to,from,bytes);
#endif
};
inline void acceleratorCopySynchronise(void)
{
printf("TODO acceleratorCopySynchronize");
//#pragma omp barrier
#ifdef __CUDA_ARCH__
extern cudaStream_t copyStream;
cudaStreamSynchronize(copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipStreamSynchronize(copyStream);
#elif defined __SYCL_DEVICE_ONLY__
theCopyAccelerator->wait();
#endif
};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes)
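A short usage sketch (not part of the commit) of the new asynchronous device-to-device copy; it uses only functions declared in this header, and the helper name is hypothetical:

void demo_d2d_copy(size_t bytes)
{
  void *src = acceleratorAllocDevice(bytes);
  void *dst = acceleratorAllocDevice(bytes);
  acceleratorMemSet(src, 0, bytes);                        // initialise the source buffer
  acceleratorCopyDeviceToDeviceAsynch(src, dst, bytes);    // enqueued on copyStream
  acceleratorCopySynchronise();                            // wait for the copy to complete
  acceleratorFreeDevice(src);
  acceleratorFreeDevice(dst);
}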
@@ -558,11 +587,10 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes)
printf(" omp_target_memcpy device to host failed in MemSet for %ld in device %d \n",bytes,devc);
}
};
#ifdef OMPTARGET_MANAGED
#include <cuda_runtime_api.h>
inline void *acceleratorAllocShared(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
#ifdef __CUDA_ARCH__
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED from CUDA l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
@@ -570,25 +598,22 @@ inline void *acceleratorAllocShared(size_t bytes)
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
//inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
#elif defined __HIP_DEVICE_COMPILE__
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED from HIP l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" hipMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#elif defined __SYCL_DEVICE_ONLY__
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED from SYCL l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
cl::sycl::queue q;
void *ptr = cl::sycl::malloc_shared(bytes, q);
return ptr;
#else
inline void *acceleratorAllocShared(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET from LLVM l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
int devc = omp_get_default_device();
void *ptr=NULL;
ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc);
@@ -596,6 +621,7 @@ inline void *acceleratorAllocShared(size_t bytes)
printf(" llvm_omp_target_alloc_shared failed for %ld in device %d \n",bytes,devc);
}
return ptr;
#endif
};
inline void *acceleratorAllocDevice(size_t bytes)
{
@@ -610,7 +636,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
};
inline void acceleratorFreeShared(void *ptr){omp_target_free(ptr, omp_get_default_device());};
inline void acceleratorFreeDevice(void *ptr){omp_target_free(ptr, omp_get_default_device());};
#endif
//OpenMP CPU threads
#else
@ -644,6 +669,12 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) && (!defined(GRID_OMPTARGET))
#undef GRID_SIMT
#endif // CPU target
#ifdef HAVE_MM_MALLOC_H