Updates for HiP

2026-06-15 16:43:11 +01:00 · 2020-05-24 14:00:55 -04:00
parent 92b342a477
commit 32be2b13d3
2 changed files with 60 additions and 26 deletions
@@ -33,17 +33,17 @@ void acceleratorInit(void)

  for (int i = 0; i < nDevices; i++) {

-#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorCudaInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
    cudaGetDeviceProperties(&gpu_props[i], i);
    if ( world_rank == 0) {
      cudaDeviceProp prop; 
      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number    : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
+      printf("AcceleratorCudaInit: ========================\n");
+      printf("AcceleratorCudaInit: Device Number    : %d\n", i);
+      printf("AcceleratorCudaInit: ========================\n");
+      printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name);

      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
@@ -55,12 +55,12 @@ void acceleratorInit(void)
  }
 #ifdef GRID_IBM_SUMMIT
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  if ( world_rank == 0 )  printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+  if ( world_rank == 0 )  printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n");
 #else
-  if ( world_rank == 0 )  printf("GpuInit: setting device to node rank\n");
+  if ( world_rank == 0 )  printf("AcceleratorCudaInit: setting device to node rank\n");
  cudaSetDevice(rank);
 #endif
-  if ( world_rank == 0 )  printf("GpuInit: ================================================\n");
+  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif

@@ -92,17 +92,17 @@ void acceleratorInit(void)

  for (int i = 0; i < nDevices; i++) {

-#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
    hipGetDeviceProperties(&gpu_props[i], i);
    if ( world_rank == 0) {
      hipDeviceProp_t prop; 
      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number    : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
+      printf("AcceleratorHipInit: ========================\n");
+      printf("AcceleratorHipInit: Device Number    : %d\n", i);
+      printf("AcceleratorHipInit: ========================\n");
+      printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);

      //      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
@@ -114,12 +114,12 @@ void acceleratorInit(void)
  }
 #ifdef GRID_IBM_SUMMIT
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  if ( world_rank == 0 )  printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+  if ( world_rank == 0 )  printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n");
 #else
-  if ( world_rank == 0 )  printf("GpuInit: setting device to node rank\n");
-  cudaSetDevice(rank);
+  if ( world_rank == 0 )  printf("AcceleratorHipInit: setting device to node rank\n");
+  hipSetDevice(rank);
 #endif
-  if ( world_rank == 0 )  printf("GpuInit: ================================================\n");
+  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 }
 #endif

@@ -159,22 +159,22 @@ void acceleratorInit(void)
  /*
  for (int i = 0; i < nDevices; i++) {

-#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorSyclInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
    cudaGetDeviceProperties(&gpu_props[i], i);
    if ( world_rank == 0) {
      cudaDeviceProp prop; 
      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number    : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
+      printf("AcceleratorSyclInit: ========================\n");
+      printf("AcceleratorSyclInit: Device Number    : %d\n", i);
+      printf("AcceleratorSyclInit: ========================\n");
+      printf("AcceleratorSyclInit: Device identifier: %s\n", prop.name);
    }
  }
  */
  if ( world_rank == 0 ) {
-    printf("GpuInit: ================================================\n");
+    printf("AcceleratorSyclInit: ================================================\n");
  }
 }
 #endif
@@ -284,6 +284,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 inline void *acceleratorAllocShared(size_t bytes)
 {
+#if 0
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
@@ -291,6 +292,9 @@ inline void *acceleratorAllocShared(size_t bytes)
    printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
+#else
+  return malloc(bytes);
+#endif
 };

 inline void *acceleratorAllocDevice(size_t bytes)
@@ -304,10 +308,10 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

-inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
+inline void acceleratorFreeShared(void *ptr){ free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

 #endif

@@ -379,5 +383,35 @@ accelerator_inline void acceleratorSynchronise(void)
 #endif
  return;
 }
+accelerator_inline void acceleratorSynchroniseAll(void) 
+{
+#ifdef GRID_SIMT
+#ifdef GRID_CUDA
+  __syncthreads();
+#endif
+#ifdef GRID_SYCL
+  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
+#endif
+#ifdef GRID_HIP
+  __syncthreads();
+#endif
+#endif
+  return;
+}
+accelerator_inline void acceleratorFence(void) 
+{
+#ifdef GRID_SIMT
+#ifdef GRID_CUDA
+  __threadfence();
+#endif
+#ifdef GRID_SYCL
+  // FIXMEE
+#endif
+#ifdef GRID_HIP
+  __threadfence();
+#endif
+#endif
+  return;
+}

 NAMESPACE_END(Grid);