First compile on HIP

2026-01-31 04:13:29 +00:00 · 2020-05-10 05:28:09 -04:00
parent 52081acfa5
commit bbbee5660d
9 changed files with 44 additions and 25 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -12,7 +12,7 @@
 #endif

 /* NVCC save and restore compile environment*/
-#ifdef GRID_CUDA
+#ifdef __NVCC__
 #pragma push
 #pragma diag_suppress code_is_unreachable
 #pragma push_macro("__CUDA_ARCH__")
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -155,7 +155,7 @@ public:

    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes);

-    assert( ptr != (_Tp *)NULL);
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );

    return ptr;
  }
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -38,7 +38,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 {
  typedef typename vobj::scalar_object  sobj;

-  const int Nsimd = vobj::Nsimd();
+  //  const int Nsimd = vobj::Nsimd();
  const int nthread = GridThread::GetThreads();

  Vector<sobj> sumarray(nthread);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -644,7 +644,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
-#ifndef GRID_CUDA
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@@ -779,9 +779,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);

-  int tshift = (mu == Nd-1) ? 1 : 0;

 #if 0
+  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // SHAMIR CASE 
  ////////////////////////////////////////////////
@@ -828,7 +828,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif

-#ifndef GRID_CUDA
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
--- a/Grid/serialisation/Serialisation.h
+++ b/Grid/serialisation/Serialisation.h
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
-#ifndef GRID_CUDA
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
 #include "JSON_IO.h"
 #endif

--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -32,7 +32,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 */
 //----------------------------------------------------------------------

+#ifdef GRID_CUDA
 #include <cuda_fp16.h>
+#endif
+#ifdef GRID_HIP
+#include <hip_fp16.h>
+#endif

 namespace Grid {

--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -31,7 +31,7 @@ directory
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H

-#ifdef GRID_CUDA
+#if defined(GRID_CUDA) || defined(GRID_HIP)
 #include <thrust/complex.h>
 #endif

@@ -65,7 +65,7 @@ typedef RealD   Real;
 typedef RealF  Real;
 #endif

-#ifdef GRID_CUDA
+#if defined(GRID_CUDA) || defined(GRID_HIP)
 typedef thrust::complex<RealF> ComplexF;
 typedef thrust::complex<RealD> ComplexD;
 typedef thrust::complex<Real>  Complex;
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -73,9 +73,6 @@ void     acceleratorThreads(uint32_t);
 //////////////////////////////////////////////
 // CUDA acceleration
 //////////////////////////////////////////////
-#ifdef __NVCC__
-#define GRID_CUDA
-#endif

 #ifdef GRID_CUDA

@@ -197,6 +194,9 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 // HIP acceleration
 //////////////////////////////////////////////
 #ifdef GRID_HIP
+NAMESPACE_END(Grid);
+#include <hip/hip_runtime.h>
+NAMESPACE_BEGIN(Grid);

 #ifdef __HIP_DEVICE_COMPILE__
 #define GRID_SIMT
@@ -224,7 +224,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
    };									\
    dim3 hip_threads(acceleratorThreads(),nsimd);				\
    dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads());			\
-    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,simd,lambda);\
+    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,nsimd,lambda);\
  }

 #define accelerator_for( iterator, num, nsimd, ... )		\