Merge branch 'feature/omp-offload' of github.com:BNL-HPC/Grid into feature/omp-offload

2025-12-21 13:14:29 +00:00 · 2023-05-05 14:37:30 -04:00
parent a9df27f18d 5b50eaa55f
commit 2b6b98be48
3 changed files with 79 additions and 21 deletions
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -476,6 +476,12 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 //OpenMP Target Offloading
 #ifdef OMPTARGET
 #include<omp.h>
 extern "C" void *llvm_omp_target_alloc_host  (size_t Size, int DeviceNum);
 extern "C" void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
 extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
 //TODO: Dynamic Shared Memory
 #define THREAD_LIMIT acceleratorThreads()
 #define accelerator
@@ -507,17 +513,46 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #endif
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  {;}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){;}
+{
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+  printf("copy to device start \n");
-inline void acceleratorCopySynchronize(void) {;};
+  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( to, from, bytes, 0, 0, devc, host ) ) {
    printf(" omp_target_memcpy host to device failed for %ld in device %d \n",bytes,devc);
  }
  printf("copy to device end \n");
 };
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
 {
  printf("copy from device start \n");
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( to, from, bytes, 0, 0, host, devc ) ) {
    printf(" omp_target_memcpy device to host failed for %ld in device %d \n",bytes,devc);
  }
  printf("copy from device end \n");
 };
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { printf("TODO acceleratorCopyDeviceToDeviceAsynch");memcpy(to,from,bytes);}
 inline void acceleratorCopySynchronize(void) {printf("TODO acceleratorCopySynchronize");};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes)
 {
  printf(" l-l-l-l-l-l-l-l-l-l-l-l-l OMPTARGET calling memset on host and copying to dev l-l-l-l-l-l-l-l-l-l-l-l \n");
  void *base_host = memalign(GRID_ALLOC_ALIGN,bytes);
  memset(base_host,value,bytes);
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( base, base_host, bytes, 0, 0, devc, host ) ) {
    printf(" omp_target_memcpy device to host failed in MemSet for %ld in device %d \n",bytes,devc);
  }
 };
 #ifdef OMPTARGET_MANAGED 
 #include <cuda_runtime_api.h>
 inline void *acceleratorAllocShared(size_t bytes)
 {
  printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l \n");	
  void *ptr=NULL;
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
@@ -530,10 +565,30 @@ inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
 inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr);};
 #else
-inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
+inline void *acceleratorAllocShared(size_t bytes)
-inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
+{
-inline void acceleratorFreeShared(void *ptr){free(ptr);};
+  printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l \n");
-inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+  int devc = omp_get_default_device();
  void *ptr=NULL;
  ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc);
  if( ptr == NULL ) {
    printf(" llvm_omp_target_alloc_shared failed for %ld in device %d \n",bytes,devc);
  }
  return ptr;
 };
 inline void *acceleratorAllocDevice(size_t bytes)
 {
  printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l \n");
  int devc = omp_get_default_device();
  void *ptr=NULL;
  ptr = (void *) omp_target_alloc(bytes, devc);
  if( ptr == NULL ) {
    printf(" omp_target_alloc failed for %ld in device %d \n",bytes,devc);
  }
  return ptr;
 };
 inline void acceleratorFreeShared(void *ptr){omp_target_free(ptr, omp_get_default_device());};
 inline void acceleratorFreeDevice(void *ptr){omp_target_free(ptr, omp_get_default_device());};
 #endif
 //OpenMP CPU threads
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -36,7 +36,7 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-#define LMAX (8)
+#define LMAX (64)
 #define LMIN (8)
 #define LADD (8)
@@ -49,6 +49,7 @@ int main (int argc, char ** argv)
  int64_t threads = GridThread::GetThreads();
  int64_t accelerator_threads = acceleratorThreads();
  std::cout<<GridLogMessage << "Grid is setup with LMAX="<< LMAX << ", LMIN=" << LMIN << ", LADD=" << LADD << ", Nwarm, Nloop =" << Nwarm <<"," << Nloop <<std::endl;
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "Grid is setup to use "<<accelerator_threads<<" GPU threads"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -64,9 +65,10 @@ int main (int argc, char ** argv)
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      printf("line 67 \n");
-      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix z(&Grid); printf("z lattice color mat \n"); random(pRNG,z);
-      LatticeColourMatrix y(&Grid); random(pRNG,y);
+      LatticeColourMatrix x(&Grid); printf("x lattice color mat \n"); random(pRNG,x);
      LatticeColourMatrix y(&Grid); printf("y lattice color mat \n"); random(pRNG,y);
      for(int64_t i=0;i<Nwarm;i++){
 	x=x*y;
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,25 +1,26 @@
 #!/usr/bin/env bash
 set -e
-EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
+EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
 ##EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
 EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
 echo "-- deploying Eigen source..."
 ARC=`basename ${EIGEN_URL}`
 wget ${EIGEN_URL} --no-check-certificate
-if command -v sha256sum; then
+#if command -v sha256sum; then
-   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
+#   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
-      | sha256sum --check || exit 1
+#      | sha256sum --check || exit 1
-else
+#else
-   echo "WARNING: could not verify checksum, please install sha256sum" >&2
+#   echo "WARNING: could not verify checksum, please install sha256sum" >&2
-fi
+#fi
 ./scripts/update_eigen.sh ${ARC}
 rm ${ARC}
 # patch for non-portable includes in Eigen 3.3.5
 # apparently already fixed in Eigen HEAD so it should not be 
 # a problem in the future (A.P.)
-patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
+# patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
 echo '-- generating Make.inc files...'
 ./scripts/filelist