diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 9c24993a..e4b5bdd5 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -476,6 +476,12 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); //OpenMP Target Offloading #ifdef OMPTARGET +#include <omp.h> +extern "C" void *llvm_omp_target_alloc_host (size_t Size, int DeviceNum); +extern "C" void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum); +extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +//TODO: Dynamic Shared Memory + #define THREAD_LIMIT acceleratorThreads() #define accelerator @@ -507,17 +513,46 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); #endif accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) {;} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){;} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} -inline void acceleratorCopySynchronize(void) {;}; +inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) +{ + printf("copy to device start \n"); + int devc = omp_get_default_device(); + int host = omp_get_initial_device(); + if( omp_target_memcpy( to, from, bytes, 0, 0, devc, host ) ) { + printf(" omp_target_memcpy host to device failed for %ld in device %d \n",bytes,devc); + } + printf("copy to device end \n"); +}; +inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) +{ + printf("copy from device start \n"); + int devc = omp_get_default_device(); + int host = omp_get_initial_device(); + if( omp_target_memcpy( to, from, bytes, 0, 0, host, devc ) ) { + printf(" omp_target_memcpy device to host failed for %ld in device %d \n",bytes,devc); + } + printf("copy from device end \n"); +}; +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void 
*to,size_t bytes) { printf("TODO acceleratorCopyDeviceToDeviceAsynch");memcpy(to,from,bytes);} +inline void acceleratorCopySynchronize(void) {printf("TODO acceleratorCopySynchronize");}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } -inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) +{ + printf(" l-l-l-l-l-l-l-l-l-l-l-l-l OMPTARGET calling memset on host and copying to dev l-l-l-l-l-l-l-l-l-l-l-l \n"); + void *base_host = memalign(GRID_ALLOC_ALIGN,bytes); + memset(base_host,value,bytes); + int devc = omp_get_default_device(); + int host = omp_get_initial_device(); + if( omp_target_memcpy( base, base_host, bytes, 0, 0, devc, host ) ) { + printf(" omp_target_memcpy device to host failed in MemSet for %ld in device %d \n",bytes,devc); + } +}; #ifdef OMPTARGET_MANAGED #include <cuda_runtime.h> inline void *acceleratorAllocShared(size_t bytes) { + printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l \n"); void *ptr=NULL; auto err = cudaMallocManaged((void **)&ptr,bytes); if( err != cudaSuccess ) { @@ -530,10 +565,30 @@ inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);}; inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; inline void acceleratorFreeDevice(void *ptr){free(ptr);}; #else -inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; -inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; -inline void acceleratorFreeShared(void *ptr){free(ptr);}; -inline void acceleratorFreeDevice(void *ptr){free(ptr);}; +inline void *acceleratorAllocShared(size_t bytes) +{ + printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l \n"); + int devc = omp_get_default_device(); + void *ptr=NULL; + ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc); + if( ptr == 
NULL ) { + printf(" llvm_omp_target_alloc_shared failed for %ld in device %d \n",bytes,devc); + } + return ptr; +}; +inline void *acceleratorAllocDevice(size_t bytes) +{ + printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l \n"); + int devc = omp_get_default_device(); + void *ptr=NULL; + ptr = (void *) omp_target_alloc(bytes, devc); + if( ptr == NULL ) { + printf(" omp_target_alloc failed for %ld in device %d \n",bytes,devc); + } + return ptr; +}; +inline void acceleratorFreeShared(void *ptr){omp_target_free(ptr, omp_get_default_device());}; +inline void acceleratorFreeDevice(void *ptr){omp_target_free(ptr, omp_get_default_device());}; #endif //OpenMP CPU threads diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index bc51b1f7..49c0309c 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -36,7 +36,7 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); -#define LMAX (8) +#define LMAX (64) #define LMIN (8) #define LADD (8) @@ -49,6 +49,7 @@ int main (int argc, char ** argv) int64_t threads = GridThread::GetThreads(); int64_t accelerator_threads = acceleratorThreads(); + std::cout<({45,12,81,9})); - LatticeColourMatrix z(&Grid); random(pRNG,z); - LatticeColourMatrix x(&Grid); random(pRNG,x); - LatticeColourMatrix y(&Grid); random(pRNG,y); + printf("line 67 \n"); + LatticeColourMatrix z(&Grid); printf("z lattice color mat \n"); random(pRNG,z); + LatticeColourMatrix x(&Grid); printf("x lattice color mat \n"); random(pRNG,x); + LatticeColourMatrix y(&Grid); printf("y lattice color mat \n"); random(pRNG,y); for(int64_t i=0;i&2 -fi +#if command -v sha256sum; then +# echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \ +# | sha256sum --check || exit 1 +#else +# echo "WARNING: could not verify checksum, please install sha256sum" >&2 +#fi ./scripts/update_eigen.sh ${ARC} rm ${ARC} # patch for non-portable includes in Eigen 3.3.5 # apparently already fixed in Eigen HEAD so it 
should not be # a problem in the future (A.P.) -patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch +# patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch echo '-- generating Make.inc files...' ./scripts/filelist