mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 15:27:06 +01:00

removed print flags

This commit is contained in:
Mohammad Atif
2023-12-04 15:12:03 -05:00
parent 157368ed04
commit 867abeaf8e
9 changed files with 11566 additions and 95 deletions

View File

@@ -222,6 +222,9 @@ void MemoryManager::InitMessage(void) {
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#ifdef GRID_OMPTARGET
std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET omp_alloc_device"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
@@ -233,6 +236,9 @@ void MemoryManager::InitMessage(void) {
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#ifdef GRID_OMPTARGET
std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET managed memory"<<std::endl;
#endif
#endif
}
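(For context: the OMPTARGET path announced above ultimately rests on the standard OpenMP 5.x device-memory API. A minimal sketch of that allocation pattern; the device number and buffer are illustrative, not taken from this commit:

#include <omp.h>

int dev = omp_get_default_device();                                   // offload target
double *buf = (double *) omp_target_alloc(1024*sizeof(double), dev);  // device-resident allocation
if (buf != NULL) {
  // target regions may use the pointer via is_device_ptr(buf)
  omp_target_free(buf, dev);                                          // release on the same device
}
)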

View File

@@ -220,28 +220,22 @@ void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
std::cout << __FILE__ << " " << __LINE__ << " " << bytes << " " << DeviceLRUBytes << " " << DeviceMaxBytes << std::endl;
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(LRU.size()>0);
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
@@ -251,7 +245,6 @@ void MemoryManager::EvictVictims(uint64_t bytes)
return;
}
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
@@ -261,18 +254,13 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,hint);
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
auto AccCacheIterator = EntryLookup(CpuPtr);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
auto & AccCache = AccCacheIterator->second;
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if (!AccCache.AccPtr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(AccCache.cpuLock==0); // Programming error
@@ -286,7 +274,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
/*
* State transitions and actions
*
@@ -302,7 +289,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
@@ -321,7 +307,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
@@ -335,7 +320,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
@@ -343,7 +327,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
@@ -351,14 +334,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
} else {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(0);
}
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
LRUremove(AccCache);
@@ -367,7 +348,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
std::cout << __FILE__ << " " << __LINE__ << std::endl;
return AccCache.AccPtr;
}
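To make the locking discipline above concrete, a caller-side sketch (cpu_ptr and bytes are placeholders; the entry points are the ViewOpen/ViewClose pair defined in this file):

void *acc = MemoryManager::ViewOpen(cpu_ptr, bytes, AcceleratorRead, AdviseDefault);
// while open: accLock>0, the entry is held off the LRU, and acc is usable on the device
MemoryManager::ViewClose(cpu_ptr, AcceleratorRead);   // drops the lock, re-enters the LRU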
////////////////////////////////////

View File

@@ -78,7 +78,6 @@ private:
else
this->_odata = nullptr;
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
public:
@@ -86,17 +85,13 @@ public:
// Can be used to mark the accelerator copy dirty without a copy from host; useful for temporaries that "don't care" about previous contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
accessor.ViewClose();
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
MemoryManager::PrintState(this->_odata);
}
@@ -108,7 +103,6 @@ public:
LatticeView<vobj> View (ViewMode mode) const
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor;
}
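A usage sketch for the accessor above (the lattice type and grid pointer are illustrative): a view pins the data in the requested memory space until it is closed:

Lattice<vComplexD> x(grid);   // grid: a GridBase*, illustrative
auto xv = x.View(CpuWrite);   // open a host-side view
// ... write xv[ss] on the host ...
xv.ViewClose();               // release the lock taken at open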
@@ -123,7 +117,6 @@ public:
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
@@ -148,7 +141,6 @@ public:
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
@@ -173,7 +165,6 @@ public:
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
@@ -198,7 +189,6 @@ public:
//GridFromExpression is tricky to do
template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
@@ -214,7 +204,6 @@ public:
}
template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
@@ -230,7 +219,6 @@ public:
}
template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
@@ -246,7 +234,6 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
@@ -262,19 +249,16 @@ public:
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
SetViewMode(mode);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
// virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if (this->_grid != grid) {
this->_grid = grid;
this->resize(grid->oSites());
@@ -285,7 +269,6 @@ public:
// copy constructor
///////////////////////////////////////////
Lattice(const Lattice& r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
@@ -294,7 +277,6 @@ public:
// move constructor
///////////////////////////////////////////
Lattice(Lattice && r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
@@ -306,7 +288,6 @@ public:
// assignment template
///////////////////////////////////////////
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
@@ -323,7 +304,6 @@ public:
// Copy assignment
///////////////////////////////////////////
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto him= r.View(AcceleratorRead);
@@ -338,7 +318,6 @@ public:
// Move assignment possible if same type
///////////////////////////////////////////
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
resize(0); // deletes if appropriate
this->_grid = r.Grid();
@@ -356,24 +335,20 @@ public:
// *=,+=,-= operators inherit behaviour from corresponding */+/- operation
/////////////////////////////////////////////////////////////////////////////
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
*this = (*this)*r;
return *this;
}
template<class T> inline Lattice<vobj> &operator -=(const T &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
*this = (*this)-r;
return *this;
}
template<class T> inline Lattice<vobj> &operator +=(const T &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
*this = (*this)+r;
return *this;
}
friend inline void swap(Lattice &l, Lattice &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
conformable(l,r);
LatticeAccelerator<vobj> tmp;
LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l;
@@ -384,7 +359,6 @@ public:
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){

View File

@@ -32,6 +32,9 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
#ifdef GRID_OMPTARGET
device = omp_get_device_num();
#endif
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
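One caveat worth flagging for the OMPTARGET branch above: in OpenMP 5.x, omp_get_device_num() reports the device the calling thread is currently executing on, which is the host when called outside a target region; the offload target is instead named by omp_get_default_device(). A short sketch of the distinction:

int host   = omp_get_initial_device();  // the host device number
int here   = omp_get_device_num();      // == host when called from host code
int target = omp_get_default_device();  // device that target regions offload to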

View File

@@ -32,15 +32,14 @@ protected:
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { std::cout << __FILE__ << " " << __LINE__ << std::endl; };
accelerator_inline uint64_t oSites(void) const { std::cout << __FILE__ << " " << __LINE__ << std::endl; return _odata_size; };
accelerator_inline int Checkerboard(void) const { std::cout << __FILE__ << " " << __LINE__ << std::endl; return checkerboard; };
accelerator_inline int &Checkerboard(void) { std::cout << __FILE__ << " " << __LINE__ << std::endl; return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { std::cout << __FILE__ << " " << __LINE__ << std::endl; return advise; };
accelerator_inline ViewAdvise &Advise(void) { std::cout << __FILE__ << " " << __LINE__ << std::endl; return this->advise; }; // can assign advise on a container, not a view
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if (grid) conformable(grid, _grid);
else grid = _grid;
};
@@ -80,11 +79,10 @@ public:
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){ std::cout << __FILE__ << " " << __LINE__ << std::endl; }
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){ }
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->ViewOpen(mode);
}
@@ -92,16 +90,13 @@ public:
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
std::cout << __FILE__ << " " << __LINE__ << " " << this->cpu_ptr << " " << this->_odata_size*sizeof(vobj) << std::endl;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
void ViewClose(void)
{ // Inform the manager

View File

@@ -14,7 +14,15 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
#ifdef GRID_CUDA
#ifdef __CUDA_ARCH__
#warning "ifdef cuda arch"
#endif
// fold omptarget into device specific acceleratorInit()
#include <cuda_runtime_api.h>
//#if defined(GRID_CUDA) || (defined(GRID_OMPTARGET) && defined(__CUDA_ARCH__))
#if defined(GRID_OMPTARGET)
#warning "using cuda from opmtarget"
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t computeStream;
@@ -113,7 +121,7 @@ void acceleratorInit(void)
}
#endif
#ifdef GRID_HIP
#if defined(GRID_HIP) || (defined(GRID_OMPTARGET) && defined(__HIP_DEVICE_COMPILE__))
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t computeStream;
@@ -198,7 +206,7 @@ void acceleratorInit(void)
#endif
#ifdef GRID_SYCL
#if defined(GRID_SYCL) || (defined(GRID_OMPTARGET) && defined(__SYCL_DEVICE_ONLY__))
cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
@@ -270,7 +278,7 @@ void acceleratorInit(void)
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP)) && (!defined(GRID_OMPTARGET))
void acceleratorInit(void){}
#endif

View File

@@ -477,16 +477,20 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
// OpenMP Target acceleration
//////////////////////////////////////////////
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#undef GRID_SIMT
//OpenMP Target Offloading
#ifdef OMPTARGET
#ifdef GRID_OMPTARGET
//TODO GRID_SIMT for OMPTARGET
#define GRID_ACCELERATED
#include<omp.h>
#ifdef __CUDA_ARCH__
#include <cuda_runtime_api.h>
#elif defined __HIP_DEVICE_COMPILE__
#include <hip/hip_runtime.h>
#elif defined __SYCL_DEVICE_ONLY__
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#endif
extern "C" void *llvm_omp_target_alloc_host (size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
@@ -543,8 +547,33 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
}
std::cout << "D->H copy from device end "<<std::endl;
};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { printf("TODO acceleratorCopyDeviceToDeviceAsynch");memcpy(to,from,bytes);}
inline void acceleratorCopySynchronise(void) {printf("TODO acceleratorCopySynchronize");};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
printf("TODO acceleratorCopyDeviceToDeviceAsynch");//memcpy(to,from,bytes);
#ifdef __CUDA_ARCH__
extern cudaStream_t copyStream;
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
#elif defined __SYCL_DEVICE_ONLY__
theCopyAccelerator->memcpy(to,from,bytes);
#endif
};
inline void acceleratorCopySynchronise(void)
{
printf("TODO acceleratorCopySynchronize");
//#pragma omp barrier
#ifdef __CUDA_ARCH__
extern cudaStream_t copyStream;
cudaStreamSynchronize(copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipStreamSynchronize(copyStream);
#elif defined __SYCL_DEVICE_ONLY__
theCopyAccelerator->wait();
#endif
};
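Taken together, the intended calling pattern for the two routines above would be (buffer names illustrative):

acceleratorCopyDeviceToDeviceAsynch(src, dst, bytes);  // enqueue on the copy stream
// ... overlap independent host work here ...
acceleratorCopySynchronise();                          // block until the copy completes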
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes)
@@ -558,11 +587,10 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes)
printf(" omp_target_memcpy device to host failed in MemSet for %ld in device %d \n",bytes,devc);
}
};
#ifdef OMPTARGET_MANAGED
#include <cuda_runtime_api.h>
inline void *acceleratorAllocShared(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
#ifdef __CUDA_ARCH__
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED from CUDA l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
@@ -570,25 +598,22 @@ inline void *acceleratorAllocShared(size_t bytes)
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
//inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
#elif defined __HIP_DEVICE_COMPILE__
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED from HIP l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" hipMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#elif defined __SYCL_DEVICE_ONLY__
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED from SYCL l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
cl::sycl::queue q;
void *ptr = cl::sycl::malloc_shared(bytes, q);  // USM shared allocation
return ptr;
#else
inline void *acceleratorAllocShared(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET from LLVM l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
int devc = omp_get_default_device();
void *ptr=NULL;
ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc);
@@ -596,6 +621,7 @@ inline void *acceleratorAllocShared(size_t bytes)
printf(" llvm_omp_target_alloc_shared failed for %ld in device %d \n",bytes,devc);
}
return ptr;
#endif
};
inline void *acceleratorAllocDevice(size_t bytes)
{
@@ -610,7 +636,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
};
inline void acceleratorFreeShared(void *ptr){omp_target_free(ptr, omp_get_default_device());};
inline void acceleratorFreeDevice(void *ptr){omp_target_free(ptr, omp_get_default_device());};
#endif
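For reference, the device-allocation entry points above pair as follows; a minimal sketch using the LLVM runtime path (the size is illustrative):

void *p = acceleratorAllocDevice(1024);  // llvm_omp_target_alloc_device underneath
// ... use p from target regions ...
acceleratorFreeDevice(p);                // omp_target_free on the default device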
//OpenMP CPU threads
#else
@@ -644,6 +669,12 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) && (!defined(GRID_OMPTARGET))
#undef GRID_SIMT
#endif // CPU target
#ifdef HAVE_MM_MALLOC_H

configure (vendored executable file, 11471 lines changed)

File diff suppressed because it is too large

View File

@@ -229,13 +229,13 @@ case ${ac_ACC_CSHIFT} in
esac
############### SYCL/CUDA/HIP/none
############### SYCL/CUDA/HIP/OpenMP/none
AC_ARG_ENABLE([accelerator],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|openmp|none],[enable none,openmp,cuda,sycl,hip acceleration])],
[ac_ACCELERATOR=${enable_accelerator}], [ac_ACCELERATOR=none])
case ${ac_ACCELERATOR} in
cuda)
echo CUDA acceleration
echo CUDA acceleration ${ac_ACCELERATOR} ${enable_accelerator}
LIBS="${LIBS} -lcuda"
AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
sycl)
@@ -244,12 +244,15 @@ case ${ac_ACCELERATOR} in
hip)
echo HIP acceleration
AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);;
openmp)
echo OMPTARGET acceleration
AC_DEFINE([GRID_OMPTARGET],[1],[Use OMPTARGET offload]);;
none)
echo NO acceleration ;;
no)
echo NO acceleration ;;
*)
AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);;
AC_MSG_ERROR(["1Acceleration not suppoorted ${ac_ACCELERATOR}"]);;
esac
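With the new case in place, enabling the OpenMP target backend at configure time would look like the following; the compiler and offload flags are illustrative (here: clang targeting NVPTX), not mandated by this change:

./configure --enable-accelerator=openmp CXX=clang++ CXXFLAGS="-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda"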
############### UNIFIED MEMORY