mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

added file line traces

This commit is contained in:
Mohammad Atif 2023-07-03 11:18:13 -04:00
parent 3671ace5a1
commit cb277ae516
8 changed files with 111 additions and 34 deletions

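Every trace added here hand-writes the same std::cout << __FILE__ << " " << __LINE__ << std::endl idiom. As a sketch only (no such macro exists in this commit or in Grid), the probe could be centralised so it is greppable and compiles away when disabled:

#include <iostream>
// Hypothetical helper, not part of this commit: emits the same "file line"
// trace as the hand-written statements below when GRID_TRACE is defined,
// and expands to nothing otherwise.
#ifdef GRID_TRACE
#define TRACE_HERE() (std::cout << __FILE__ << " " << __LINE__ << std::endl)
#else
#define TRACE_HERE() ((void)0)
#endif
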
View File

@@ -200,20 +200,26 @@ void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
std::cout << __FILE__ << " " << __LINE__ << " " << bytes << " " << DeviceLRUBytes << " " << DeviceMaxBytes << std::endl;
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(LRU.size()>0);
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
@@ -221,6 +227,7 @@ void MemoryManager::EvictVictims(uint64_t bytes)
Evict(AccCache);
}
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
@@ -230,13 +237,18 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,hint);
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
auto AccCacheIterator = EntryLookup(CpuPtr);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
auto & AccCache = AccCacheIterator->second;
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if (!AccCache.AccPtr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(AccCache.cpuLock==0); // Programming error
@@ -249,6 +261,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
/*
* State transitions and actions
*
@@ -264,6 +277,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
@@ -281,6 +295,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
@@ -294,6 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
@@ -301,6 +317,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
@@ -308,11 +325,13 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
@@ -320,6 +339,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
std::cout << __FILE__ << " " << __LINE__ << std::endl;
return AccCache.AccPtr;
}
////////////////////////////////////

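The state-transition comment inside AcceleratorViewOpen above is what most of these traces bracket. A minimal sketch of that table for device-side opens (simplified names and signatures, not the Grid implementation):

// Sketch only: the cache-state transitions AcceleratorViewOpen documents.
// CpuDirty host data is copied in (or discarded for AccWriteDiscard), any
// accelerator write leaves the entry AccDirty, and AccDirty stays dirty.
enum class State { Empty, CpuDirty, Consistent, AccDirty };
enum class Mode  { AccRead, AccWrite, AccWriteDiscard };

State acceleratorOpen(State s, Mode m) {
  const bool writing = (m != Mode::AccRead);
  switch (s) {
  case State::Empty:      // device storage is allocated first
  case State::CpuDirty:   // host copy is newer: copy in unless discarding
  case State::Consistent:
    return writing ? State::AccDirty : State::Consistent;
  case State::AccDirty:   // device copy is already the newest
    return State::AccDirty;
  }
  return s; // unreachable with a valid state
}
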
View File

@@ -67,7 +67,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate its own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
ShmCommBuf = acceleratorAllocShared(bytes);
//ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocShared failed NULL pointer for " << bytes << " bytes " << std::endl;

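The change above switches the communication buffer from a device-only to a shared allocation, i.e. memory the host can also address (and in the OMPTARGET/CUDA branch later in this commit both paths end up in cudaMallocManaged anyway). A sketch of the distinction in plain CUDA runtime calls, not Grid code:

#include <cuda_runtime_api.h>

// Sketch, assuming a CUDA runtime: a managed ("shared") allocation is
// addressable from host and device; a plain device allocation is
// device-only and the host must go through explicit copies.
void allocation_kinds(size_t bytes) {
  void *shm = nullptr;
  cudaMallocManaged(&shm, bytes); // host- and device-accessible
  void *dev = nullptr;
  cudaMalloc(&dev, bytes);        // device-only; host uses cudaMemcpy
  cudaFree(shm);
  cudaFree(dev);
}
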
View File

@@ -78,6 +78,7 @@ private:
else
this->_odata = nullptr;
}
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
public:
@@ -85,13 +86,17 @@ public:
// Can be used to mark the accelerator copy dirty without a copy from host; useful for temporaries whose previous contents are "don't care"
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
accessor.ViewClose();
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
MemoryManager::PrintState(this->_odata);
}
@@ -103,6 +108,7 @@ public:
LatticeView<vobj> View (ViewMode mode) const
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor;
}
@@ -117,6 +123,7 @@ public:
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -140,6 +147,7 @@ public:
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -163,6 +171,7 @@ public:
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -186,6 +195,7 @@ public:
//GridFromExpression is tricky to do
template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
@@ -201,6 +211,7 @@ public:
}
template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
@@ -216,6 +227,7 @@ public:
}
template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
@@ -231,6 +243,7 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
@@ -246,16 +259,19 @@ public:
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
SetViewMode(mode);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
// virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if (this->_grid != grid) {
this->_grid = grid;
this->resize(grid->oSites());
@@ -266,6 +282,7 @@ public:
// copy constructor
///////////////////////////////////////////
Lattice(const Lattice& r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
@@ -274,6 +291,7 @@ public:
// move constructor
///////////////////////////////////////////
Lattice(Lattice && r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
@@ -285,6 +303,7 @@ public:
// assignment template
///////////////////////////////////////////
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
@@ -301,6 +320,7 @@ public:
// Copy assignment
///////////////////////////////////////////
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View(AcceleratorWriteDiscard);
@@ -315,6 +335,7 @@ public:
// Move assignment possible if same type
///////////////////////////////////////////
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
resize(0); // deletes if appropriate
this->_grid = r.Grid();
@@ -332,20 +353,24 @@ public:
// *=,+=,-= operators inherit behaviour from corresponding */+/- operation
/////////////////////////////////////////////////////////////////////////////
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
*this = (*this)*r;
return *this;
}
template<class T> inline Lattice<vobj> &operator -=(const T &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
*this = (*this)-r;
return *this;
}
template<class T> inline Lattice<vobj> &operator +=(const T &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
*this = (*this)+r;
return *this;
}
friend inline void swap(Lattice &l, Lattice &r) {
std::cout << __FILE__ << " " << __LINE__ << std::endl;
conformable(l,r);
LatticeAccelerator<vobj> tmp;
LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l;
@@ -356,6 +381,7 @@ public:
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::cout << __FILE__ << " " << __LINE__ << std::endl;
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){

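Nearly all the traces in this file sit where a Lattice hands out a LatticeView, which is exactly where MemoryManager::ViewOpen and ViewClose run. The discipline being instrumented, sketched against the Grid types already shown (assumes the Grid headers; not new API):

// Sketch of the open/close pairing the traces above bracket: View()
// routes through MemoryManager::ViewOpen and locks the cache entry;
// ViewClose() unlocks it so the block becomes evictable again.
template<class vobj>
void touch_on_host(Lattice<vobj> &field) {
  auto v = field.View(CpuRead); // ViewOpen: pointer translated, entry locked
  // ... read sites through v[ss] ...
  v.ViewClose();                // ViewClose: lock released, LRU updated
}
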
View File

@@ -243,11 +243,19 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// This code could use coalesced reads
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
//accelerator_for( ss, sites, 1,{
// auto x_l = left_v[ss];
// auto y_l = right_v[ss];
// inner_tmp_v[ss]=innerProductD(x_l,y_l);
//});
#pragma omp target map ( to:left_v, right_v ) map ( tofrom:inner_tmp_v )
#pragma omp teams distribute parallel for thread_limit(THREAD_LIMIT) //nowait
for ( uint64_t ss=0;ss<sites;ss++) {
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
}
}
// This is in single precision and fails some tests

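The replacement loop above is the plain OpenMP 4.5+ offload pattern: map the operand views to the device, map the accumulation buffer back, and spread the site loop over teams of device threads. A self-contained sketch with flat arrays standing in for the lattice views (the diff's THREAD_LIMIT clause omitted):

#include <cstdint>

// Standalone version of the offload pattern used above: inputs mapped
// 'to' the device, the output 'tofrom', and the site loop distributed
// across teams of device threads.
void site_products(const double *x, const double *y, double *out, uint64_t sites) {
#pragma omp target map(to: x[0:sites], y[0:sites]) map(tofrom: out[0:sites])
#pragma omp teams distribute parallel for
  for (uint64_t ss = 0; ss < sites; ss++) {
    out[ss] = x[ss] * y[ss];
  }
}
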
View File

@@ -32,14 +32,15 @@ protected:
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { std::cout << __FILE__ << " " << __LINE__ << std::endl; };
accelerator_inline uint64_t oSites(void) const { std::cout << __FILE__ << " " << __LINE__ << std::endl; return _odata_size; };
accelerator_inline int Checkerboard(void) const { std::cout << __FILE__ << " " << __LINE__ << std::endl; return checkerboard; };
accelerator_inline int &Checkerboard(void) { std::cout << __FILE__ << " " << __LINE__ << std::endl; return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { std::cout << __FILE__ << " " << __LINE__ << std::endl; return advise; };
accelerator_inline ViewAdvise &Advise(void) { std::cout << __FILE__ << " " << __LINE__ << std::endl; return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
if (grid) conformable(grid, _grid);
else grid = _grid;
};
@@ -79,10 +80,11 @@ public:
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){ std::cout << __FILE__ << " " << __LINE__ << std::endl; }
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->ViewOpen(mode);
}
@@ -90,13 +92,16 @@ public:
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
std::cout << __FILE__ << " " << __LINE__ << std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
std::cout << __FILE__ << " " << __LINE__ << " " << this->cpu_ptr << " " << this->_odata_size*sizeof(vobj) << std::endl;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
}
void ViewClose(void)
{ // Inform the manager

View File

@@ -453,7 +453,7 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define accelerator_for( iter, num, nsimd, ... ) \
@@ -515,23 +515,23 @@ extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
{
printf("copy to device start \n");
std::cout << "H->D copy to device start "<<std::endl;
int devc = omp_get_default_device();
int host = omp_get_initial_device();
if( omp_target_memcpy( to, from, bytes, 0, 0, devc, host ) ) {
printf(" omp_target_memcpy host to device failed for %ld in device %d \n",bytes,devc);
}
printf("copy to device end \n");
std::cout << "H->D copy to device end "<<std::endl;
};
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
{
printf("copy from device start \n");
std::cout << "D->H copy from device start "<<std::endl;
int devc = omp_get_default_device();
int host = omp_get_initial_device();
if( omp_target_memcpy( to, from, bytes, 0, 0, host, devc ) ) {
printf(" omp_target_memcpy device to host failed for %ld in device %d \n",bytes,devc);
}
printf("copy from device end \n");
std::cout << "D->H copy from device end "<<std::endl;
};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { printf("TODO acceleratorCopyDeviceToDeviceAsynch\n"); memcpy(to,from,bytes);}
inline void acceleratorCopySynchronize(void) {printf("TODO acceleratorCopySynchronize\n");};
@@ -539,7 +539,7 @@ inline void acceleratorCopySynchronize(void) {printf("TODO acceleratorCopySynchr
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes)
{
printf(" l-l-l-l-l-l-l-l-l-l-l-l-l OMPTARGET calling memset on host and copying to dev l-l-l-l-l-l-l-l-l-l-l-l \n");
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l OMPTARGET calling memset on host and copying to dev l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *base_host = memalign(GRID_ALLOC_ALIGN,bytes);
memset(base_host,value,bytes);
int devc = omp_get_default_device();
@@ -552,7 +552,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes)
#include <cuda_runtime_api.h>
inline void *acceleratorAllocShared(size_t bytes)
{
printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l \n");
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
@@ -562,12 +562,23 @@ inline void *acceleratorAllocShared(size_t bytes)
return ptr;
};
inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
//inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes)
{
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device from OMPTARGET MANAGED l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr){cudaFree(ptr);}; // managed allocations must be released with cudaFree, not free
#else
inline void *acceleratorAllocShared(size_t bytes)
{
printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l \n");
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating shared mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
int devc = omp_get_default_device();
void *ptr=NULL;
ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc);
@@ -578,7 +589,7 @@ inline void *acceleratorAllocShared(size_t bytes)
};
inline void *acceleratorAllocDevice(size_t bytes)
{
printf(" l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device mem from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l \n");
std::cout << " l-l-l-l-l-l-l-l-l-l-l-l-l Allocating device mem " << bytes << " Bytes from OMPTARGET l-l-l-l-l-l-l-l-l-l-l-l "<<std::endl;
int devc = omp_get_default_device();
void *ptr=NULL;
ptr = (void *) omp_target_alloc(bytes, devc);

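Taken together, the OMPTARGET branch allocates with omp_target_alloc (or llvm_omp_target_alloc_shared) and moves data with omp_target_memcpy, whose last two arguments are the destination and source device numbers. A short usage sketch of just the standard OpenMP 5 calls, error handling omitted:

#include <omp.h>
#include <cstddef>

int main() {
  const size_t bytes = 1024;
  char src[1024] = {0}, dst[1024];
  int devc = omp_get_default_device(); // the accelerator
  int host = omp_get_initial_device(); // the host

  void *dptr = omp_target_alloc(bytes, devc);            // device buffer
  omp_target_memcpy(dptr, src, bytes, 0, 0, devc, host); // H -> D
  omp_target_memcpy(dst, dptr, bytes, 0, 0, host, devc); // D -> H
  omp_target_free(dptr, devc);
  return 0;
}
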
View File

@@ -84,7 +84,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
@@ -96,7 +96,9 @@ int main (int argc, char ** argv)
pokeSite(tmp,src,origin);
}
#else
std::cout << GridLogMessage << "Drawing gauge field1" << std::endl;
RealD N2 = 1.0/::sqrt(norm2(src));
std::cout << GridLogMessage << "Drawing gauge field3" << std::endl;
src = src*N2;
#endif
@@ -218,8 +220,12 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//#pragma omp target is_device_ptr ( err.View(CpuWrite), ref.View(CpuWrite), result.View(CpuWrite) )
ref-result;
//err = ref-result;
std::cout<<GridLogMessage << "norm diff 0 "<<std::endl;
std::cout<<GridLogMessage << "norm diff "<< norm2(err) << " norm diff 1 " <<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {

View File

@@ -36,12 +36,12 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#define LMAX (64)
#define LMIN (8)
#define LMAX (31)
#define LMIN (31)
#define LADD (8)
int64_t Nwarm=0;
int64_t Nloop=1;
int64_t Nwarm=500;
int64_t Nloop=1500;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@@ -65,10 +65,10 @@ int main (int argc, char ** argv)
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
printf("line 67 \n");
LatticeColourMatrix z(&Grid); printf("z lattice color mat \n"); random(pRNG,z);
LatticeColourMatrix x(&Grid); printf("x lattice color mat \n"); random(pRNG,x);
LatticeColourMatrix y(&Grid); printf("y lattice color mat \n"); random(pRNG,y);
std::cout << __FILE__ << " " << __LINE__ << std::endl;
LatticeColourMatrix z(&Grid); std::cout << "z lattice color mat " << std::endl; random(pRNG,z);
LatticeColourMatrix x(&Grid); std::cout << "x lattice color mat " << std::endl; random(pRNG,x);
LatticeColourMatrix y(&Grid); std::cout << "y lattice color mat " << std::endl; random(pRNG,y);
for(int64_t i=0;i<Nwarm;i++){
x=x*y;