Fence propagation from SYCL

Merge branch 'develop' of https://github.com/paboyle/Grid into develop
Script update
2025-06-17 15:27:06 +01:00 · 2023-03-29 15:00:40 -04:00 · 2023-03-27 17:29:54 -07:00 · 2023-03-27 17:29:43 -07:00 · 2023-03-27 17:29:21 -07:00 · 2023-03-27 17:28:38 -07:00
33 changed files with 1148 additions and 171 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@ -45,7 +45,7 @@ directory
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
+#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@ -14,7 +14,7 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
+#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
@ -0,0 +1,213 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
    Copyright (C) 2015
    Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
 #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
 NAMESPACE_BEGIN(Grid);
 //Mixed precision restarted defect correction CG
 template<class FieldD,class FieldF, 
  typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
  typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
 public:
  using LinearFunction<FieldD>::operator();
  RealD   Tolerance;
  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
  Integer MaxInnerIterations;
  Integer MaxOuterIterations;
  Integer MaxPatchupIterations;
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;
  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
  LinearFunction<FieldF> *guesser;
  bool updateResidual;
  MixedPrecisionConjugateGradientBatched(RealD tol, 
          Integer maxinnerit, 
          Integer maxouterit, 
          Integer maxpatchit,
          GridBase* _sp_grid, 
          LinearOperatorBase<FieldF> &_Linop_f, 
          LinearOperatorBase<FieldD> &_Linop_d,
          bool _updateResidual=true) :
    Linop_f(_Linop_f), Linop_d(_Linop_d),
    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
  void useGuesser(LinearFunction<FieldF> &g){
    guesser = &g;
  }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::vector<FieldD> srcs_d_in{src_d_in};
    std::vector<FieldD> sols_d{sol_d};
    (*this)(srcs_d_in,sols_d);
    sol_d = sols_d[0];
  }
  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
    assert(src_d_in.size() == sol_d.size());
    int NBatch = src_d_in.size();
    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
    Integer TotalOuterIterations = 0; //Number of restarts
    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
    GridStopWatch TotalTimer;
    TotalTimer.Start();
    GridStopWatch InnerCGtimer;
    GridStopWatch PrecChangeTimer;
    int cb = src_d_in[0].Checkerboard();
    std::vector<RealD> src_norm;
    std::vector<RealD> norm;
    std::vector<RealD> stop;
    GridBase* DoublePrecGrid = src_d_in[0].Grid();
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
    FieldD tmp2_d(DoublePrecGrid);
    tmp2_d.Checkerboard() = cb;
    std::vector<FieldD> src_d;
    std::vector<FieldF> src_f;
    std::vector<FieldF> sol_f;
    for (int i=0; i<NBatch; i++) {
      sol_d[i].Checkerboard() = cb;
      src_norm.push_back(norm2(src_d_in[i]));
      norm.push_back(0.);
      stop.push_back(src_norm[i] * Tolerance*Tolerance);
      src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
      src_f.push_back(SinglePrecGrid);
      src_f[i].Checkerboard() = cb;
      sol_f.push_back(SinglePrecGrid);
      sol_f[i].Checkerboard() = cb;
    }
    RealD inner_tol = InnerTolerance;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      std::cout << GridLogMessage << std::endl;
      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
      bool allConverged = true;
      for (int i=0; i<NBatch; i++) {
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d[i], tmp_d);
        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
        PrecChangeTimer.Start();
        precisionChange(src_f[i], src_d[i]);
        PrecChangeTimer.Stop();
        sol_f[i] = Zero();
        if(norm[i] > OuterLoopNormMult * stop[i]) {
          allConverged = false;
        }
      }
      if (allConverged) break;
      if (updateResidual) {
        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
        CG_f.Tolerance = inner_tol;
      }
      //Optionally improve inner solver guess (eg using known eigenvectors)
      if(guesser != NULL) {
        (*guesser)(src_f, sol_f);
      }
      for (int i=0; i<NBatch; i++) {
        //Inner CG
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f[i], sol_f[i]);
        InnerCGtimer.Stop();
        TotalInnerIterations[i] += CG_f.IterationsToComplete;
        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f[i]);
        PrecChangeTimer.Stop();
        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
      }
    }
    //Final trial CG
    std::cout << GridLogMessage << std::endl;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
    for (int i=0; i<NBatch; i++) {
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
      CG_d(Linop_d, src_d_in[i], sol_d[i]);
      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
    }
    TotalTimer.Stop();
    std::cout << GridLogMessage << std::endl;
    for (int i=0; i<NBatch; i++) {
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
    }
    std::cout << GridLogMessage << std::endl;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
  }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuSmall (1)
+#define CpuHuge  (1)
-#define Acc      (2)
+#define CpuSmall (2)
-#define AccSmall (3)
+#define Acc      (3)
-#define Shared   (4)
+#define AccHuge  (4)
-#define SharedSmall (5)
+#define AccSmall (5)
 #define Shared   (6)
 #define SharedHuge  (7)
 #define SharedSmall (8)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
 }
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
 uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@ -170,6 +176,16 @@ void MemoryManager::Init(void)
    }
  }
  str= getenv("GRID_ALLOC_NCACHE_HUGE");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[CpuHuge]=Nc;
      Ncache[AccHuge]=Nc;
      Ncache[SharedHuge]=Nc;
    }
  }
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
 #ifdef GRID_UVM
@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache;
-  int cache = type + small;
+  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
  else                                     cache = type;
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
  if (ncache == 0) return ptr;
  void * ret = NULL;
  int v = -1;
@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache;
-  int cache = type+small;
+  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
  else                                     cache = type;
  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@ -35,6 +35,12 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define GRID_ALLOC_SMALL_LIMIT (4096)
 #define GRID_ALLOC_HUGE_LIMIT  (2147483648)
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 #define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
 #define AUDIT(a) MemoryManager::Audit(FILE_LINE)
 /*Pinning pages is costly*/
 ////////////////////////////////////////////////////////////////////////////
@ -65,6 +71,21 @@ enum ViewMode {
  CpuWriteDiscard = 0x10 // same for now
 };
 struct MemoryStatus {
  uint64_t     DeviceBytes;
  uint64_t     DeviceLRUBytes;
  uint64_t     DeviceMaxBytes;
  uint64_t     HostToDeviceBytes;
  uint64_t     DeviceToHostBytes;
  uint64_t     HostToDeviceXfer;
  uint64_t     DeviceToHostXfer;
  uint64_t     DeviceEvictions;
  uint64_t     DeviceDestroy;
  uint64_t     DeviceAllocCacheBytes;
  uint64_t     HostAllocCacheBytes;
 };
 class MemoryManager {
 private:
@ -78,7 +99,7 @@ private:
  } AllocationCacheEntry;
  static const int NallocCacheMax=128; 
-  static const int NallocType=6;
+  static const int NallocType=9;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
@ -92,8 +113,9 @@ private:
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
  static void PrintBytes(void);
 public:
  static void PrintBytes(void);
  static void Audit(std::string s);
  static void Init(void);
  static void InitMessage(void);
  static void *AcceleratorAllocate(size_t bytes);
@ -113,6 +135,27 @@ private:
  static uint64_t     DeviceToHostBytes;
  static uint64_t     HostToDeviceXfer;
  static uint64_t     DeviceToHostXfer;
  static uint64_t     DeviceEvictions;
  static uint64_t     DeviceDestroy;
  static uint64_t     DeviceCacheBytes();
  static uint64_t     HostCacheBytes();
  static MemoryStatus GetFootprint(void) {
    MemoryStatus stat;
    stat.DeviceBytes       = DeviceBytes;
    stat.DeviceLRUBytes    = DeviceLRUBytes;
    stat.DeviceMaxBytes    = DeviceMaxBytes;
    stat.HostToDeviceBytes = HostToDeviceBytes;
    stat.DeviceToHostBytes = DeviceToHostBytes;
    stat.HostToDeviceXfer  = HostToDeviceXfer;
    stat.DeviceToHostXfer  = DeviceToHostXfer;
    stat.DeviceEvictions   = DeviceEvictions;
    stat.DeviceDestroy     = DeviceDestroy;
    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
    stat.HostAllocCacheBytes   = HostCacheBytes();
    return stat;
  };
 private:
 #ifndef GRID_UVM
@ -170,6 +213,7 @@ private:
 public:
  static void Print(void);
  static void PrintAll(void);
  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -3,8 +3,13 @@
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+
-#define dprintf(...)
+#define MAXLINE 512
 static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
 #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
 //#define dprintf(...) 
 ////////////////////////////////////////////////////////////
@ -23,6 +28,8 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 uint64_t  MemoryManager::DeviceEvictions;
 uint64_t  MemoryManager::DeviceDestroy;
 ////////////////////////////////////
 // Priority ordering for unlocked entries
@ -104,15 +111,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceDestroy++;
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    AccCache.AccPtr=(uint64_t) NULL;
    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -121,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 {
  ///////////////////////////////////////////////////////////////////////////
-  // Make CPU consistent, remove from Accelerator, remove entry
+  // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
-  // Cannot be locked. If allocated must be in LRU pool.
+  // Cannot be acclocked. If allocated must be in LRU pool.
  //
  // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
  //                          and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
  //                          but there is a weakness where CpuLock entries are attempted for erase
  //                          Take these OUT LRU queue when CPU locked?
  //                          Cannot take out the table as cpuLock data is important.
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
-  assert(AccCache.accLock==0);
+	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
-  assert(AccCache.cpuLock==0);
+	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  assert(AccCache.accLock==0); // Cannot evict so logic bomb
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.state==AccDirty) {
    Flush(AccCache);
  }
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
-  uint64_t CpuPtr = AccCache.CpuPtr;
+  //  uint64_t CpuPtr = AccCache.CpuPtr;
-  EntryErase(CpuPtr);
+  DeviceEvictions++;
  //  EntryErase(CpuPtr);
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
@ -150,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -165,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@ -191,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@ -202,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -212,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
      assert(LRU.size()>0);
-      uint64_t victim = LRU.back();
+      uint64_t victim = LRU.back(); // From the LRU
      auto AccCacheIterator = EntryLookup(victim);
      auto & AccCache = AccCacheIterator->second;
      Evict(AccCache);
    } else {
      return;
    }
  }
 }
@ -241,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error
  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
+    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
-		    (uint64_t)bytes);
+	            (uint64_t)bytes,
 		    (uint64_t)AccCache.accLock);
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes  ==bytes);
  }
@ -280,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@ -292,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
  } else {
    assert(0);
  }
-  // If view is opened on device remove from LRU
+  assert(AccCache.accLock>0);
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
    dprintf("AccCache entry removed from LRU \n");
    LRUremove(AccCache);
  }
@ -334,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  assert(AccCache.accLock>0);
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@ -374,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
-  if (!AccCache.AccPtr) {
+  // CPU doesn't need to free space
-     EvictVictims(bytes);
+  //  if (!AccCache.AccPtr) {
-  }
+  //    EvictVictims(bytes);
  //  }
  assert((mode==CpuRead)||(mode==CpuWrite));
  assert(AccCache.accLock==0);  // Programming error
@ -430,20 +461,28 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 void  MemoryManager::Print(void)
 {
  PrintBytes();
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
+  std::cout << GridLogMessage << "Memory Manager                             " << std::endl;
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
+  std::cout << GridLogMessage << DeviceBytes   << " bytes allocated on device " << std::endl;
-  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
+  std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
-  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
+  std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device       " << std::endl;
-  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
+  std::cout << GridLogMessage << HostToDeviceXfer << " transfers        to   device " << std::endl;
-  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
+  std::cout << GridLogMessage << DeviceToHostXfer << " transfers        from device " << std::endl;
-  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
+  std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
-  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
+  std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
-  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
+  std::cout << GridLogMessage << DeviceEvictions  << " Evictions from device " << std::endl;
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << DeviceDestroy    << " Destroyed vectors on device " << std::endl;
-  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 }
 void  MemoryManager::PrintAll(void)
 {
  Print();
  std::cout << GridLogMessage << std::endl;
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
    auto &AccCache = it->second;
@ -453,13 +492,13 @@ void  MemoryManager::Print(void)
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");
-    std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 	      << "\t" << AccCache.cpuLock
 	      << "\t" << AccCache.accLock
 	      << "\t" << AccCache.LRU_valid<<std::endl;
  }
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 };
 int   MemoryManager::isOpen   (void* _CpuPtr) 
@ -473,6 +512,61 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
    return 0;
  }
 }
 void MemoryManager::Audit(std::string s)
 {
  uint64_t CpuBytes=0;
  uint64_t AccBytes=0;
  uint64_t LruBytes1=0;
  uint64_t LruBytes2=0;
  uint64_t LruCnt=0;
  uint64_t LockedBytes=0;
  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
  for(auto it=LRU.begin();it!=LRU.end();it++){
    uint64_t cpuPtr = *it;
    assert(EntryPresent(cpuPtr));
    auto AccCacheIterator = EntryLookup(cpuPtr);
    auto & AccCache = AccCacheIterator->second;
    LruBytes2+=AccCache.bytes;
    assert(AccCache.LRU_valid==1);
    assert(AccCache.LRU_entry==it);
  }
  std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
    auto &AccCache = it->second;
    std::string str;
    if ( AccCache.state==Empty    ) str = std::string("Empty");
    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");
    CpuBytes+=AccCache.bytes;
    if( AccCache.AccPtr )    AccBytes+=AccCache.bytes;
    if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
    if( AccCache.LRU_valid ) LruCnt++;
    if ( AccCache.cpuLock || AccCache.accLock ) {
      assert(AccCache.LRU_valid==0);
      std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 		<< "\t cpuLock  " << AccCache.cpuLock
 		<< "\t accLock  " << AccCache.accLock
 		<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
    }
    assert( AccCache.cpuLock== 0 ) ;
    assert( AccCache.accLock== 0 ) ;
  }
  std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
  assert(LruBytes1==LruBytes2);
  assert(LruBytes1==DeviceLRUBytes);
  std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
  assert(AccBytes==DeviceBytes);
  std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
  assert(LruCnt == LRU.size());
  std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 }
 void MemoryManager::PrintState(void* _CpuPtr)
 {
@ -489,8 +583,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
-    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+    std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
-    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
+    << "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
    << "\t" << AccCache.cpuLock
    << "\t" << AccCache.accLock
    << "\t" << AccCache.LRU_valid<<std::endl;
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@ -12,7 +12,10 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 uint64_t  MemoryManager::DeviceEvictions;
 uint64_t  MemoryManager::DeviceDestroy;
 void  MemoryManager::Audit(std::string s){};
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
@ -21,6 +24,7 @@ void  MemoryManager::PrintState(void* CpuPtr)
 std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 };
 void  MemoryManager::Print(void){};
 void  MemoryManager::PrintAll(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};
 NAMESPACE_END(Grid);
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -401,8 +401,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  //   std::cout << "Copy Synchronised\n"<<std::endl;
  acceleratorCopySynchronise();
  int nreq=list.size();
  if (nreq==0) return;
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -36,10 +36,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL
-
+#define GRID_SYCL_LEVEL_ZERO_IPC
 #endif
 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryMpi: "
 /*Construct from an MPI communicator*/
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 template <typename T>
 T iDivUp(T a, T b) // Round a / b to nearest higher integer value
 { return (a % b != 0) ? (a / b + 1) : (a / b); }
 template <typename T>
 __global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
 {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= e1*e2) return;
    int n, b, o;
    n = idx / e2;
    b = idx % e2;
    o = n*stride + b;
    vector[2*idx + 0] = lo + o;
    vector[2*idx + 1] = ro + o;
 }
 #endif
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;
  if(cbmask == 0x3 ){
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
 #endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;
  if ( cbmask == 0x3 ) {
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
 #endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -288,8 +288,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
    auto me =   View(AcceleratorWriteDiscard);
    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@ -303,8 +303,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
    auto me =   View(AcceleratorWriteDiscard);
    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@ -0,0 +1,55 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -156,33 +156,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 }
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
  Integer osites = arg.Grid()->oSites();
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
-  ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
  auto ssum = rankSum(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
 template<class vobj>
-inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu_large(&arg_v[0],osites);
+  return sum_gpu_large(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
 }
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
  auto ssum = rankSumLarge(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
  }
 }
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                               const std::vector<Lattice<vobj>> &fineData,
                               const VLattice &Basis)
 {
  int NBatch = fineData.size();
  assert(coarseData.size() == NBatch);
  GridBase * fine  = fineData[0].Grid();
  GridBase * coarse= coarseData[0].Grid();
  Lattice<iScalar<CComplex>> ip(coarse);
  std::vector<Lattice<vobj>> fineDataCopy = fineData;
  autoView(ip_, ip, AcceleratorWrite);
  for(int v=0;v<nbasis;v++) {
    for (int k=0; k<NBatch; k++) {
      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
        convertType(coarseData_[sc](v),ip_[sc]);
      });
      // improve numerical stability of projection
      // |fine> = |fine> - <basis|fine> |basis>
      ip=-ip;
      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); 
    }
  }
 }
 template<class vobj,class vobj2,class CComplex>
  inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                               std::vector<Lattice<vobj>> &fineData,
                               const VLattice &Basis)
 {
  int NBatch = coarseData.size();
  assert(fineData.size() == NBatch);
  GridBase * fine   = fineData[0].Grid();
  GridBase * coarse = coarseData[0].Grid();
  for (int k=0; k<NBatch; k++)
    fineData[k]=Zero();
  for (int i=0;i<nbasis;i++) {
    for (int k=0; k<NBatch; k++) {
      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
    }
  }
 }
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@ -65,29 +65,40 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
 GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogDslash     (1, "Dslash", GridLogColours, "BLUE");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
-  GridLogError.Active(0);
+  GridLogError.Active(1);
  GridLogWarning.Active(0);
  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogMemory.Active(0); 
  GridLogTracing.Active(0); 
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogDslash.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
-    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
+    if (logstreams[i] == std::string("Tracing"))     GridLogTracing.Active(1);
    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("Dslash"))      GridLogDslash.Active(1);
    if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))       GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@ -138,7 +138,8 @@ public:
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
-      stream << log.colour() <<  std::left;
+      //      stream << log.colour() <<  std::left;
      stream <<  std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
@ -153,9 +154,9 @@ public:
 	stream << log.evidence()
 	       << now	       << log.background() << " : " ;
      }
-      stream << log.colour();
+      //      stream << log.colour();
      stream <<  std::right;
      stream.flags(f);
      return stream;
    } else { 
      return devnull;
@ -180,8 +181,12 @@ extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogDslash;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern GridLogger GridLogMemory;
 extern GridLogger GridLogTracing;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/perfmon/PerfCount.cc
+++ b/Grid/perfmon/PerfCount.cc
@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/perfmon/Timer.h>
 #include <Grid/perfmon/PerfCount.h>
 NAMESPACE_BEGIN(Grid);
 GridTimePoint theProgramStart = GridClock::now();
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 #define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_PERFCOUNT_H
 #define GRID_PERFCOUNT_H
 #ifndef __SSC_START
 #define __SSC_START
 #define __SSC_STOP
 #endif
 #include <sys/time.h>
 #include <ctime>
 #include <chrono>
@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 inline uint64_t cyclecount(void){ 
  return 0;
 }
 #define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 #define __SSC_STOP  __SSC_MARK(0x110)
 #define __SSC_START __SSC_MARK(0x111)
 #else
 #define __SSC_MARK(mark) 
 #define __SSC_STOP  
 #define __SSC_START 
 /*
 * cycle counters arch dependent
 */
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid)
-// Dress the output; use std::chrono
+//typedef  std::chrono::system_clock          GridClock;
-// C++11 time facilities better?
+typedef  std::chrono::high_resolution_clock   GridClock;
 inline double usecond(void) {
  struct timeval tv;
 #ifdef TIMERS_ON
  gettimeofday(&tv,NULL);
 #endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
 typedef  std::chrono::seconds               GridSecs;
@ -53,6 +44,15 @@ typedef  std::chrono::milliseconds          GridMillisecs;
 typedef  std::chrono::microseconds          GridUsecs;
 typedef  std::chrono::microseconds          GridTime;
 extern GridTimePoint theProgramStart;
 // Dress the output; use std::chrono
 // C++11 time facilities better?
 inline double usecond(void) {
  auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart); 
  return 1.0*usecs.count();
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
  stream << time.count()<<" s";
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@ -0,0 +1,70 @@
 #pragma once
 NAMESPACE_BEGIN(Grid);
 #ifdef GRID_TRACING_NVTX
 #include <nvToolsExt.h>
 class GridTracer {
 public:
  GridTracer(const char* name) {
    nvtxRangePushA(name);
  }
  ~GridTracer() {
    nvtxRangePop();
  }
 };
 inline void tracePush(const char *name) { nvtxRangePushA(name); }
 inline void tracePop(const char *name) { nvtxRangePop(); }
 inline int  traceStart(const char *name) {  }
 inline void traceStop(int ID) {  }
 #endif
 #ifdef GRID_TRACING_ROCTX
 #include <roctracer/roctx.h>
 class GridTracer {
 public:
  GridTracer(const char* name) {
    roctxRangePushA(name);
    std::cout << "roctxRangePush "<<name<<std::endl;
  }
  ~GridTracer() {
    roctxRangePop();
    std::cout << "roctxRangePop "<<std::endl;
  }
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
 inline int  traceStart(const char *name) { roctxRangeStart(name); }
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif
 #ifdef GRID_TRACING_TIMER
 class GridTracer {
 public:
  const char *name;
  double elapsed;
  GridTracer(const char* _name) {
    name = _name;
    elapsed=-usecond();
  }
  ~GridTracer() {
    elapsed+=usecond();
    std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
  }
 };
 inline void tracePush(const char *name) {  }
 inline void tracePop(const char *name) {  }
 inline int  traceStart(const char *name) { return 0; }
 inline void traceStop(int ID) {  }
 #endif
 #ifdef GRID_TRACING_NONE
 #define GRID_TRACE(name) 
 inline void tracePush(const char *name) {  }
 inline void tracePop(const char *name) {  }
 inline int  traceStart(const char *name) { return 0;  }
 inline void traceStop(int ID) {  }
 #else
 #define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
 #endif
 NAMESPACE_END(Grid);
--- a/Grid/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@ -16,7 +16,7 @@
 #ifdef __NVCC__
 #pragma push
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
+#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -456,9 +456,9 @@ template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorWrite);
+  autoView(p_v,p,CpuWrite);
-  autoView(f_v,f,AcceleratorRead);
+  autoView(f_v,f,CpuRead);
-  accelerator_for(idx,p_v.oSites(),1,{
+  thread_for(idx,p_v.oSites(),{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
@ -484,9 +484,9 @@ template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorWrite);
+  autoView(p_v,p,CpuRead);
-  autoView(f_v,f,AcceleratorRead);
+  autoView(f_v,f,CpuWrite);
-  accelerator_for(idx,p_v.oSites(),1,{
+  thread_for(idx,p_v.oSites(),{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -459,11 +459,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 #ifdef SYCL_HACK     
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl);    return; }
 #else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #endif     
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
@ -474,6 +470,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
 #ifndef GRID_CUDA
@ -498,10 +495,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
     acceleratorFenceComputeStream();
   } else if( interior ) {
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -398,6 +398,8 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    // Buffers are gathered AND synchronised
    // Copies are MPI ISend OR asynch copy on copy stream
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
@ -410,14 +412,18 @@ public:
      comms_bytes+=bytes;
      shm_bytes  +=2*Packets[i].bytes-bytes;
    }
    _grid->StencilBarrier();// Synch shared memory on a single nodes
  }
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    // complete intranode
    acceleratorCopySynchronise();
    // complete MPI
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromComplete(reqs[i],i);
    }
    // Everyone agrees we are all done
    _grid->StencilBarrier(); 
    commtime+=usecond();
  }
  ////////////////////////////////////////////////////////////////////////
@ -425,34 +431,10 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void Communicate(void)
  {
    if ( 0 ){
      thread_region {
 	// must be called in parallel region
 	int mythread  = thread_num();
 	int maxthreads= thread_max();
 	int nthreads = CartesianCommunicator::nCommThreads;
 	assert(nthreads <= maxthreads);
 	if (nthreads == -1) nthreads = 1;
 	if (mythread < nthreads) {
 	  for (int i = mythread; i < Packets.size(); i += nthreads) {
 	    double start = usecond();
 	    uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 							 Packets[i].to_rank,
 							 Packets[i].recv_buf,
 							 Packets[i].from_rank,
 							 Packets[i].bytes,i);
 	    comm_bytes_thr[mythread] += bytes;
 	    shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
 	    comm_time_thr[mythread]  += usecond() - start;
 	  }
 	}
      }
    } else { // Concurrent and non-threaded asynch calls to MPI
    std::vector<std::vector<CommsRequest_t> > reqs;
    this->CommunicateBegin(reqs);
    this->CommunicateComplete(reqs);
  }
  }
  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
  {
@ -527,7 +509,6 @@ public:
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime_g+=usecond();
    // conformable(source.Grid(),_grid);
    assert(source.Grid()==_grid);
    halogtime-=usecond();
@ -586,13 +567,8 @@ public:
    CommsMerge(decompress,Mergers,Decompressions);
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
-    mpi3synctime-=usecond();
+    assert(MergersSHM.size()==0);
-    accelerator_barrier();
+    assert(DecompressionsSHM.size()==0);
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime+=usecond();
    shmmergetime-=usecond();
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
    shmmergetime+=usecond();
  }
  template<class decompressor>
@ -609,6 +585,7 @@ public:
 	  decompress.Exchange(mp,vp0,vp1,type,o);
      });
    }
    if ( mm.size() )    acceleratorFenceComputeStream();
    mergetime+=usecond();
    decompresstime-=usecond();
@ -619,7 +596,9 @@ public:
 	decompress.Decompress(kp,mp,o);
      });
    }
    if ( dd.size() )    acceleratorFenceComputeStream();
    decompresstime+=usecond();
  }
  ////////////////////////////////////////
  // Set up routines
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -249,14 +249,16 @@ inline int  acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 #ifdef GRID_SYCL
 NAMESPACE_END(Grid);
 #if 0
 #include <CL/sycl.hpp>
 #include <CL/sycl/usm.hpp>
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
 #else
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
 NAMESPACE_BEGIN(Grid);
--- a/systems/PVC/benchmarks/run-1tile.sh
+++ b/systems/PVC/benchmarks/run-1tile.sh
@ -4,7 +4,7 @@
 #SBATCH -p QZ1J-ICX-PVC
 ##SBATCH -p QZ1J-SPR-PVC-2C
-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 export NT=8
--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@ -4,7 +4,7 @@
 #SBATCH -p QZ1J-ICX-PVC
-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 export NT=16
@ -19,16 +19,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
-export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
-for i in 0 
+for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
 do
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 1.1.1.2.log$i
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 2.1.1.1.log$i 
 done
 #mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
 #mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
--- a/systems/PVC/benchmarks/wrap.sh
+++ b/systems/PVC/benchmarks/wrap.sh
@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
 echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
-if [ $MPI_LOCALRANKID = "0" ] 
+#if [ $MPI_LOCALRANKID = "0" ] 
-then
+#then
 #  ~psteinbr/build_pti/ze_tracer -h $@
-  onetrace --chrome-device-timeline $@
+#  onetrace --chrome-device-timeline $@
-else
+#else
  $@
-fi
+#fi
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@ -1 +1 @@
-CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
+CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=no
--- a/tests/core/Test_fft_matt.cc
+++ b/tests/core/Test_fft_matt.cc
@ -0,0 +1,270 @@
    /*************************************************************************************
    grid` physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_cshift.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT,
  Gamma::Algebra::Gamma5
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  int vol = 1;
  for(int d=0;d<latt_size.size();d++){
    vol = vol * latt_size[d];
  }
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBGRID(&GRID);
  LatticeComplexD    coor(&GRID);
  ComplexD ci(0.0,1.0);
  std::vector<int> seeds({1,2,3,4});
  GridSerialRNG          sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
  GridParallelRNG          pRNG(&GRID);
  pRNG.SeedFixedIntegers(seeds);
  LatticeGaugeFieldD Umu(&GRID);
  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
  ////////////////////////////////////////////////////
  // Wilson test
  ////////////////////////////////////////////////////
  {
    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
    LatticeFermionD    src_p(&GRID);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    result(&GRID);
    RealD mass=0.1;
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
    Dw.M(src,ref);
    std::cout << "Norm src "<<norm2(src)<<std::endl;
    std::cout << "Norm Dw x src "<<norm2(ref)<<std::endl;
    {
      FFT theFFT(&GRID);
      ////////////////
      // operator in Fourier space
      ////////////////
      tmp =ref;
      theFFT.FFT_all_dim(result,tmp,FFT::forward);
      std::cout<<"FFT[ Dw x src ]  "<< norm2(result)<<std::endl;    
      tmp = src;
      theFFT.FFT_all_dim(src_p,tmp,FFT::forward);
      std::cout<<"FFT[ src      ]  "<< norm2(src_p)<<std::endl;
      /////////////////////////////////////////////////////////////////
      // work out the predicted FT from Fourier
      /////////////////////////////////////////////////////////////////
      auto FGrid = &GRID;
      LatticeFermionD    Kinetic(FGrid); Kinetic = Zero();
      LatticeComplexD    kmu(FGrid); 
      LatticeInteger     scoor(FGrid); 
      LatticeComplexD    sk (FGrid); sk = Zero();
      LatticeComplexD    sk2(FGrid); sk2= Zero();
      LatticeComplexD    W(FGrid); W= Zero();
      LatticeComplexD    one(FGrid); one =ComplexD(1.0,0.0);
      ComplexD ci(0.0,1.0);
      for(int mu=0;mu<Nd;mu++) {
 	RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 	LatticeCoordinate(kmu,mu);
 	kmu = TwoPiL * kmu;
 	sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
 	sk  = sk  +     sin(kmu)    *sin(kmu); 
 	// -1/2 Dw ->  1/2 gmu (eip - emip) = i sinp gmu
 	Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p);
      }
      W = mass + sk2; 
      Kinetic = Kinetic + W * src_p;
      std::cout<<"Momentum space src         "<< norm2(src_p)<<std::endl;
      std::cout<<"Momentum space Dw x src    "<< norm2(Kinetic)<<std::endl;
      std::cout<<"FT[Coordinate space Dw]    "<< norm2(result)<<std::endl;
      result = result - Kinetic;
      std::cout<<"diff "<< norm2(result)<<std::endl;
    }
    std::cout << " =======================================" <<std::endl;
    std::cout << " Checking FourierFreePropagator x Dw = 1" <<std::endl;
    std::cout << " =======================================" <<std::endl;
    std::cout << "Dw src = " <<norm2(src)<<std::endl;
    std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
    Dw.M(src,tmp);
    Dw.FreePropagator(tmp,ref,mass);
    std::cout << "Dw ref = " <<norm2(ref)<<std::endl;
    ref = ref - src;
    std::cout << "Dw ref-src = " <<norm2(ref)<<std::endl;
  }
  ////////////////////////////////////////////////////
  // Wilson prop
  ////////////////////////////////////////////////////
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Wilson Mom space 4d propagator \n";
    std::cout<<"****************************************"<<std::endl;
    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);
    src=Zero();
    Coordinate point(4,0); // 0,0,0,0
    SpinColourVectorD ferm;
    ferm=Zero();
    ferm()(0)(0) = ComplexD(1.0);
    pokeSite(ferm,src,point);
    RealD mass=0.1;
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    Dw.FreePropagator(src,ref,mass) ;
    Gamma G5(Gamma::Algebra::Gamma5);
    LatticeFermionD    result(&GRID); 
    const int sdir=0;
    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dw.Mdag(src,tmp);
    src=tmp;
    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
    CG(HermOp,src,result);
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Dw result "<<norm2(result)<<std::endl;
    std::cout << "Dw ref     "<<norm2(ref)<<std::endl;
    diff = ref - result;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
  }
  ////////////////////////////////////////////////////
  //Gauge invariance test
  ////////////////////////////////////////////////////
  {
    std::cout<<"****************************************"<<std::endl;
    std::cout << "Gauge invariance test \n";
    std::cout<<"****************************************"<<std::endl;
    LatticeGaugeField     U_GT(&GRID); // Gauge transformed field
    LatticeColourMatrix   g(&GRID);    // local Gauge xform matrix
    U_GT = Umu;
    // Make a random xform to teh gauge field
    SU<Nc>::RandomGaugeTransform(pRNG,U_GT,g); // Unit gauge
    LatticeFermionD    src(&GRID);
    LatticeFermionD    tmp(&GRID);
    LatticeFermionD    ref(&GRID);
    LatticeFermionD    diff(&GRID);
    // could loop over colors
    src=Zero();
    Coordinate point(4,0); // 0,0,0,0
    SpinColourVectorD ferm;
    ferm=Zero();
    ferm()(0)(0) = ComplexD(1.0);
    pokeSite(ferm,src,point);
    RealD mass=0.1;
    WilsonFermionD Dw(U_GT,GRID,RBGRID,mass);
    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    Dw.FreePropagator(src,ref,mass) ;
    Gamma G5(Gamma::Algebra::Gamma5);
    LatticeFermionD    result(&GRID); 
    const int sdir=0;
    ////////////////////////////////////////////////////////////////////////
    // Conjugate gradient on normal equations system
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
    Dw.Mdag(src,tmp);
    src=tmp;
    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
    CG(HermOp,src,result);
    ////////////////////////////////////////////////////////////////////////
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Dw result "<<norm2(result)<<std::endl;
    std::cout << "Dw ref     "<<norm2(ref)<<std::endl;
    diff = ref - result;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
  }
  Grid_finalize();
 }
--- a/tests/core/Test_memory_manager.cc
+++ b/tests/core/Test_memory_manager.cc
@ -0,0 +1,110 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_memory_manager.cc
    Copyright (C) 2022
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 void  MemoryTest(GridCartesian         * FGrid,int N);
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  int N=100;
  for(int i=0;i<N;i++){
    std::cout << "============================"<<std::endl;
    std::cout << "Epoch "<<i<<"/"<<N<<std::endl;
    std::cout << "============================"<<std::endl;
    MemoryTest(UGrid,256);
    MemoryManager::Print();
    AUDIT();
  }
  Grid_finalize();
 }
 void  MemoryTest(GridCartesian         * FGrid, int N)
 {
  LatticeComplexD zero(FGrid); zero=Zero();
  std::vector<LatticeComplexD> A(N,zero);//FGrid);
  std::vector<ComplexD> B(N,ComplexD(0.0)); // Update sequentially on host
  for(int v=0;v<N;v++) A[v] = Zero();
  uint64_t counter = 0;
  for(int epoch = 0;epoch<10000;epoch++){
    int v  = random() %N; // Which vec
    int w  = random() %2; // Write or read
    int e  = random() %3; // expression or for loop
    int dev= random() %2; // On device?
    //    int e=1;
    ComplexD zc = counter++;
    if ( w ) {
      B[v] = B[v] + zc;
      if ( e == 0 ) {
 	A[v] = A[v] + zc - A[v] + A[v];
      } else {
 	if ( dev ) { 
 	  autoView(A_v,A[v],AcceleratorWrite);
 	  accelerator_for(ss,FGrid->oSites(),1,{
 	    A_v[ss] = A_v[ss] + zc;
 	    });
 	} else {
 	  autoView(A_v,A[v],CpuWrite);
 	  thread_for(ss,FGrid->oSites(),{
 	      A_v[ss] = A_v[ss] + zc;
 	    });
 	}
      }
    } else {
      if ( e == 0 ) {
 	A[v] = A[v] + A[v] - A[v];
      } else { 
 	if ( dev ) { 
 	  autoView(A_v,A[v],AcceleratorRead);
 	  accelerator_for(ss,FGrid->oSites(),1,{
 	      assert(B[v]==A_v[ss]()()().getlane(0));
 	    });
 	  //	std::cout << "["<<v<<"] checked on GPU"<<B[v]<<std::endl;
 	} else {
 	  autoView(A_v,A[v],CpuRead);
 	  thread_for(ss,FGrid->oSites(),{
 	      assert(B[v]==A_v[ss]()()().getlane(0));
 	    });
 	  //	std::cout << "["<<v<<"] checked on CPU"<<B[v]<<std::endl;
 	}
      }    
    }
  }
 }
Author	SHA1	Message	Date
Peter Boyle	a00ae981e0	Fence propagation from SYCL	2023-03-29 15:00:40 -04:00
Peter Boyle	3f2fd49db4	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2023-03-27 17:29:54 -07:00
Peter Boyle	0efa107cb6	Script update	2023-03-27 17:29:43 -07:00
Peter Boyle	8feedb4f6f	Include files moved	2023-03-27 17:29:21 -07:00
Peter Boyle	05e562e3d7	Move the copy synch out to stencil and do one per call instead of one per packet	2023-03-27 17:28:38 -07:00
Peter Boyle	dd3bbb8fa2	MOve the synchronise out to the stencil so one call instead of one call per packet	2023-03-27 17:27:45 -07:00
Peter Boyle	2fbcf13c46	SYCL fix	2023-03-27 14:25:14 -07:00
Peter Boyle	4ea48ef0c4	Merge pull request #419 from lehner/feature/gpt Separate rankSum from sum	2023-03-24 15:42:16 -04:00
Peter Boyle	546be724e7	Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane Populate the Cshift_table in the GPU	2023-03-24 12:04:06 -04:00
Peter Boyle	481bbaf1fc	Interface to query memory use	2023-03-23 12:55:31 -04:00
Peter Boyle	281488611a	WriteDiscard on construct	2023-03-23 10:28:50 -04:00
Peter Boyle	bae0f8ea99	Merge pull request #425 from rrhodgson/feature/CacheLogging Huge Cache	2023-03-21 08:59:08 -04:00
Peter Boyle	bbbcd36ae5	Merge pull request #426 from rrhodgson/feature/LCDeflation Batched Local Coherence Tools	2023-03-21 08:58:40 -04:00
Peter Boyle	39c0815d9e	WriteDiscard	2023-03-21 08:57:29 -04:00
Raoul Hodgson	a3e935c902	Batched block project/promote size checks	2023-02-27 11:38:16 +00:00
Raoul Hodgson	7731c7db8e	Add huge cache type and allow Ncache==0	2023-02-26 14:15:28 +00:00
Raoul Hodgson	ff97340324	Expose cached bytes	2023-02-26 12:22:45 +00:00
Raoul Hodgson	920a51438d	Added batched Mixed precision CG	2023-02-14 17:04:13 +00:00
Raoul Hodgson	be528b6d27	Add batched block project/promote functions	2023-02-14 14:37:10 +00:00
Peter Boyle	796abfad80	Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT Disable diagnostic pragma warnings for CUDA 12+	2023-01-17 09:34:49 -05:00
Fabian Joswig	ad0270ac8c	fix: diagnostic pragma warnings fixed for CUDA 12+	2023-01-12 12:36:30 +00:00
Makis Kappas	7d62f1d6d2	Populate the Cshift_table in the GPU Cshift is allocated in Unified memory and used in the LambdaApply kernels but also populated from the host. This creates a lot of Unified HtoD and DtoH mem operations and has a negative effect in performance. With this commit we populate the Cshift table in the device with the populate_Cshift_table() kernel.	2023-01-11 21:26:25 +00:00
Christoph Lehner	458c943987	merged upstream	2022-12-31 11:16:21 +02:00
Christoph Lehner	88015b0858	Split sum in rankSum and GlobalSum	2022-12-26 10:01:32 +01:00
Peter Boyle	4ca1bf7cca	Added gauge invariance test	2022-12-21 07:23:16 -05:00
Peter Boyle	2ff868f7a5	CPU open doesn't need to free space	2022-12-20 05:10:23 -05:00
Peter Boyle	ede02b6883	Memory manager debug Felix case	2022-12-20 05:10:23 -05:00
Peter Boyle	1822ced302	Bug fix	2022-12-20 05:10:23 -05:00
Peter Boyle	37ba32776f	More logging	2022-12-20 05:10:23 -05:00
Peter Boyle	99b3697b03	More loggin	2022-12-20 05:10:23 -05:00
Peter Boyle	43a45ec97b	SSC_START	2022-12-20 05:10:23 -05:00
Peter Boyle	b00a4142e5	A=A fix	2022-12-20 05:10:23 -05:00
Peter Boyle	3791bc527b	Logging pulled in from dirichlet branch	2022-12-20 05:10:23 -05:00
Peter Boyle	d8c29f5fcf	Updated FFT test for PETSc	2022-12-18 12:05:00 -05:00
Peter Boyle	281f8101fe	Matt FFT test	2022-12-17 20:35:33 -05:00
Peter Boyle	07acfe89f2	Merge pull request #417 from rrhodgson/feature/fermtoprop Feature/fermtoprop	2022-12-06 12:45:03 -05:00
Raoul Hodgson	40234f531f	FermToProp accelerator_for -> thread_for	2022-12-06 17:34:51 +00:00
Raoul Hodgson	d49694f38f	PropToFerm fix	2022-12-06 15:48:54 +00:00
`@ -1 +1 @@`
	`CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi`	`CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=no`