Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet

2025-09-18 09:11:04 +01:00 · 2023-03-28 11:33:05 -04:00
parent 5c85774ee3 900e01f49b
commit ddf6d5c9e3
14 changed files with 800 additions and 42 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
@@ -0,0 +1,213 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+
+    Copyright (C) 2015
+
+    Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
+
+NAMESPACE_BEGIN(Grid);
+
+//Mixed precision restarted defect correction CG
+template<class FieldD,class FieldF, 
+  typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+  typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
+public:
+  using LinearFunction<FieldD>::operator();
+  RealD   Tolerance;
+  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+  Integer MaxInnerIterations;
+  Integer MaxOuterIterations;
+  Integer MaxPatchupIterations;
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+  LinearOperatorBase<FieldF> &Linop_f;
+  LinearOperatorBase<FieldD> &Linop_d;
+
+  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+  LinearFunction<FieldF> *guesser;
+  bool updateResidual;
+  
+  MixedPrecisionConjugateGradientBatched(RealD tol, 
+          Integer maxinnerit, 
+          Integer maxouterit, 
+          Integer maxpatchit,
+          GridBase* _sp_grid, 
+          LinearOperatorBase<FieldF> &_Linop_f, 
+          LinearOperatorBase<FieldD> &_Linop_d,
+          bool _updateResidual=true) :
+    Linop_f(_Linop_f), Linop_d(_Linop_d),
+    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
+    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
+
+  void useGuesser(LinearFunction<FieldF> &g){
+    guesser = &g;
+  }
+  
+  void operator() (const FieldD &src_d_in, FieldD &sol_d){
+    std::vector<FieldD> srcs_d_in{src_d_in};
+    std::vector<FieldD> sols_d{sol_d};
+
+    (*this)(srcs_d_in,sols_d);
+
+    sol_d = sols_d[0];
+  }
+
+  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
+    assert(src_d_in.size() == sol_d.size());
+    int NBatch = src_d_in.size();
+
+    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
+
+    Integer TotalOuterIterations = 0; //Number of restarts
+    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
+    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
+  
+    GridStopWatch TotalTimer;
+    TotalTimer.Start();
+
+    GridStopWatch InnerCGtimer;
+    GridStopWatch PrecChangeTimer;
+    
+    int cb = src_d_in[0].Checkerboard();
+    
+    std::vector<RealD> src_norm;
+    std::vector<RealD> norm;
+    std::vector<RealD> stop;
+    
+    GridBase* DoublePrecGrid = src_d_in[0].Grid();
+    FieldD tmp_d(DoublePrecGrid);
+    tmp_d.Checkerboard() = cb;
+    
+    FieldD tmp2_d(DoublePrecGrid);
+    tmp2_d.Checkerboard() = cb;
+
+    std::vector<FieldD> src_d;
+    std::vector<FieldF> src_f;
+    std::vector<FieldF> sol_f;
+
+    for (int i=0; i<NBatch; i++) {
+      sol_d[i].Checkerboard() = cb;
+
+      src_norm.push_back(norm2(src_d_in[i]));
+      norm.push_back(0.);
+      stop.push_back(src_norm[i] * Tolerance*Tolerance);
+
+      src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
+
+      src_f.push_back(SinglePrecGrid);
+      src_f[i].Checkerboard() = cb;
+
+      sol_f.push_back(SinglePrecGrid);
+      sol_f[i].Checkerboard() = cb;
+    }
+    
+    RealD inner_tol = InnerTolerance;
+    
+    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+    CG_f.ErrorOnNoConverge = false;
+    
+    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+      
+    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+      std::cout << GridLogMessage << std::endl;
+      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
+      
+      bool allConverged = true;
+      
+      for (int i=0; i<NBatch; i++) {
+        //Compute double precision rsd and also new RHS vector.
+        Linop_d.HermOp(sol_d[i], tmp_d);
+        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
+        
+        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f[i], src_d[i]);
+        PrecChangeTimer.Stop();
+        
+        sol_f[i] = Zero();
+      
+        if(norm[i] > OuterLoopNormMult * stop[i]) {
+          allConverged = false;
+        }
+      }
+      if (allConverged) break;
+
+      if (updateResidual) {
+        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
+        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
+        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+        CG_f.Tolerance = inner_tol;
+      }
+
+      //Optionally improve inner solver guess (eg using known eigenvectors)
+      if(guesser != NULL) {
+        (*guesser)(src_f, sol_f);
+      }
+
+      for (int i=0; i<NBatch; i++) {
+        //Inner CG
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f[i], sol_f[i]);
+        InnerCGtimer.Stop();
+        TotalInnerIterations[i] += CG_f.IterationsToComplete;
+        
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f[i]);
+        PrecChangeTimer.Stop();
+        
+        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
+      }
+
+    }
+    
+    //Final trial CG
+    std::cout << GridLogMessage << std::endl;
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
+    
+    for (int i=0; i<NBatch; i++) {
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
+      CG_d(Linop_d, src_d_in[i], sol_d[i]);
+      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
+    }
+
+    TotalTimer.Stop();
+
+    std::cout << GridLogMessage << std::endl;
+    for (int i=0; i<NBatch; i++) {
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
+    }
+    std::cout << GridLogMessage << std::endl;
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+    
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);

 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuSmall (1)
-#define Acc      (2)
-#define AccSmall (3)
-#define Shared   (4)
-#define SharedSmall (5)
+#define CpuHuge  (1)
+#define CpuSmall (2)
+#define Acc      (3)
+#define AccHuge  (4)
+#define AccSmall (5)
+#define Shared   (6)
+#define SharedHuge  (7)
+#define SharedSmall (8)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
  
 }

+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
+
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -170,6 +176,16 @@ void MemoryManager::Init(void)
    }
  }

+  str= getenv("GRID_ALLOC_NCACHE_HUGE");
+  if ( str ) {
+    Nc = atoi(str);
+    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+      Ncache[CpuHuge]=Nc;
+      Ncache[AccHuge]=Nc;
+      Ncache[SharedHuge]=Nc;
+    }
+  }
+
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
@@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
  
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
  
 #ifdef GRID_UVM
@@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-  int cache = type + small;
+  int cache;
+  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+  else                                     cache = type;
+
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
@@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)

 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
-  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 

+  if (ncache == 0) return ptr;
+
  void * ret = NULL;
  int v = -1;

@@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-  int cache = type+small;
+  int cache;
+  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+  else                                     cache = type;
+
  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
@@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)

 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
-  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?

 #define GRID_ALLOC_SMALL_LIMIT (4096)
+#define GRID_ALLOC_HUGE_LIMIT  (2147483648)

 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -70,6 +71,21 @@ enum ViewMode {
  CpuWriteDiscard = 0x10 // same for now
 };

+struct MemoryStatus {
+  uint64_t     DeviceBytes;
+  uint64_t     DeviceLRUBytes;
+  uint64_t     DeviceMaxBytes;
+  uint64_t     HostToDeviceBytes;
+  uint64_t     DeviceToHostBytes;
+  uint64_t     HostToDeviceXfer;
+  uint64_t     DeviceToHostXfer;
+  uint64_t     DeviceEvictions;
+  uint64_t     DeviceDestroy;
+  uint64_t     DeviceAllocCacheBytes;
+  uint64_t     HostAllocCacheBytes;
+};
+
+
 class MemoryManager {
 private:

@@ -83,7 +99,7 @@ private:
  } AllocationCacheEntry;

  static const int NallocCacheMax=128; 
-  static const int NallocType=6;
+  static const int NallocType=9;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
@@ -122,6 +138,25 @@ private:
  static uint64_t     DeviceEvictions;
  static uint64_t     DeviceDestroy;
  
+  static uint64_t     DeviceCacheBytes();
+  static uint64_t     HostCacheBytes();
+
+  static MemoryStatus GetFootprint(void) {
+    MemoryStatus stat;
+    stat.DeviceBytes       = DeviceBytes;
+    stat.DeviceLRUBytes    = DeviceLRUBytes;
+    stat.DeviceMaxBytes    = DeviceMaxBytes;
+    stat.HostToDeviceBytes = HostToDeviceBytes;
+    stat.DeviceToHostBytes = DeviceToHostBytes;
+    stat.HostToDeviceXfer  = HostToDeviceXfer;
+    stat.DeviceToHostXfer  = DeviceToHostXfer;
+    stat.DeviceEvictions   = DeviceEvictions;
+    stat.DeviceDestroy     = DeviceDestroy;
+    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
+    stat.HostAllocCacheBytes   = HostCacheBytes();
+    return stat;
+  };
+  
 private:
 #ifndef GRID_UVM
  //////////////////////////////////////////////////////////////////////
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  acceleratorCopySynchronise();
-  StencilBarrier();// Synch shared memory on a single nodes
-
  int nreq=list.size();

  if (nreq==0) return;
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -37,9 +37,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL

 #endif
+#define GRID_SYCL_LEVEL_ZERO_IPC
+

 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryMpi: "
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }

+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+    int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    if (idx >= e1*e2) return;
+
+    int n, b, o;
+
+    n = idx / e2;
+    b = idx % e2;
+    o = n*stride + b;
+
+    vector[2*idx + 0] = lo + o;
+    vector[2*idx + 1] = ro + o;
+}
+
+#endif
+
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;

  if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
+#endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;

  if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
+#endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 }

 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
  Integer osites = arg.Grid()->oSites();
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
-  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
-  ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif  
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSum(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }

 template<class vobj>
-inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu_large(&arg_v[0],osites);
+  return sum_gpu_large(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+  auto ssum = rankSumLarge(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
  }
 }
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                               const std::vector<Lattice<vobj>> &fineData,
+                               const VLattice &Basis)
+{
+  int NBatch = fineData.size();
+  assert(coarseData.size() == NBatch);

+  GridBase * fine  = fineData[0].Grid();
+  GridBase * coarse= coarseData[0].Grid();
+
+  Lattice<iScalar<CComplex>> ip(coarse);
+  std::vector<Lattice<vobj>> fineDataCopy = fineData;
+
+  autoView(ip_, ip, AcceleratorWrite);
+  for(int v=0;v<nbasis;v++) {
+    for (int k=0; k<NBatch; k++) {
+      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
+      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
+      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+        convertType(coarseData_[sc](v),ip_[sc]);
+      });
+
+      // improve numerical stability of projection
+      // |fine> = |fine> - <basis|fine> |basis>
+      ip=-ip;
+      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); 
+    }
+  }
+}

 template<class vobj,class vobj2,class CComplex>
  inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif

+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+                               std::vector<Lattice<vobj>> &fineData,
+                               const VLattice &Basis)
+{
+  int NBatch = coarseData.size();
+  assert(fineData.size() == NBatch);
+
+  GridBase * fine   = fineData[0].Grid();
+  GridBase * coarse = coarseData[0].Grid();
+  for (int k=0; k<NBatch; k++)
+    fineData[k]=Zero();
+  for (int i=0;i<nbasis;i++) {
+    for (int k=0; k<NBatch; k++) {
+      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
+      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
+    }
+  }
+}
+
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -434,7 +434,6 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -443,7 +442,6 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
  }

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@@ -452,6 +450,9 @@ public:
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
+    acceleratorCopySynchronise();
+    // Everyone agrees we are all done
+    _grid->StencilBarrier(); 
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@@ -529,7 +530,6 @@ public:
  {
    _grid->StencilBarrier();// Synch shared memory on a single nodes

-    // conformable(source.Grid(),_grid);
    assert(source.Grid()==_grid);

    u_comm_offset=0;
@@ -655,8 +655,8 @@ public:
    CommsMerge(decompress,Mergers,Decompressions);
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
-    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    assert(MergersSHM.size()==0);
+    assert(DecompressionsSHM.size()==0);
  }

  template<class decompressor>
--- a/benchmarks/Benchmark_dwf_fp32_paranoid.cc
+++ b/benchmarks/Benchmark_dwf_fp32_paranoid.cc
@@ -0,0 +1,387 @@
+ /*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf.cc
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  int threads = GridThread::GetThreads();
+
+  Coordinate latt4 = GridDefaultLatt();
+  int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionF src   (FGrid); random(RNG5,src);
+  LatticeFermionF src1   (FGrid); random(RNG5,src1);
+#if 0
+  src = Zero();
+  {
+    Coordinate origin({0,0,0,latt4[2]-1,0});
+    SpinColourVectorF tmp;
+    tmp=Zero();
+    tmp()(0)(0)=Complex(-2.0,0.0);
+    std::cout << " source site 0 " << tmp<<std::endl;
+    pokeSite(tmp,src,origin);
+  }
+#else
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+#endif
+
+
+  LatticeFermionF result(FGrid); result=Zero();
+  LatticeFermionF    ref(FGrid);    ref=Zero();
+  LatticeFermionF    tmp(FGrid);
+  LatticeFermionF    err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+#if 0
+  Umu=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    LatticeColourMatrixF ttmp(UGrid);
+    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
+    //    if (mu !=2 ) ttmp = 0;
+    //    ttmp = ttmp* pow(10.0,mu);
+    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
+  }
+  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
+#endif
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  // replicate across fifth dimension
+  //  LatticeGaugeFieldF Umu5d(FGrid);
+  std::vector<LatticeColourMatrixF> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  }
+  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
+
+  if (1)
+  {
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
+	  }
+	}
+      }
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
+  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  int ncall =100;
+
+  if (1) {
+    FGrid->Barrier();
+    Dw.Dhop(src,result,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src1,result,0);
+      Dw.Dhop(src,result,0);
+      err = ref-result;
+      std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+      assert (norm2(err)< 1.0e-4 );
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=single_site_flops*volume*ncall;
+
+    auto nsimd = vComplex::Nsimd();
+    auto simdwidth = sizeof(vComplex);
+
+    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
+    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
+    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl;
+    std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl;
+    err = ref-result;
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    //exit(0);
+
+    if(( norm2(err)>1.0e-4) ) {
+
+      /*
+      std::cout << "RESULT\n " << result<<std::endl;
+      std::cout << "REF   \n " << ref   <<std::endl;
+      std::cout << "ERR   \n " << err   <<std::endl;
+      */
+      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert (norm2(err)< 1.0e-4 );
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    int i=s+Ls*ss;
+	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
+	  }
+	}
+      }
+      
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      //      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+    }
+    ref = -0.5*ref;
+  }
+  //  dump=1;
+  Dw.Dhop(src,result,1);
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
+  err = ref-result;
+  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
+  if((norm2(err)>1.0e-4)){
+/*
+	std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
+	std::cout<< "DAG sRESULT\n " <<result  << std::endl;
+	std::cout<< "DAG ERR   \n "  << err    <<std::endl;
+*/
+  }
+  LatticeFermionF src_e (FrbGrid);
+  LatticeFermionF src_o (FrbGrid);
+  LatticeFermionF r_e   (FrbGrid);
+  LatticeFermionF r_o   (FrbGrid);
+  LatticeFermionF r_eo  (FGrid);
+
+  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+
+  // S-direction is INNERMOST and takes no part in the parity.
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    FGrid->Barrier();
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+#ifdef CUDA_PROFILE
+      if(i==10) cudaProfilerStart();
+#endif
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+#ifdef CUDA_PROFILE
+      if(i==20) cudaProfilerStop();
+#endif
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(single_site_flops*volume*ncall)/2.0;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
+  }
+  Dw.DhopEO(src_o,r_e,DaggerNo);
+  Dw.DhopOE(src_e,r_o,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
+
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  err = r_eo-result;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  if((norm2(err)>1.0e-4)){
+    /*
+	std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
+	std::cout<< "Deo REF\n " <<result  << std::endl;
+	std::cout<< "Deo ERR   \n " << err <<std::endl;
+    */
+  }
+
+  pickCheckerboard(Even,src_e,err);
+  pickCheckerboard(Odd,src_o,err);
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
+  Grid_finalize();
+  exit(0);
+}
--- a/systems/PVC/benchmarks/run-1tile.sh
+++ b/systems/PVC/benchmarks/run-1tile.sh
@@ -4,7 +4,7 @@
 #SBATCH -p QZ1J-ICX-PVC
 ##SBATCH -p QZ1J-SPR-PVC-2C

-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

 export NT=8

--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -4,7 +4,7 @@

 #SBATCH -p QZ1J-ICX-PVC

-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

 export NT=16

@@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
-export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

-#mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
+for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+do
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 1.1.1.2.log$i
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 2.1.1.1.log$i 
+done

 mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

--- a/systems/PVC/benchmarks/wrap.sh
+++ b/systems/PVC/benchmarks/wrap.sh
@@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
 echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK


-#if [ $MPI_LOCALRANKID = "0" ] 
-#then
-#  ~psteinbr/build_pti/ze_tracer -c $@
-#  onetrace --chrome-kernel-timeline $@
-#else
  $@
-#fi
+