mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 05:07:05 +01:00

Compare commits


1 Commit

Author SHA1 Message Date
8b91b61b61 First cut at faster GPU slice sum via atomics 2022-12-22 15:13:45 -05:00
21 changed files with 295 additions and 443 deletions
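
The headline change is the atomics-based GPU slice sum: a new header (Lattice_slice_gpu.h, shown in full below) plus a test at the end of this diff that checks it against the existing sliceSum; the remaining hunks cover the other files touched in this comparison. As a minimal host-side sketch of the idea (toy extents, plain doubles, not Grid code): every site adds its value into the bucket belonging to its coordinate along the orthogonal direction, and on the device that += becomes an atomicAdd.

// Minimal standalone sketch of "slice sum via atomics"; illustrative only.
#include <cstdio>
#include <vector>

int main() {
  const int L = 4;                              // toy 4^3 lattice, all ones
  std::vector<double> field(L * L * L, 1.0);
  const int orthogdim = 2;                      // slice along the z direction
  std::vector<double> slice(L, 0.0);            // one bucket per z value

  for (int z = 0; z < L; z++)
    for (int y = 0; y < L; y++)
      for (int x = 0; x < L; x++) {
        int site = x + L * (y + L * z);
        int coord[3] = {x, y, z};
        slice[coord[orthogdim]] += field[site]; // on a GPU: atomicAdd(&slice[...], field[site])
      }

  for (int t = 0; t < L; t++)
    printf("slice[%d] = %g\n", t, slice[t]);    // each bucket sums L*L = 16 sites
  return 0;
}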

View File

@ -45,7 +45,7 @@ directory
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type

View File

@ -14,7 +14,7 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable

View File

@ -54,7 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@ -1,213 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
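
For reference, a hypothetical call sequence for the batched solver defined in this file. The operators, grids and tolerances below are placeholders supplied by the caller, not values taken from this diff; FieldD and FieldF must be double- and single-precision field types respectively.

// Sketch of driving MixedPrecisionConjugateGradientBatched; all names illustrative.
#include <Grid/Grid.h>
using namespace Grid;

template<class FieldD, class FieldF>
void SolveBatch(LinearOperatorBase<FieldD> &Linop_d,   // double-precision operator
                LinearOperatorBase<FieldF> &Linop_f,   // single-precision operator
                GridBase *SinglePrecGrid,
                const std::vector<FieldD> &sources,
                std::vector<FieldD> &solutions)        // preallocated, holds initial guesses
{
  MixedPrecisionConjugateGradientBatched<FieldD,FieldF>
    mpcg(1.0e-8,          // outer tolerance
         10000,           // max inner (single-precision) iterations per restart
         50,              // max outer restarts
         1000,            // max iterations of the final double-precision patch-up
         SinglePrecGrid,
         Linop_f, Linop_d);
  // mpcg.useGuesser(myGuesser);   // optional: supply a guesser for the inner solves
  mpcg(sources, solutions);        // one restarted solve shared across the batch
}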

View File

@ -4,14 +4,11 @@ NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
@ -38,15 +35,12 @@ void MemoryManager::PrintBytes(void)
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
@ -176,16 +170,6 @@ void MemoryManager::Init(void)
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
@ -206,9 +190,7 @@ void MemoryManager::InitMessage(void) {
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
@ -240,11 +222,8 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
@ -253,12 +232,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
@ -293,11 +271,8 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
@ -306,6 +281,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
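
The two bucket-index computations visible in these hunks map an allocation type (Cpu/Acc/Shared) and its size class onto a slot in Entries/Ncache/CacheBytes: one side uses two size classes per type (LARGE at the base index, SMALL at base+1), the other three (LARGE, HUGE at base+1, SMALL at base+2). A small host-side sketch contrasting them, with byte limits matching MemoryManager.h and helper names that are illustrative only:

// Illustration of the two cache-bucket schemes; not Grid code.
#include <cstdint>
#include <cstdio>

static const uint64_t SMALL_LIMIT = 4096;          // GRID_ALLOC_SMALL_LIMIT
static const uint64_t HUGE_LIMIT  = 2147483648ULL; // GRID_ALLOC_HUGE_LIMIT

// Two size classes: Cpu=0, CpuSmall=1, Acc=2, AccSmall=3, ...
int cache_two_tier(int type, uint64_t bytes) {
  bool small = (bytes < SMALL_LIMIT);
  return type + small;
}

// Three size classes: Cpu=0, CpuHuge=1, CpuSmall=2, Acc=3, AccHuge=4, ...
int cache_three_tier(int type, uint64_t bytes) {
  if (bytes < SMALL_LIMIT)      return type + 2;
  else if (bytes >= HUGE_LIMIT) return type + 1;
  else                          return type;
}

int main() {
  printf("2-tier: 1 KiB host -> %d, 1 MiB device -> %d\n",
         cache_two_tier(0, 1024), cache_two_tier(2, 1 << 20));        // 1 (CpuSmall), 2 (Acc)
  printf("3-tier: 1 KiB host -> %d, 4 GiB device -> %d\n",
         cache_three_tier(0, 1024), cache_three_tier(3, 4ULL << 30)); // 2 (CpuSmall), 4 (AccHuge)
  return 0;
}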

View File

@ -35,7 +35,6 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
@ -71,21 +70,6 @@ enum ViewMode {
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
@ -99,7 +83,7 @@ private:
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=9;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
@ -137,26 +121,7 @@ private:
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////

View File

@ -401,6 +401,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
int nreq=list.size();
if (nreq==0) return;

View File

@ -36,10 +36,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#define GRID_SYCL_LEVEL_ZERO_IPC
#endif
#ifdef GRID_SYCl
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "

View File

@ -297,30 +297,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
}
}
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@ -345,20 +321,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int ent=0;
if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@ -409,19 +377,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int ent=0;
if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){

View File

@ -129,7 +129,7 @@ public:
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
@ -152,7 +152,7 @@ public:
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
@ -174,7 +174,7 @@ public:
this->checkerboard=cb;
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
@ -245,7 +245,7 @@ public:
///////////////////////////////////////////
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
@ -288,8 +288,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View(AcceleratorWrite);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@ -303,8 +303,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View(AcceleratorWrite);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});

View File

@ -156,44 +156,33 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
}
template<class vobj>
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
typename vobj::scalar_object ssum;
autoView( arg_v, arg, AcceleratorRead);
return sum_gpu(&arg_v[0],osites);
ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
return sum_cpu(&arg_v[0],osites);
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto ssum = rankSum(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
return sum_gpu_large(&arg_v[0],osites);
auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
return sum_cpu(&arg_v[0],osites);
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}

View File

@ -0,0 +1,126 @@
NAMESPACE_BEGIN(Grid);
// If NOT CUDA or HIP -- we should provide
// -- atomicAdd(float *,float)
// -- atomicAdd(double *,double)
//
// Augment CUDA with complex atomics
#if !defined(GRID_HIP) || !defined(GRID_CUDA)
inline void atomicAdd(float *acc,float elem)
{
*acc += elem;
}
inline void atomicAdd(double *acc,double elem)
{
*acc += elem;
}
#endif
inline void atomicAdd(ComplexD *accum,ComplexD & elem)
{
double *a_p = (double *)accum;
double *e_p = (double *)&elem;
for(int w=0;w<2;w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
inline void atomicAdd(ComplexF *accum,ComplexF & elem)
{
float *a_p = (float *)accum;
float *e_p = (float *)&elem;
for(int w=0;w<2;w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
// Augment CUDA with vobj atomics
template<class vobj> accelerator_inline void atomicAdd(vobj *accum, vobj & elem)
{
typedef typename vobj::scalar_type scalar_type;
scalar_type *a_p= (scalar_type *)accum;
scalar_type *e_p= (scalar_type *)& elem;
for(int w=0;w<vobj::Nsimd();w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
// Atomics based slice sum
template<class vobj> inline void sliceSumGpu(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// Move to device memory and copy in / out
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, AcceleratorRead);
auto lvSum_p=&lvSum[0];
int ostride = grid->_ostride[orthogdim];
accelerator_for( ree,rd*e1*e2,1, {
int b = ree%e2;
int re= ree/e2;
int n=re%e1;
int r=re/e1;
int so=r*ostride;
int ss=so+n*stride+b;
atomicAdd(&lvSum_p[r],Data_v[ss]);
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
result[t]=lsSum[lt];
} else {
result[t]=Zero();
}
}
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
}
NAMESPACE_END(Grid);
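
The accelerator_for above flattens the loop over (plane r, block n, element b) into a single index ree and relies on the complex/vobj atomicAdd overloads defined at the top of the file to accumulate Data_v[ss] into lvSum[r]. A host-side sketch of just the index decoding, with toy extents that are illustrative only, showing that every plane receives exactly e1*e2 contributions:

// Host-side check of the ree -> (r, n, b) decoding; toy values, not Grid code.
#include <cassert>
#include <vector>

int main() {
  const int rd = 4, e1 = 3, e2 = 5;      // reduced extent, slice blocks, block size
  const int stride = 7, ostride = 100;   // toy strides

  std::vector<int> hits(rd, 0);
  for (int ree = 0; ree < rd * e1 * e2; ree++) {
    int b  = ree % e2;                   // fastest: element within a block
    int re = ree / e2;
    int n  = re % e1;                    // block within the slice
    int r  = re / e1;                    // coordinate along the orthogonal dimension
    int so = r * ostride;
    int ss = so + n * stride + b;        // site whose value would be atomically
    (void)ss;                            // added into lvSum[r] on the device
    hits[r]++;
  }
  for (int r = 0; r < rd; r++)
    assert(hits[r] == e1 * e2);          // each plane summed once per site
  return 0;
}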

View File

@ -288,36 +288,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
}
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -619,26 +590,6 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
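
A hypothetical round trip through the batched project/promote helpers shown in these hunks; the grids, basis and field names are placeholders, not taken from the diff.

// Sketch: project a batch of fine fields onto a coarse basis and promote back.
#include <Grid/Grid.h>
using namespace Grid;

template<class vobj, class CComplex, int nbasis, class VLattice>
void BatchRoundTrip(const std::vector<Lattice<vobj>> &fineFields,
                    const VLattice &Basis,            // nbasis fine-grid basis vectors
                    GridBase *CoarseGrid)
{
  typedef Lattice<iVector<CComplex, nbasis>> CoarseField;
  int NBatch = fineFields.size();

  std::vector<CoarseField> coarse(NBatch, CoarseField(CoarseGrid));
  batchBlockProject(coarse, fineFields, Basis);       // coarse[k](v) = <Basis[v]|fine[k]>

  std::vector<Lattice<vobj>> fineOut(NBatch, Lattice<vobj>(fineFields[0].Grid()));
  batchBlockPromote(coarse, fineOut, Basis);          // fineOut[k] = sum_v coarse[k](v) * Basis[v]
}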

View File

@ -16,7 +16,7 @@
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"

View File

@ -459,7 +459,11 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
@ -470,7 +474,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
#ifndef GRID_CUDA
@ -495,9 +498,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
acceleratorFenceComputeStream();
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif

View File

@ -398,8 +398,6 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// Buffers are gathered AND synchronised
// Copies are MPI ISend OR asynch copy on copy stream
reqs.resize(Packets.size());
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
@ -412,18 +410,14 @@ public:
comms_bytes+=bytes;
shm_bytes +=2*Packets[i].bytes-bytes;
}
_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// complete intranode
acceleratorCopySynchronise();
// complete MPI
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromComplete(reqs[i],i);
}
// Everyone agrees we are all done
_grid->StencilBarrier();
commtime+=usecond();
}
////////////////////////////////////////////////////////////////////////
@ -431,9 +425,33 @@ public:
////////////////////////////////////////////////////////////////////////
void Communicate(void)
{
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
if ( 0 ){
thread_region {
// must be called in parallel region
int mythread = thread_num();
int maxthreads= thread_max();
int nthreads = CartesianCommunicator::nCommThreads;
assert(nthreads <= maxthreads);
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_bytes_thr[mythread] += bytes;
shm_bytes_thr[mythread] += Packets[i].bytes - bytes;
comm_time_thr[mythread] += usecond() - start;
}
}
}
} else { // Concurrent and non-threaded asynch calls to MPI
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
}
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@ -509,6 +527,7 @@ public:
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime_g+=usecond();
// conformable(source.Grid(),_grid);
assert(source.Grid()==_grid);
halogtime-=usecond();
@ -567,8 +586,13 @@ public:
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
assert(MergersSHM.size()==0);
assert(DecompressionsSHM.size()==0);
mpi3synctime-=usecond();
accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime+=usecond();
shmmergetime-=usecond();
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
shmmergetime+=usecond();
}
template<class decompressor>
@ -585,7 +609,6 @@ public:
decompress.Exchange(mp,vp0,vp1,type,o);
});
}
if ( mm.size() ) acceleratorFenceComputeStream();
mergetime+=usecond();
decompresstime-=usecond();
@ -596,9 +619,7 @@ public:
decompress.Decompress(kp,mp,o);
});
}
if ( dd.size() ) acceleratorFenceComputeStream();
decompresstime+=usecond();
}
////////////////////////////////////////
// Set up routines

View File

@ -249,16 +249,14 @@ inline int acceleratorIsCommunicable(void *ptr)
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#if 0
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#define GRID_SYCL_LEVEL_ZERO_IPC
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
#include <level_zero/ze_api.h>
#include <CL/sycl/backend/level_zero.hpp>
#else
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#endif
NAMESPACE_BEGIN(Grid);

View File

@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=8

View File

@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
@ -19,14 +19,16 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
for i in 0
do
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
done
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log

View File

@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
#if [ $MPI_LOCALRANKID = "0" ]
#then
if [ $MPI_LOCALRANKID = "0" ]
then
# ~psteinbr/build_pti/ze_tracer -h $@
# onetrace --chrome-device-timeline $@
#else
onetrace --chrome-device-timeline $@
else
$@
#fi
fi

View File

@ -0,0 +1,73 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_poisson_fft.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/Lattice_slice_gpu.h>
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int N=16;
std::vector<int> latt_size ({N,N,N,N});
std::vector<int> simd_layout({vComplexD::Nsimd(),1,1,1});
std::vector<int> mpi_layout ({1,1,1,1});
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
LatticeComplexD rn(&GRID);
GridParallelRNG RNG(&GRID);
RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
gaussian(RNG,rn);
std::vector<TComplex> reduced_ref;
std::vector<TComplex> reduced_gpu;
for(int d=0;d<4;d++){
{
RealD t=-usecond();
sliceSum(rn,reduced_ref,d);
t+=usecond();
std::cout << " sliceSum took "<<t<<" usecs"<<std::endl;
}
{
RealD t=-usecond();
sliceSumGpu(rn,reduced_gpu,d);
t+=usecond();
std::cout << " sliceSumGpu took "<<t<<" usecs"<<std::endl;
}
for(int t=0;t<reduced_ref.size();t++){
std::cout << t<<" ref "<< reduced_ref[t] <<" opt " << reduced_gpu[t] << " diff "<<reduced_ref[t]-reduced_gpu[t]<<std::endl;
TComplex diff = reduced_ref[t]-reduced_gpu[t];
assert(abs(TensorRemove(diff)) < 1e-8 );
}
}
Grid_finalize();
}