diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..0ddac437 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
   return sumD_cpu(arg,osites);
 #endif  
 }
+template<class vobj>
+inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
+{
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
+  return sumD_cpu(arg,osites);       // no CUDA/HIP: the same CPU reduction sumD() uses
+#else
+  return sumD_gpu_large(arg,osites); // CUDA/HIP: always take the word-by-word reduction
+#endif
+}
 
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@@ -159,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   return ssum;
 }
 
+// Lattice sum via sum_gpu_large() on CUDA/HIP builds (sum_cpu() otherwise), followed by GlobalSum() on the grid.
+template<class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+  Integer osites = arg.Grid()->oSites();
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  autoView( lat_v, arg, AcceleratorRead);
+  auto total = sum_gpu_large(&lat_v[0],osites);
+#else
+  autoView( lat_v, arg, CpuRead);
+  auto total = sum_cpu(&lat_v[0],osites);
+#endif
+  arg.Grid()->GlobalSum(total);
+  return total;
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c2875052..bad86d2a 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
   
   int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*  
   std::cout << GridLogDebug << "GPU has:" << std::endl;
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */  
   if (warpSize != WARP_SIZE) {
     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
     exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    exit(EXIT_FAILURE);
+    return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 
 template <class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -207,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   Integer size = osites*nsimd;
 
   Integer numThreads, numBlocks;
-  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  assert(ok);
+
   Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
@@ -218,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   auto result = buffer_v[0];
   return result;
 }
+
+// Sum an object one vector_type word at a time (sumD_gpu() uses this when its size probe fails):
+// each word is gathered from all osites sites into a scratch buffer and reduced by sumD_gpu_small().
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+
+  sobj total;
+  scalarD *total_p = (scalarD *)&total;      // result addressed as nwords scalarD entries
+  const int nwords = sizeof(vobj)/sizeof(vector);
+
+  Vector<vector> scratch(osites);            // holds one word per site
+  vector *src       = (vector *)lat;         // input addressed as a flat array of words
+  vector *scratch_p = &scratch[0];
+  iScalar<vector> *scratch_f = (iScalar<vector> *)&scratch[0]; // same storage, typed for the reduction
+
+  for(int word=0;word<nwords;word++) {
+    // gather word number `word` of every site, then reduce the gathered buffer
+    accelerator_for(ss,osites,1,{ scratch_p[ss] = src[ss*nwords+word]; });
+    total_p[word] = sumD_gpu_small(scratch_f,osites);
+  }
+  return total;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Sum lat[0..osites) to a scalar_objectD. getNumBlocksAndThreads() is called only as a probe: nonzero
+// means its shared memory size check passed, so sumD_gpu_small() is used; zero selects
+// sumD_gpu_large(). NOTE(review): the probe logs via GridLogError when it returns zero.
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_objectD sobj;
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks; // filled in by the probe; not otherwise used here
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+
+  if ( ok ) {
+    return sumD_gpu_small(lat,osites);
+  }
+  return sumD_gpu_large(lat,osites);
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -230,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
   return result;
 }
 
-
+// As sumD_gpu_large(), but handed back as vobj::scalar_object (the input precision object type).
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  typename vobj::scalar_object reduced;
+  reduced = sumD_gpu_large(lat,osites); // the assignment converts scalar_objectD to scalar_object
+  return reduced;
+}
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
index 588525cc..60f19317 100644
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -726,8 +726,8 @@ public:
   static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
     conformable(f, m);
     auto grid  = f.Grid();
-    const int Nsite = grid->oSites();
-    const int Nsimd = grid->Nsimd();
+    const uint32_t Nsite = grid->oSites();
+    const uint32_t Nsimd = grid->Nsimd();
     autoView(f_v, f, AcceleratorWrite);
     autoView(m_v, m, AcceleratorRead);
     // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 517d3d3d..a5fb7aa8 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -483,9 +483,10 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+// These copies forward to thread_bcopy(); note it takes (from,to,bytes), i.e. the source pointer first.
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index a9fa13ea..6887134d 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -72,3 +72,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_region                                       DO_PRAGMA(omp parallel)
 #define thread_critical                                     DO_PRAGMA(omp critical)
 
+#ifdef GRID_OMP
+// Copy 'bytes' bytes from 'from' to 'to': whole 64-bit words through thread_for, then the
+// remaining (bytes%8) tail bytes serially, so the size need not be a multiple of 8.
+// NOTE(review): the word copy assumes both pointers are 8-byte aligned -- confirm for all callers.
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  uint64_t *ufrom = (uint64_t *)from;
+  uint64_t *uto   = (uint64_t *)to;
+  uint64_t words=bytes/8;
+  thread_for(w,words,{
+      uto[w] = ufrom[w];
+  });
+  for(size_t b=8*words;b<bytes;b++) ((char *)to)[b] = ((char *)from)[b];
+}
+#else
+inline void thread_bcopy(void *from, void *to,size_t bytes) { bcopy(from,to,bytes); } // GRID_OMP not defined: single bcopy call
+#endif
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 6992129e..36854d9c 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -534,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
+  MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi
new file mode 100644
index 00000000..d1e75c39
--- /dev/null
+++ b/systems/mac-arm/config-command-mpi
@@ -0,0 +1 @@
+CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi