From 0b905a72ddfafcf02bcc4b6738ff31c74be79ed5 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 29 Oct 2021 02:22:22 +0100
Subject: [PATCH 01/10] Better reduction for GPUs

---
 Grid/lattice/Lattice_reduction_gpu.h | 48 ++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c2875052..823e497e 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
   
   int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*
   std::cout << GridLogDebug << "GPU has:" << std::endl;
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */
   if (warpSize != WARP_SIZE) {
     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
     exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    exit(EXIT_FAILURE);
+    return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 
 template <class vobj, class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -208,6 +208,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 
   Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  
   Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
@@ -218,6 +219,41 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   auto result = buffer_v[0];
   return result;
 }
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+
+  if ( ok ) {
+    ret = sumD_gpu_internal(lat,osites);
+  } else {
+    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
+    Vector<vector> buffer(osites);
+    vector *dat = (vector *)lat;
+    vector *buf = &buffer[0];
+    iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
+    for(int w=0;w<words;w++) {
+
+      accelerator_for(ss,osites,1,{
+	  buf[ss] = dat[ss*words+w];
+	});
+
+      ret_p[w] = sumD_gpu_internal(tbuf,osites);
+    }
+  }
+  return ret;
+}
 
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
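The fallback branch added above is the substance of this patch: getNumBlocksAndThreads now reports failure (return 0) rather than aborting when a single scalar_objectD cannot fit the shared-memory tree reduction, and sumD_gpu then sums the field one SIMD word at a time, so each pass reduces only an iScalar<vector> instead of the whole object. A minimal standalone sketch of that word-slicing idea, in plain C++ with invented types and sizes rather than Grid's, is:

    #include <cstddef>
    #include <vector>

    // Word-by-word reduction sketch: view an array of n large objects as an
    // n x words matrix of machine words and reduce one column at a time, so
    // the per-pass working object is a single word, not the whole struct.
    struct Big { double w[64]; };                  // stand-in for a large sobj

    Big sum_word_by_word(const Big *lat, std::size_t n) {
      const std::size_t words = sizeof(Big)/sizeof(double);
      const double *dat = reinterpret_cast<const double*>(lat);
      std::vector<double> buf(n);
      Big ret{};
      for (std::size_t w = 0; w < words; w++) {
        for (std::size_t ss = 0; ss < n; ss++)     // gather word w of each site
          buf[ss] = dat[ss*words + w];
        double acc = 0;                            // small reduction fits anywhere
        for (std::size_t ss = 0; ss < n; ss++) acc += buf[ss];
        ret.w[w] = acc;
      }
      return ret;
    }

The transpose-style gather costs one extra pass over the data, but it keeps the fast tree reduction usable for arbitrarily large site objects.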
From: Peter Boyle
Date: Fri, 29 Oct 2021 02:23:08 +0100
Subject: [PATCH 02/10] Verbosity

---
 Grid/lattice/Lattice_reduction_gpu.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 823e497e..73a704f5 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -238,7 +238,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   if ( ok ) {
     ret = sumD_gpu_internal(lat,osites);
   } else {
-    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
     Vector<vector> buffer(osites);
     vector *dat = (vector *)lat;
     vector *buf = &buffer[0];

From e8c187b3233aa6a5a02e4a1a31a3e83d629b5f2b Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 15 Feb 2022 11:24:38 -0500
Subject: [PATCH 03/10] SyCL happier?

---
 Grid/qcd/action/fermion/WilsonCloverHelpers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
index 588525cc..60f19317 100644
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -726,8 +726,8 @@ public:
   static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
     conformable(f, m);
     auto grid = f.Grid();
-    const int Nsite = grid->oSites();
-    const int Nsimd = grid->Nsimd();
+    const uint32_t Nsite = grid->oSites();
+    const uint32_t Nsimd = grid->Nsimd();
     autoView(f_v, f, AcceleratorWrite);
     autoView(m_v, m, AcceleratorRead);
     // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
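The int to uint32_t change above is a small portability fix, presumably why the commit asks whether it makes SyCL happier: SYCL and similar device runtimes index work-items with unsigned types, so a signed site count inside an accelerator kernel invites signed/unsigned comparison diagnostics that strict device compilers may treat as errors. A contrived illustration of the pattern, in ordinary C++ rather than Grid code:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // With an unsigned bound the comparison below is clean; with
    // 'const int Nsite' it would mix std::size_t and int and draw
    // -Wsign-compare, which device toolchains often promote to an error.
    void scale_site(float *f, std::size_t id /* e.g. a device global id */,
                    uint32_t Nsite) {
      if (id < Nsite) f[id] *= 2.0f;
    }

    int main() {
      float f[4] = {1, 2, 3, 4};
      for (std::size_t id = 0; id < 4; id++) scale_site(f, id, 4);
      printf("%g %g %g %g\n", f[0], f[1], f[2], f[3]);
      return 0;
    }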
From 63dbaeefaa533717379811f4695a149a82e2d6ec Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 16 Feb 2022 14:01:43 +0000
Subject: [PATCH 04/10] Extra barrier prior to finalize just in case it fixes
 an issue on Tursa

---
 Grid/util/Init.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 6992129e..36854d9c 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -534,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
+  MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif

From 3e882f555dcb75a29e640c6c70fd7e7f23973b73 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 1 Mar 2022 08:54:45 -0500
Subject: [PATCH 05/10] Large / small sumD options

---
 Grid/lattice/Lattice_reduction.h     |  9 +++++
 Grid/lattice/Lattice_reduction_gpu.h | 53 ++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..c3478ab4 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
   return sumD_cpu(arg,osites);
 #endif
 }
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
+{
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  return sumD_gpu_large(arg,osites);
+#else
+  return sumD_cpu(arg,osites);
+#endif
+}
 
 template <class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 73a704f5..c685a2c0 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites)
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -207,7 +207,8 @@ inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer
   Integer size = osites*nsimd;
 
   Integer numThreads, numBlocks;
-  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  assert(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
 
@@ -219,6 +220,37 @@ inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer
   auto result = buffer_v[0];
   return result;
 }
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+
+  Vector<vector> buffer(osites);
+  vector *dat = (vector *)lat;
+  vector *buf = &buffer[0];
+  iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
+  for(int w=0;w<words;w++) {
+    accelerator_for(ss,osites,1,{
+	buf[ss] = dat[ss*words+w];
+      });
+    ret_p[w] = sumD_gpu_small(tbuf,osites);
+  }
+  return ret;
+}
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
@@ -236,23 +268,14 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
 
   if ( ok ) {
-    ret = sumD_gpu_internal(lat,osites);
+    ret = sumD_gpu_small(lat,osites);
   } else {
-    Vector<vector> buffer(osites);
-    vector *dat = (vector *)lat;
-    vector *buf = &buffer[0];
-    iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
-    for(int w=0;w<words;w++) {
-      accelerator_for(ss,osites,1,{
-	  buf[ss] = dat[ss*words+w];
-	});
-      ret_p[w] = sumD_gpu_internal(tbuf,osites);
-    }
+    ret = sumD_gpu_large(lat,osites);
   }
   return ret;
 }
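Patch 05 makes the policy of patch 01 explicit: sumD_gpu_small asserts that the object fits the shared-memory budget, sumD_gpu_large always takes the word-sliced path, and sumD_gpu picks between them using the return code of getNumBlocksAndThreads. The fitting test amounts to asking whether one warp's worth of scalar objects fits in a block's shared memory; a back-of-envelope sketch (the 48 kB figure is an assumption for illustration, the real code queries the device):

    #include <cstddef>
    #include <cstdio>

    constexpr std::size_t kSharedMemPerBlock = 48*1024; // assumed, device-dependent
    constexpr std::size_t kWarpSize          = 32;

    // Mirrors the check in getNumBlocksAndThreads(): reject the tree
    // reduction if even one warp's worth of objects overflows shared memory.
    bool fits_small_path(std::size_t sizeof_sobj) {
      return kWarpSize * sizeof_sobj <= kSharedMemPerBlock;
    }

    int main() {
      printf("96 B object -> %s\n", fits_small_path(96)   ? "small" : "large");
      printf("4 kB object -> %s\n", fits_small_path(4096) ? "small" : "large");
      return 0;
    }

Under these assumed numbers anything up to 1.5 kB per scalar object stays on the fast path, which comfortably covers the usual spin-colour site types.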
From: Peter Boyle
Date: Tue, 1 Mar 2022 10:53:44 -0500
Subject: [PATCH 06/10] Configure for mac arm

---
 systems/mac-arm/config-command-mpi | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 systems/mac-arm/config-command-mpi

diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi
new file mode 100644
index 00000000..d1e75c39
--- /dev/null
+++ b/systems/mac-arm/config-command-mpi
@@ -0,0 +1 @@
+CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi

From e16fc5b2e4dd2d507c8c92a05f987a8aa1ec5c3e Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 1 Mar 2022 11:17:24 -0500
Subject: [PATCH 07/10] Threaded intranode comms transfer - ideally between
 NUMA domains

---
 Grid/threads/Accelerator.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index b427b304..12483185 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -481,9 +481,10 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ GridThread::bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
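The memcpy bodies replaced above are the stubs used by non-GPU builds, where "host to device" is just a host-side copy. Routing them through GridThread::bcopy lets several cores drive one transfer: a single core usually cannot saturate the memory system of a multi-socket node, while a team of threads spread over both NUMA domains can. A toy benchmark of the idea (illustrative only; results depend entirely on the machine, the OpenMP runtime, and thread pinning):

    #include <chrono>
    #include <cstdio>
    #include <cstring>
    #include <vector>
    #include <omp.h>

    int main() {
      const std::size_t bytes = std::size_t(1) << 28;   // 256 MiB
      std::vector<char> src(bytes, 1), dst(bytes);

      auto t0 = std::chrono::steady_clock::now();
      memcpy(dst.data(), src.data(), bytes);            // one core does it all
      auto t1 = std::chrono::steady_clock::now();

    #pragma omp parallel
      {
        int nt = omp_get_num_threads(), t = omp_get_thread_num();
        std::size_t chunk = bytes / nt;
        std::size_t off   = std::size_t(t) * chunk;
        std::size_t len   = (t == nt-1) ? bytes - off : chunk;
        memcpy(dst.data()+off, src.data()+off, len);    // each thread copies a slice
      }
      auto t2 = std::chrono::steady_clock::now();

      auto ms = [](auto a, auto b) {
        return std::chrono::duration<double,std::milli>(b-a).count();
      };
      printf("memcpy   : %.1f ms\nthreaded : %.1f ms\n", ms(t0,t1), ms(t1,t2));
      return 0;
    }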
From d4ae71b8806e535145a718a9e1e00a7315aaf5bc Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Wed, 2 Mar 2022 15:40:18 +0000
Subject: [PATCH 08/10] sum_gpu_large and sum_gpu templates added.

---
 Grid/lattice/Lattice_reduction.h     | 16 ++++++++++++++++
 Grid/lattice/Lattice_reduction_gpu.h |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index c3478ab4..0ddac437 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -168,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   return ssum;
 }
 
+template <class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  autoView( arg_v, arg, AcceleratorRead);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_gpu_large(&arg_v[0],osites);
+#else
+  autoView(arg_v, arg, CpuRead);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_cpu(&arg_v[0],osites);
+#endif
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c685a2c0..c3422af3 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -288,6 +288,14 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
   return result;
 }
 
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu_large(lat,osites);
+  return result;
+}
 
 NAMESPACE_END(Grid);

From d1decee4cce0b9a9f02bb5521cb06840b20022ad Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Wed, 2 Mar 2022 16:54:23 +0000
Subject: [PATCH 09/10] Cleaned up unused variables in Lattice_reduction_gpu.h

---
 Grid/lattice/Lattice_reduction_gpu.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c3422af3..bad86d2a 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -232,10 +232,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
 
   const int words = sizeof(vobj)/sizeof(vector);
 
-  Integer nsimd= vobj::Nsimd();
-  Integer size = osites*nsimd;
-  Integer numThreads, numBlocks;
-
   Vector<vector> buffer(osites);
   vector *dat = (vector *)lat;
   vector *buf = &buffer[0];
@@ -258,10 +254,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   typedef typename vobj::scalar_typeD scalarD;
   typedef typename vobj::scalar_objectD sobj;
   sobj ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  const int words = sizeof(vobj)/sizeof(vector);
-
   Integer nsimd= vobj::Nsimd();
   Integer size = osites*nsimd;
   Integer numThreads, numBlocks;
@@ -275,7 +268,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 
   return ret;
 }
-
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -297,5 +289,4 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
   return result;
 }
 
-
 NAMESPACE_END(Grid);
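Patches 08 and 09 finish the interface: sum_large is to sum what sumD_large is to sumD, the same contract (site-local reduction followed by a GlobalSum across ranks) but safe for site objects beyond the shared-memory limit. A usage sketch, assuming Grid's usual typedefs; the 32x32 matrix is an invented example of an oversized site object and the grid construction boilerplate is omitted:

    #include <Grid/Grid.h>
    using namespace Grid;

    // One 32x32 complex matrix per site is far past the one-warp
    // shared-memory budget that plain sum() relies on.
    typedef iScalar<iMatrix<vComplexD, 32> > BigSite;

    BigSite::scalar_object total_of(GridCartesian *grid) {
      Lattice<BigSite> field(grid);
      field = Zero();
      return sum_large(field);  // word-sliced device reduction + GlobalSum
    }

For ordinary spin-colour fields the existing sum() remains the right call; sum_large simply trades some bandwidth for correctness on big per-site payloads.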
From 92a83a9eb33c54467b29ff557bad4aed90c4f65e Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 16 Mar 2022 17:14:36 +0000
Subject: [PATCH 10/10] Performance improve for Tesseract

---
 Grid/threads/Accelerator.h |  6 +++---
 Grid/threads/Threads.h     | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 12483185..389f2cc4 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -482,9 +482,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ GridThread::bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 
diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index a9fa13ea..6887134d 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -72,3 +72,20 @@ Author: paboyle
 #define thread_region    DO_PRAGMA(omp parallel)
 #define thread_critical  DO_PRAGMA(omp critical)
 
+#ifdef GRID_OMP
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  uint64_t *ufrom = (uint64_t *)from;
+  uint64_t *uto   = (uint64_t *)to;
+  assert(bytes%8==0);
+  uint64_t words=bytes/8;
+  thread_for(w,words,{
+      uto[w] = ufrom[w];
+    });
+}
+#else
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  bcopy(from,to,bytes);
+}
+#endif
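thread_bcopy lifts the same trick into a reusable helper: the transfer is expressed as a thread_for over 64-bit words, so under OpenMP every thread in the team stores a share of the data, with a plain bcopy fallback in non-threaded builds. Note the hard assumption, enforced by the assert, that the length is a multiple of 8 bytes. For readers without the Grid headers, a standalone equivalent with the macro expanded to raw OpenMP (names invented):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // thread_for(w,words,{...}) is essentially an '#pragma omp parallel for'
    // over w = 0..words-1; this is the same copy without the Grid macros.
    inline void parallel_word_copy(const void *from, void *to, std::size_t bytes) {
      assert(bytes % 8 == 0);                  // word-granular transfers only
      const uint64_t *src = (const uint64_t *)from;
      uint64_t *dst = (uint64_t *)to;
      uint64_t words = bytes / 8;
    #pragma omp parallel for
      for (uint64_t w = 0; w < words; w++) dst[w] = src[w];
    }

    int main() {
      std::vector<uint64_t> a(1 << 20, 42), b(1 << 20);
      parallel_word_copy(a.data(), b.data(), a.size()*sizeof(uint64_t));
      printf("b[12345] = %llu\n", (unsigned long long)b[12345]);
      return 0;
    }

Built without OpenMP the pragma is ignored and the loop degenerates to a serial copy, mirroring the bcopy fallback in the patch.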