Performance improve for Tesseract

Threaded intranode comms transfer - ideally between NUMA domains
2026-02-13 18:30:54 +00:00 · 2022-03-16 17:14:36 +00:00 · 2022-03-01 11:17:24 -05:00
2 changed files with 21 additions and 3 deletions
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -481,9 +481,10 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+// Each copy helper below forwards to thread_bcopy with the same (from,to,bytes) argument order.
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};

 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -72,3 +72,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_region                                       DO_PRAGMA(omp parallel)
 #define thread_critical                                     DO_PRAGMA(omp critical)

+#ifdef GRID_OMP
+// Copy `bytes` bytes from `from` to `to` (source first, as in bcopy), moving 64-bit words inside thread_for.
+// NOTE(review): the uint64_t casts assume both pointers are 8-byte aligned and do not overlap - confirm with callers.
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  uint64_t *ufrom = (uint64_t *)from, *uto = (uint64_t *)to;
+  uint64_t words=bytes/8;
+  thread_for(w,words,{ uto[w] = ufrom[w]; });
+  // Copy the bytes%8 tail serially so sizes that are not a multiple of 8 are still copied in full.
+  for(uint64_t b=8*words;b<bytes;b++) ((char *)to)[b] = ((char *)from)[b];
+}
+#else // GRID_OMP not defined: serial bcopy with the same (from,to,bytes) argument order
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  bcopy(from,to,bytes);
+}
+#endif
Author	SHA1	Message	Date
Peter Boyle	92a83a9eb3	Performance improve for Tesseract	2022-03-16 17:14:36 +00:00
Peter Boyle	e16fc5b2e4	Threaded intranode comms transfer - ideally between NUMA domains	2022-03-01 11:17:24 -05:00