Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc

Aim to reduce the amount of CUDA and other code variations floating around all over the place. Will move GpuInit into Accelerator.cc from Init.cc. Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows.
2025-09-21 02:31:05 +01:00 · 2020-05-08 06:23:55 -07:00
parent 28a1fcaaff
commit f8b8e00090
13 changed files with 557 additions and 718 deletions
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -0,0 +1,10 @@
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+// Thread-count setting shared by the accelerator launch macros.
+// It has static storage, so it starts at zero until the setter is called;
+// the CUDA/HIP launch macros divide by this value, so set it before any launch.
+uint32_t accelerator_threads;
+
+// Read the current accelerator thread count.
+uint32_t acceleratorThreads(void)
+{
+  return accelerator_threads;
+}
+
+// Replace the accelerator thread count with t.
+void acceleratorThreads(uint32_t t)
+{
+  accelerator_threads = t;
+}
+#ifdef GRID_SYCL
+cl::sycl::queue *theGridAccelerator; // Definition of the SYCL queue pointer declared extern in Accelerator.h; not assigned in this file
+#endif
+NAMESPACE_END(Grid);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -0,0 +1,345 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Accelerator.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////////////////////
+// Accelerator primitives; fall back to threading if not CUDA, SYCL or HIP
+//////////////////////////////////////////////////////////////////////////////////
+//
+// Function attributes
+//
+//    accelerator
+//    accelerator_inline
+//
+// Parallel looping
+// 
+//    accelerator_for
+//    accelerator_forNB 
+//    accelerator_barrier(dummy);             // macro, yields no value: device synchronise
+//
+// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
+//
+//    uint32_t acceleratorThreads(void);   
+//    void     acceleratorThreads(uint32_t);
+//
+// Warp control and info:
+//
+//    void     acceleratorSynchronise(void); // synch warp etc..
+//    int      acceleratorSIMTlane(int Nsimd);
+//
+// Memory management:
+//
+//    void *acceleratorAllocShared(size_t bytes);
+//    void acceleratorFreeShared(void *ptr);
+//
+//    void *acceleratorAllocDevice(size_t bytes);
+//    void acceleratorFreeDevice(void *ptr);
+//
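+//    void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);   // listed for the interface; not defined in this header
+//    void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes); // listed for the interface; not defined in this header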
+//
+//////////////////////////////////////////////////////////////////////////////////
+/* Getter and setter for the accelerator thread count used by the launch
+ * macros in this header; both are defined in Accelerator.cc. */
+uint32_t acceleratorThreads(void);   
+void     acceleratorThreads(uint32_t);
+
+//////////////////////////////////////////////
+// CUDA acceleration
+//////////////////////////////////////////////
+#ifdef GRID_CUDA
+/* GRID_SIMT is defined only when __CUDA_ARCH__ is defined, which nvcc does
+ * while compiling the device-side pass of a translation unit. */
+#ifdef __CUDA_ARCH__
+#define GRID_SIMT
+#endif
+/* Function attributes: both mark a function as callable from host and device. */
+#define accelerator        __host__ __device__
+#define accelerator_inline __host__ __device__ inline
+
+// CUDA barrier: wait for all outstanding device work, then query the last
+// CUDA error. On error the error string, the file and the line of the call
+// site are printed and the process terminates with a failure status
+// (previously exit(0), which reported success to the caller's shell/scheduler).
+// The `dummy` argument is unused.
+#define accelerator_barrier(dummy)					\
+  {									\
+    cudaDeviceSynchronize();						\
+    cudaError err = cudaGetLastError();					\
+    if ( cudaSuccess != err ) {						\
+      printf("Cuda error %s \n", cudaGetErrorString( err ));		\
+      puts(__FILE__);							\
+      printf("Line %d\n",__LINE__);					\
+      exit(EXIT_FAILURE);						\
+    }									\
+  }
+/* accelerator_forNB (CUDA): non-blocking data-parallel loop.
+ *  - The loop body (__VA_ARGS__) is wrapped in a by-value, mutable
+ *    host/device lambda taking (lane, iterator).
+ *  - LambdaApply is launched with thread blocks of acceleratorThreads() x nsimd
+ *    and ceil(num / acceleratorThreads()) blocks.
+ *  - No synchronisation or error check happens here; accelerator_barrier does both.
+ * NOTE(review): `num` is not parenthesised in the block-count expression, and
+ * acceleratorThreads() must be non-zero - confirm callers guarantee both. */
+#define accelerator_forNB( iterator, num, nsimd, ... )			\
+  {									\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_threads(acceleratorThreads(),nsimd);			\
+    dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads());			\
+    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda);	\
+  }
+/* accelerator_for (CUDA): blocking form - launches through accelerator_forNB
+ * and then waits and error-checks with accelerator_barrier.
+ * NOTE(review): expands to two statements, so it is not safe as the unbraced
+ * body of an if/for - confirm no caller relies on that. */
+#define accelerator_for( iterator, num, nsimd, ... )		\
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy);
+
+// Allocate `bytes` of CUDA managed memory via cudaMallocManaged.
+// Returns the allocation, or NULL after printing a diagnostic on failure.
+inline void *acceleratorAllocShared(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMallocManaged((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    // The original cast through `_Tp`, a type that is not declared in this
+    // non-template function; a plain NULL is what was meant.
+    ptr = NULL;
+    // size_t must not be printed with %d; cast explicitly and use %lu.
+    printf(" cudaMallocManaged failed for %lu %s \n",(unsigned long)bytes,cudaGetErrorString(err));
+  }
+  return ptr;
+}
+// Allocate `bytes` of device memory via cudaMalloc.
+// Returns the allocation, or NULL after printing a diagnostic on failure.
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    // The original cast through `_Tp`, a type that is not declared in this
+    // non-template function; a plain NULL is what was meant.
+    ptr = NULL;
+    // size_t must not be printed with %d; cast explicitly and use %lu.
+    printf(" cudaMalloc failed for %lu %s \n",(unsigned long)bytes,cudaGetErrorString(err));
+  }
+  return ptr;
+}
+// Release memory obtained from acceleratorAllocShared (CUDA build).
+// The status returned by cudaFree is not inspected.
+inline void acceleratorFreeShared(void *ptr)
+{
+  cudaFree(ptr);
+}
+
+// Release memory obtained from acceleratorAllocDevice (CUDA build).
+// The status returned by cudaFree is not inspected.
+inline void acceleratorFreeDevice(void *ptr)
+{
+  cudaFree(ptr);
+}
+
+// Generic CUDA kernel used by accelerator_forNB.
+//   Isites : extent of the inner (lane) index, taken from threadIdx.y
+//   Osites : extent of the outer (site) index, taken from the x dimension
+//   Lambda : callable invoked as Lambda(lane, site) for every in-range pair
+// Threads that fall outside either extent do nothing.
+template<typename lambda>  __global__
+void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
+{
+  uint64_t lane_idx = threadIdx.y;
+  uint64_t site_idx = threadIdx.x+blockDim.x*blockIdx.x;
+
+  // Guard the tail of the grid: the block count is rounded up by the launcher.
+  if ( !(site_idx < Osites) ) return;
+  if ( !(lane_idx < Isites) ) return;
+
+  Lambda(lane_idx,site_idx);
+}
+
+#endif
+
+//////////////////////////////////////////////
+// SyCL acceleration
+//////////////////////////////////////////////
+
+#ifdef GRID_SYCL
+NAMESPACE_END(Grid);
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
+NAMESPACE_BEGIN(Grid);
+/* SYCL queue that every SYCL macro and allocator in this header submits to;
+ * it is defined in Accelerator.cc. */
+extern cl::sycl::queue *theGridAccelerator;
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define GRID_SIMT // NOTE(review): presumably set only during the SYCL device compilation pass - confirm
+#endif
+
+#define accelerator  // no function attribute is needed on the SYCL path
+#define accelerator_inline strong_inline
+/* accelerator_forNB (SYCL): submits one nd_range<3> kernel to theGridAccelerator.
+ *  - local range  {acceleratorThreads(), 1, nsimd}
+ *  - global range {num, 1, nsimd}
+ *  - inside the body, `iterator` is global id 0 and `lane` is global id 2
+ * The submit is not waited on here; accelerator_barrier waits on the queue.
+ * NOTE(review): unlike the CUDA path the global range is not rounded up, so
+ * presumably num must be a multiple of acceleratorThreads() - confirm.
+ * NOTE(review): every expansion names its kernel `class dslash` - confirm the
+ * SYCL implementation in use accepts a repeated kernel name. */
+#define accelerator_forNB(iterator,num,nsimd, ... )			\
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
+      cl::sycl::range<3> local {acceleratorThreads(),1,nsimd};			\
+      cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
+      cgh.parallel_for<class dslash>(					\
+      cl::sycl::nd_range<3>(global,local),            \
+      [=] (cl::sycl::nd_item<3> item) mutable {       \
+      auto iterator = item.get_global_id(0);	      \
+      auto lane     = item.get_global_id(2);	      \
+      { __VA_ARGS__ };				      \
+     });	   			              \
+    });
+/* SYCL barrier: blocks on theGridAccelerator->wait(); `dummy` is unused. */
+#define accelerator_barrier(dummy) theGridAccelerator->wait();
+
+#define accelerator_for( iterator, num, nsimd, ... )		\
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy); /* blocking form: submit, then wait on the queue */
+
+// SYCL build: allocation helpers bound to the queue *theGridAccelerator.
+
+// Shared allocation of `bytes` via malloc_shared.
+inline void *acceleratorAllocShared(size_t bytes)
+{
+  return malloc_shared(bytes,*theGridAccelerator);
+}
+
+// Device allocation of `bytes` via malloc_device.
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  return malloc_device(bytes,*theGridAccelerator);
+}
+
+// Release a pointer returned by acceleratorAllocShared.
+inline void acceleratorFreeShared(void *ptr)
+{
+  free(ptr,*theGridAccelerator);
+}
+
+// Release a pointer returned by acceleratorAllocDevice.
+inline void acceleratorFreeDevice(void *ptr)
+{
+  free(ptr,*theGridAccelerator);
+}
+
+#endif
+
+//////////////////////////////////////////////
+// HIP acceleration
+//////////////////////////////////////////////
+#ifdef GRID_HIP
+/* GRID_SIMT is defined only when __HIP_DEVICE_COMPILE__ is defined.
+ * NOTE(review): presumably that is set during the HIP device pass - confirm. */
+#ifdef __HIP_DEVICE_COMPILE__
+#define GRID_SIMT
+#endif
+/* Function attributes: both mark a function as callable from host and device. */
+#define accelerator        __host__ __device__
+#define accelerator_inline __host__ __device__ inline
+// HIP barrier: wait for all outstanding device work, then query the last HIP
+// error. On error the error string, the file and the line of the call site
+// are printed and the process terminates with a failure status (previously
+// exit(0), which reported success). The `dummy` argument is unused.
+#define accelerator_barrier(dummy)				\
+  {								\
+    hipDeviceSynchronize();					\
+    auto err = hipGetLastError();				\
+    if ( err != hipSuccess ) {					\
+      printf("HIP error %s \n", hipGetErrorString( err )); \
+      puts(__FILE__); \
+      printf("Line %d\n",__LINE__);					\
+      exit(EXIT_FAILURE);					\
+    }								\
+  }
+
+// accelerator_forNB (HIP): non-blocking data-parallel loop.
+// The loop body is wrapped in a by-value, mutable host/device lambda taking
+// (lane, iterator) and handed to LambdaApply, launched with thread blocks of
+// acceleratorThreads() x nsimd and ceil(num / acceleratorThreads()) blocks.
+// No synchronisation or error check happens here; accelerator_barrier does both.
+//
+// Fix: the kernel arguments were `num,simd,lambda`. `simd` is not a macro
+// parameter, and LambdaApply expects (Isites, Osites, Lambda) - i.e. the lane
+// extent first - exactly as the CUDA launch passes (nsimd, num, lambda).
+#define accelerator_forNB( iterator, num, nsimd, ... )			\
+  {									\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
+      __VA_ARGS__;							\
+    };									\
+    dim3 hip_threads(acceleratorThreads(),nsimd);				\
+    dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads());			\
+    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,nsimd,num,lambda);\
+  }
+/* accelerator_for (HIP): blocking form - launches through accelerator_forNB
+ * and then waits and error-checks with accelerator_barrier.
+ * NOTE(review): expands to two statements, so it is not safe as the unbraced
+ * body of an if/for - confirm no caller relies on that. */
+#define accelerator_for( iterator, num, nsimd, ... )		\
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy);
+
+// Allocate `bytes` of HIP managed memory via hipMallocManaged.
+// Returns the allocation, or NULL after printing a diagnostic on failure.
+inline void *acceleratorAllocShared(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMallocManaged((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    // The original cast through `_Tp`, a type that is not declared in this
+    // non-template function; a plain NULL is what was meant.
+    ptr = NULL;
+    // size_t must not be printed with %d; cast explicitly and use %lu.
+    printf(" hipMallocManaged failed for %lu %s \n",(unsigned long)bytes,hipGetErrorString(err));
+  }
+  return ptr;
+}
+// Allocate `bytes` of device memory via hipMalloc.
+// Returns the allocation, or NULL after printing a diagnostic on failure.
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    // The original cast through `_Tp`, a type that is not declared in this
+    // non-template function; a plain NULL is what was meant.
+    ptr = NULL;
+    // size_t must not be printed with %d; cast explicitly and use %lu.
+    printf(" hipMalloc failed for %lu %s \n",(unsigned long)bytes,hipGetErrorString(err));
+  }
+  return ptr;
+}
+// Release memory obtained from acceleratorAllocShared (HIP build).
+// The status returned by hipFree is not inspected.
+inline void acceleratorFreeShared(void *ptr)
+{
+  hipFree(ptr);
+}
+
+// Release memory obtained from acceleratorAllocDevice (HIP build).
+// The status returned by hipFree is not inspected.
+inline void acceleratorFreeDevice(void *ptr)
+{
+  hipFree(ptr);
+}
+
+// Generic HIP kernel used by accelerator_forNB.
+//   Isites : extent of the inner (lane) index, taken from hipThreadIdx_y
+//   Osites : extent of the outer (site) index, taken from the x dimension
+//   Lambda : callable invoked as Lambda(lane, site) for every in-range pair
+// Threads that fall outside either extent do nothing.
+template<typename lambda>  __global__
+void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
+{
+  uint64_t lane_idx = hipThreadIdx_y;
+  uint64_t site_idx = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
+
+  // Guard the tail of the grid: the block count is rounded up by the launcher.
+  if ( !(site_idx < Osites) ) return;
+  if ( !(lane_idx < Isites) ) return;
+
+  Lambda(lane_idx,site_idx);
+}
+
+#endif
+
+//////////////////////////////////////////////
+// CPU Target - No accelerator just thread instead
+//////////////////////////////////////////////
+#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
+/* CPU fallback (no CUDA, SYCL or HIP): GRID_SIMT is explicitly cleared. */
+#undef GRID_SIMT
+
+#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned 
+/* On this target both loop macros map to thread_for, the nsimd argument is
+ * dropped, and the barrier expands to nothing. */
+#define accelerator 
+#define accelerator_inline strong_inline
+#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
+#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
+#define accelerator_barrier(dummy) 
+
+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#endif
+
+// CPU build: "shared" and "device" memory are both ordinary host memory,
+// aligned to GRID_ALLOC_ALIGN. _mm_malloc/_mm_free are used when
+// HAVE_MM_MALLOC_H is defined, memalign/free otherwise.
+inline void *acceleratorAllocShared(size_t bytes)
+{
+#ifdef HAVE_MM_MALLOC_H
+  return _mm_malloc(bytes,GRID_ALLOC_ALIGN);
+#else
+  return memalign(GRID_ALLOC_ALIGN,bytes);
+#endif
+}
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+#ifdef HAVE_MM_MALLOC_H
+  return _mm_malloc(bytes,GRID_ALLOC_ALIGN);
+#else
+  return memalign(GRID_ALLOC_ALIGN,bytes);
+#endif
+}
+inline void acceleratorFreeShared(void *ptr)
+{
+#ifdef HAVE_MM_MALLOC_H
+  _mm_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+inline void acceleratorFreeDevice(void *ptr)
+{
+#ifdef HAVE_MM_MALLOC_H
+  _mm_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+
+#endif // CPU target
+
+///////////////////////////////////////////////////
+// Synchronise across local threads for divergence resynch
+///////////////////////////////////////////////////
+// Re-synchronise threads after divergent code. Only device passes
+// (GRID_SIMT) emit anything; on the host this is an empty function.
+//   CUDA : __syncwarp()
+//   SYCL : nothing yet
+//   HIP  : __syncthreads()
+// NOTE(review): __syncthreads() is a block-wide barrier, unlike the warp-level
+// call used for CUDA - confirm it is safe when some threads of a block skip
+// the lambda because of the bounds check in LambdaApply.
+accelerator_inline void acceleratorSynchronise(void) 
+{
+#if defined(GRID_SIMT) && defined(GRID_CUDA)
+  __syncwarp();
+#endif
+#if defined(GRID_SIMT) && defined(GRID_SYCL)
+  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
+#endif
+#if defined(GRID_SIMT) && defined(GRID_HIP)
+  __syncthreads();
+#endif
+}
+
+////////////////////////////////////////////////////
+// Address subvectors on accelerators
+////////////////////////////////////////////////////
+#ifdef GRID_SIMT
+/* Device pass: the lane is read from the thread's own coordinates; the Nsimd
+ * argument is not used by any of the variants below. */
+#ifdef GRID_CUDA
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific: same index LambdaApply passes as the lane
+#endif
+#ifdef GRID_SYCL
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific: component 2 of the local invocation id
+#endif
+#ifdef GRID_HIP
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
+#endif
+
+#else
+
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // Host / non-SIMT fallback (any target): always lane 0
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -2,7 +2,7 @@

    Grid physics library, www.github.com/paboyle/Grid 

-    Source file: ./lib/Threads.h
+    Source file: ./lib/Pragmas.h

    Copyright (C) 2015

@@ -28,148 +28,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once

-#ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif
-
-#define strong_inline     __attribute__((always_inline)) inline
-#define UNROLL  _Pragma("unroll")
-
-//////////////////////////////////////////////////////////////////////////////////
-// New primitives; explicit host thread calls, and accelerator data parallel calls
-//////////////////////////////////////////////////////////////////////////////////
-
-#ifdef _OPENMP
-#define GRID_OMP
-#include <omp.h>
-#endif
-
-#ifdef GRID_OMP
-#define DO_PRAGMA_(x) _Pragma (#x)
-#define DO_PRAGMA(x) DO_PRAGMA_(x)
-#define thread_num(a) omp_get_thread_num()
-#define thread_max(a) omp_get_max_threads()
-#else 
-#define DO_PRAGMA_(x) 
-#define DO_PRAGMA(x) 
-#define thread_num(a) (0)
-#define thread_max(a) (1)
-#endif
-
-#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
-#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_region                                       DO_PRAGMA(omp parallel)
-#define thread_critical                                     DO_PRAGMA(omp critical)
-
-
-//////////////////////////////////////////////////////////////////////////////////
-// Accelerator primitives; fall back to threading if not CUDA or SYCL
-//////////////////////////////////////////////////////////////////////////////////
-
-#ifdef GRID_CUDA
-
-extern uint32_t gpu_threads;
-
-#ifdef __CUDA_ARCH__
-#define GRID_SIMT
-#endif
-
-#define accelerator        __host__ __device__
-#define accelerator_inline __host__ __device__ inline
-
-template<typename lambda>  __global__
-void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
-{
-  uint64_t isite = threadIdx.y;
-  uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
-  if ( (osite <Osites) && (isite<Isites) ) {
-    Lambda(isite,osite);
-  }
-}
-
-/////////////////////////////////////////////////////////////////
-// Internal only really... but need to call when 
-/////////////////////////////////////////////////////////////////
-#define accelerator_barrier(dummy)				\
-  {								\
-    cudaDeviceSynchronize();					\
-    cudaError err = cudaGetLastError();				\
-    if ( cudaSuccess != err ) {					\
-      printf("Cuda error %s \n", cudaGetErrorString( err )); \
-      puts(__FILE__); \
-      printf("Line %d\n",__LINE__);					\
-      exit(0);							\
-    }								\
-  }
-
-// Copy the for_each_n style ; Non-blocking variant
-#define accelerator_forNB( iterator, num, nsimd, ... )			\
-  {									\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(gpu_threads,nsimd);					\
-    dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads);			\
-    LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda);	\
-  }
-
-// Copy the for_each_n style ; Non-blocking variant (default
-#define accelerator_for( iterator, num, nsimd, ... )		\
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
-  accelerator_barrier(dummy);
-
-#endif
-
-#ifdef GRID_SYCL
-
-#ifdef __SYCL_DEVICE_ONLY__
-#define GRID_SIMT
-#endif
-
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-
-extern cl::sycl::queue *theGridAccelerator;
-
-extern uint32_t gpu_threads;
-
-#define accelerator 
-#define accelerator_inline strong_inline
-
-#define accelerator_forNB(iterator,num,nsimd, ... )			\
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      cl::sycl::range<3> local {gpu_threads,1,nsimd};			\
-      cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
-      cgh.parallel_for<class dslash>(					\
-      cl::sycl::nd_range<3>(global,local),            \
-      [=] (cl::sycl::nd_item<3> item) mutable {       \
-      auto iterator = item.get_global_id(0);	      \
-      auto lane     = item.get_global_id(2);	      \
-      { __VA_ARGS__ };				      \
-     });	   			              \
-    });
-
-#define accelerator_barrier(dummy) theGridAccelerator->wait();
-
-#define accelerator_for( iterator, num, nsimd, ... )		\
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
-  accelerator_barrier(dummy);
-
-
-#endif
-
-#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
-
-#define accelerator 
-#define accelerator_inline strong_inline
-#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
-#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
-#define accelerator_barrier(dummy) 
-
-#endif
+#include <Grid/threads/Threads.h>
+#include <Grid/threads/Accelerator.h>
--- a/Grid/threads/ThreadReduction.h
+++ b/Grid/threads/ThreadReduction.h
@@ -0,0 +1,127 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/ThreadReduction.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 
+
+// Introduce a class to gain deterministic bit reproducible reduction.
+// make static; perhaps just a namespace is required.
+NAMESPACE_BEGIN(Grid);
+
+// Static bookkeeping for host threading: thread/core counts, splitting work
+// between threads, an ordered thread sum, and a threaded byte copy.
+// Without GRID_OMP every setter forces the count to 1 and barriers are no-ops.
+class GridThread {
+public:
+  static int _threads;
+  static int _hyperthreads; // not read or written by any member of this class
+  static int _cores;
+
+  // Record the core count (forced to 1 without OpenMP).
+  static void SetCores(int cr) { 
+#ifdef GRID_OMP
+    _cores = cr;
+#else 
+    _cores = 1;
+#endif
+  }
+  // Use at most `thr` threads, capped at omp_get_max_threads().
+  static void SetThreads(int thr) { 
+#ifdef GRID_OMP
+    _threads = MIN(thr,omp_get_max_threads()) ;
+    omp_set_num_threads(_threads);
+#else 
+    _threads = 1;
+#endif
+  };
+  // Use as many threads as OpenMP currently allows.
+  static void SetMaxThreads(void) { 
+#ifdef GRID_OMP
+    _threads = omp_get_max_threads();
+    omp_set_num_threads(_threads);
+#else 
+    _threads = 1;
+#endif
+  };
+  // Threads per core; asserts that _threads is a multiple of _cores.
+  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
+  static int GetCores(void)   { return _cores; };
+  static int GetThreads(void) { return _threads; };
+  // Number of slots ThreadSum needs in its scratch array.
+  static int SumArraySize(void) {return _threads;};
+
+  // Split nwork items over _threads workers; see the five-argument overload.
+  static void GetWork(int nwork, int me, int & mywork, int & myoff){
+    GetWork(nwork,me,mywork,myoff,_threads);
+  }
+  // Split nwork items over `units` workers.
+  //   me     : index of the calling worker
+  //   mywork : (out) number of items assigned to `me`
+  //   myoff  : (out) index of the first item assigned to `me`
+  // Workers with me >= units get nothing. When nwork is not a multiple of
+  // units, the highest-numbered workers each take one extra item.
+  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
+    int basework = nwork/units;
+    int backfill = units-(nwork%units);
+    if ( me >= units ) { 
+      mywork = myoff = 0;
+    } else { 
+      mywork = (nwork+me)/units;
+      myoff  = basework * me;
+      if ( me > backfill ) 
+	myoff+= (me-backfill);
+    }
+    return;
+  };
+
+  // Barrier, then compute this thread's share of nwork; `me` receives the thread id.
+  static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
+    me     = ThreadBarrier();
+    GetWork(nwork,me,mywork,myoff);
+  };
+
+  // OpenMP barrier returning the caller's thread number (0 without OpenMP).
+  static int  ThreadBarrier(void) {
+#ifdef GRID_OMP
+#pragma omp barrier
+    return omp_get_thread_num();
+#else
+    return 0;
+#endif
+  };
+  
+  // Sum `val` across threads: each thread stores its value in sum_array[me],
+  // then, after a barrier, adds up all _threads entries in index order, so
+  // every thread accumulates in the same order. sum_array needs at least
+  // _threads entries.
+  template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
+    sum_array[me] = val;
+    val=Zero();
+    ThreadBarrier();
+    for(int i=0;i<_threads;i++) val+= sum_array[i];
+    ThreadBarrier();
+  }
+
+  // Copy len bytes from src to dst (argument order follows the C library
+  // bcopy). With OpenMP the range is split between threads by GetWorkBarrier.
+  //
+  // Fix: the inner calls were unqualified, so name lookup found this member
+  // itself and the function recursed without end in both branches. They are
+  // now qualified with `::` to reach the C library bcopy (declared in
+  // <strings.h>), which is what the unqualified calls were meant to reach.
+  static void bcopy(const void *src, void *dst, size_t len) {
+#ifdef GRID_OMP
+#pragma omp parallel 
+    {
+      const char *c_src =(char *) src;
+      char *c_dest=(char *) dst;
+      int me,mywork,myoff;
+      GridThread::GetWorkBarrier(len,me, mywork,myoff);
+      ::bcopy(&c_src[myoff],&c_dest[myoff],mywork);
+    }
+#else 
+    ::bcopy(src,dst,len);
+#endif
+  }
+
+
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -28,101 +28,41 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once 

+#ifndef MAX // MIN is only defined together with MAX
+#define MAX(x,y) ((x)>(y)?(x):(y)) // larger of x and y; one argument is evaluated twice
+#define MIN(x,y) ((x)>(y)?(y):(x)) // smaller of x and y; one argument is evaluated twice
+#endif

-// Introduce a class to gain deterministic bit reproducible reduction.
-// make static; perhaps just a namespace is required.
-NAMESPACE_BEGIN(Grid);
+#define strong_inline     __attribute__((always_inline)) inline // inline plus the always_inline attribute (GCC/Clang attribute syntax)
+#define UNROLL  _Pragma("unroll") // expands to the pragma "unroll" at the point of use

-class GridThread {
-public:
-  static int _threads;
-  static int _hyperthreads;
-  static int _cores;
+//////////////////////////////////////////////////////////////////////////////////
+// New primitives; explicit host thread calls, and accelerator data parallel calls
+//////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _OPENMP // when _OPENMP is defined, enable the OpenMP code paths
+#define GRID_OMP
+#include <omp.h>
+#endif

-  static void SetCores(int cr) { 
 #ifdef GRID_OMP
-    _cores = cr;
+#define DO_PRAGMA_(x) _Pragma (#x) // stringify x and emit it as a pragma
+#define DO_PRAGMA(x) DO_PRAGMA_(x) // extra level so macros inside x are expanded before stringification
+#define thread_num(a) omp_get_thread_num() // argument is ignored
+#define thread_max(a) omp_get_max_threads() // argument is ignored
 #else 
-    _cores = 1;
+#define DO_PRAGMA_(x)  // no OpenMP: pragmas expand to nothing
+#define DO_PRAGMA(x)  // no OpenMP: pragmas expand to nothing
+#define thread_num(a) (0) // single thread: always thread 0
+#define thread_max(a) (1) // single thread: at most one thread
 #endif
-  }
-  static void SetThreads(int thr) { 
-#ifdef GRID_OMP
-    _threads = MIN(thr,omp_get_max_threads()) ;
-    omp_set_num_threads(_threads);
-#else 
-    _threads = 1;
-#endif
-  };
-  static void SetMaxThreads(void) { 
-#ifdef GRID_OMP
-    _threads = omp_get_max_threads();
-    omp_set_num_threads(_threads);
-#else 
-    _threads = 1;
-#endif
-  };
-  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
-  static int GetCores(void)   { return _cores; };
-  static int GetThreads(void) { return _threads; };
-  static int SumArraySize(void) {return _threads;};

-  static void GetWork(int nwork, int me, int & mywork, int & myoff){
-    GetWork(nwork,me,mywork,myoff,_threads);
-  }
-  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
-    int basework = nwork/units;
-    int backfill = units-(nwork%units);
-    if ( me >= units ) { 
-      mywork = myoff = 0;
-    } else { 
-      mywork = (nwork+me)/units;
-      myoff  = basework * me;
-      if ( me > backfill ) 
-	myoff+= (me-backfill);
-    }
-    return;
-  };
-
-  static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
-    me     = ThreadBarrier();
-    GetWork(nwork,me,mywork,myoff);
-  };
-
-  static int  ThreadBarrier(void) {
-#ifdef GRID_OMP
-#pragma omp barrier
-    return omp_get_thread_num();
-#else
-    return 0;
-#endif
-  };
-  
-  template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
-    sum_array[me] = val;
-    val=Zero();
-    ThreadBarrier();
-    for(int i=0;i<_threads;i++) val+= sum_array[i];
-    ThreadBarrier();
-  }
-
-  static void bcopy(const void *src, void *dst, size_t len) {
-#ifdef GRID_OMP
-#pragma omp parallel 
-    {
-      const char *c_src =(char *) src;
-      char *c_dest=(char *) dst;
-      int me,mywork,myoff;
-      GridThread::GetWorkBarrier(len,me, mywork,myoff);
-      bcopy(&c_src[myoff],&c_dest[myoff],mywork);
-    }
-#else 
-    bcopy(src,dst,len);
-#endif
-  }
-
-
-};
-
-NAMESPACE_END(Grid);
+#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ; // parallel loop, i in [0,num), static schedule
+#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ; // as thread_for, with bounds taken from container.begin()/end()
+#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ; // "omp for" only: opens no parallel region of its own
+#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ; // parallel loop with collapse(2)
+#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ; // parallel loop with collapse(N)
+#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ; // "omp for" with collapse(N): opens no parallel region of its own
+#define thread_region                                       DO_PRAGMA(omp parallel) // emits "omp parallel" before the statement that follows
+#define thread_critical                                     DO_PRAGMA(omp critical) // emits "omp critical" before the statement that follows