Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-13 01:05:36 +00:00)

Commit 07c0c02f8c: Speed up Cshift
Parent: 8c31c065b5
@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
 
-
 NAMESPACE_BEGIN(Grid);
 
 /*Move control to configure.ac and Config.h*/
@@ -157,6 +156,15 @@ public:
 
     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 
+#if 0
+    size_type page_size=4096;
+    size_type pages = (bytes+page_size-1)/page_size;
+    uint8_t *bp = (uint8_t *)ptr;
+
+    accelerator_for(pg,pages,1,{
+      bp[pg*page_size]=0;
+    });
+#endif
     return ptr;
   }
 
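The disabled block above sketches a "first touch" pass: one byte per 4 KiB page is written from the device (via accelerator_for) so unified-memory pages start out resident on the GPU. It is left behind #if 0 in this commit. A minimal host-side sketch of the same idea, with an assumed plain byte buffer (illustrative only, not the Grid allocator):

#include <cstddef>
#include <cstdint>

// Touch one byte per page so the OS/driver places the pages at allocation time
// rather than on first use inside a timed kernel.
void first_touch(void *ptr, std::size_t bytes) {
  const std::size_t page_size = 4096;
  const std::size_t pages = (bytes + page_size - 1) / page_size;
  uint8_t *bp = static_cast<uint8_t *>(ptr);
  for (std::size_t pg = 0; pg < pages; pg++) bp[pg * page_size] = 0;
}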
@@ -81,6 +81,7 @@ public:
 
     bool _isCheckerBoarded;
     int LocallyPeriodic;
+    Coordinate _checker_dim_mask;
 
 public:
 
@@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
 
 public:
     int dummy;
+    Coordinate _checker_dim_mask;
     virtual int CheckerBoardFromOindexTable (int Oindex) {
       return 0;
     }
@@ -104,6 +105,7 @@ public:
     _ldimensions.resize(_ndimension);
     _rdimensions.resize(_ndimension);
     _simd_layout.resize(_ndimension);
+    _checker_dim_mask.resize(_ndimension);;
     _lstart.resize(_ndimension);
     _lend.resize(_ndimension);
 
@@ -114,6 +116,8 @@ public:
 
     for (int d = 0; d < _ndimension; d++)
     {
+      _checker_dim_mask[d]=0;
+
       _fdimensions[d] = dimensions[d]; // Global dimensions
       _gdimensions[d] = _fdimensions[d]; // Global dimensions
       _simd_layout[d] = simd_layout[d];
@@ -35,12 +35,28 @@ static const int CbRed =0;
 static const int CbBlack=1;
 static const int Even =CbRed;
 static const int Odd =CbBlack;
 
+accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
+{
+  int nd=rdim.size();
+  Coordinate coor(nd);
+
+  Lexicographic::CoorFromIndex(coor,oindex,rdim);
+
+  int linear=0;
+  for(int d=0;d<nd;d++){
+    if(chk_dim_msk[d])
+      linear=linear+coor[d];
+  }
+  return (linear&0x1);
+}
+
+
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
 public:
-    Coordinate _checker_dim_mask;
+    //    Coordinate _checker_dim_mask;
     int _checker_dim;
     std::vector<int> _checker_board;
 
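The new RedBlackCheckerBoardFromOindex decodes a reduced-grid site index into coordinates and sums the coordinates of the checkerboarded (masked) dimensions mod 2. A standalone sketch of the same parity computation, with hypothetical names (not the Grid API); the decode assumes dimension 0 runs fastest, as in Lexicographic::CoorFromIndex:

#include <cassert>
#include <cstddef>
#include <vector>

int parity_from_index(int oindex, const std::vector<int> &rdim, const std::vector<int> &mask) {
  int linear = 0;
  for (std::size_t d = 0; d < rdim.size(); d++) {
    int coor = oindex % rdim[d];   // decode coordinate d, dimension 0 fastest
    oindex  /= rdim[d];
    if (mask[d]) linear += coor;   // only masked dimensions contribute to parity
  }
  return linear & 0x1;
}

int main() {
  std::vector<int> rdim = {4, 4};   // 4x4 reduced local volume
  std::vector<int> mask = {1, 1};   // both dimensions checkerboarded
  assert(parity_from_index(0, rdim, mask) == 0);  // site (0,0) is even
  assert(parity_from_index(1, rdim, mask) == 1);  // site (1,0) is odd
}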
@@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
+extern Vector<std::pair<int,int> > Cshift_table;
+
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 ///////////////////////////////////////////////////////////////////
@@ -46,7 +48,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
   int e2=rhs.Grid()->_slice_block[dimension];
   int ent = 0;
 
-  static Vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int stride=rhs.Grid()->_slice_stride[dimension];
 
   auto rhs_v = rhs.View();
@@ -55,7 +58,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
       for(int b=0;b<e2;b++){
        int o = n*stride;
        int bo = n*e2;
-       table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
+       Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
       }
     }
   } else {
@@ -65,13 +68,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
        int o = n*stride;
        int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
        if ( ocb &cbmask ) {
-         table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
+         Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
        }
       }
     }
   }
-  thread_for(i,ent,{
-    buffer[table[i].first]=rhs_v[table[i].second];
+  auto buffer_p = & buffer[0];
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
+    buffer_p[table[i].first]=rhs_v[table[i].second];
   });
 }
 
@@ -97,34 +102,36 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 
   auto rhs_v = rhs.View();
   if ( cbmask ==0x3){
-    thread_for_collapse(2,n,e1,{
-      for(int b=0;b<e2;b++){
+    accelerator_for2d(n,e1,b,e2,1,{
 
        int o = n*n1;
        int offset = b+n*e2;
 
        vobj temp =rhs_v[so+o+b];
        extract<vobj>(temp,pointers,offset);
-      }
     });
   } else {
 
-    // Case of SIMD split AND checker dim cannot currently be hit, except in
-    // Test_cshift_red_black code.
-    std::cout << " Dense packed buffer WARNING " <<std::endl;
-    thread_for_collapse(2,n,e1,{
-      for(int b=0;b<e2;b++){
+    Coordinate rdim=rhs.Grid()->_rdimensions;
+    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
+    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
+    accelerator_for2d(n,e1,b,e2,1,{
+
+      Coordinate coor;
+
       int o=n*n1;
-      int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
+      int oindex = o+b;
+
+      int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
+
+      int ocb=1<<cb;
       int offset = b+n*e2;
 
       if ( ocb & cbmask ) {
        vobj temp =rhs_v[so+o+b];
        extract<vobj>(temp,pointers,offset);
       }
-      }
     });
   }
 }
 
@@ -145,7 +152,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride=rhs.Grid()->_slice_stride[dimension];
 
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int ent =0;
 
   if ( cbmask ==0x3 ) {
@@ -154,7 +162,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
       for(int b=0;b<e2;b++){
        int o =n*rhs.Grid()->_slice_stride[dimension];
        int bo =n*rhs.Grid()->_slice_block[dimension];
-       table[ent++] = std::pair<int,int>(so+o+b,bo+b);
+       Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
       }
     }
 
@@ -165,15 +173,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
        int o =n*rhs.Grid()->_slice_stride[dimension];
        int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
        if ( ocb & cbmask ) {
-         table[ent++]=std::pair<int,int> (so+o+b,bo++);
+         Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
        }
       }
     }
   }
 
   auto rhs_v = rhs.View();
-  thread_for(i,ent,{
-    rhs_v[table[i].first]=buffer[table[i].second];
+  auto buffer_p = & buffer[0];
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
+    rhs_v[table[i].first]=buffer_p[table[i].second];
   });
 }
 
@@ -195,13 +205,11 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 
   if(cbmask ==0x3 ) {
     auto rhs_v = rhs.View();
-    thread_for_collapse(2,n,e1,{
-      for(int b=0;b<e2;b++){
+    accelerator_for2d(n,e1,b,e2,1,{
        int o = n*rhs.Grid()->_slice_stride[dimension];
        int offset = b+n*rhs.Grid()->_slice_block[dimension];
        merge(rhs_v[so+o+b],pointers,offset);
-      }
     });
   } else {
 
     // Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -225,6 +233,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
+
 template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -239,14 +248,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int ent=0;
 
   if(cbmask == 0x3 ){
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o =n*stride+b;
-       table[ent++] = std::pair<int,int>(lo+o,ro+o);
+       Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
   } else {
@@ -255,7 +266,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
        int o =n*stride+b;
        int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-         table[ent++] = std::pair<int,int>(lo+o,ro+o);
+         Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
        }
       }
     }
@@ -263,7 +274,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 
   auto rhs_v = rhs.View();
   auto lhs_v = lhs.View();
-  thread_for(i,ent,{
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
     lhs_v[table[i].first]=rhs_v[table[i].second];
   });
 
@@ -271,7 +283,6 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
-
   int rd = rhs.Grid()->_rdimensions[dimension];
 
   if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@@ -285,27 +296,29 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs.Grid()->_slice_block [dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
 
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int ent=0;
 
   if ( cbmask == 0x3 ) {
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o =n*stride;
-       table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
       }}
   } else {
     for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
       int o =n*stride;
       int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+      if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
      }}
   }
 
   auto rhs_v = rhs.View();
   auto lhs_v = lhs.View();
-  thread_for(i,ent,{
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
    permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
   });
 }
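The recurring change in these hunks replaces each function-local static table with a single global Cshift_table that only ever grows, and swaps the host-threaded copy loop (thread_for / thread_for_collapse) for accelerator_for / accelerator_for2d, so the (dst,src) index table is built once on the host and the copies run on the device. A standalone host-side sketch of the pattern, with illustrative names (not the Grid API):

#include <cstddef>
#include <utility>
#include <vector>

// One table of (dst,src) index pairs that only grows, filled on the host, then
// applied as a flat copy loop; in the commit that loop is an accelerator_for.
static std::vector<std::pair<int,int>> shift_table;

template <class T>
void apply_table(std::vector<T> &dst, const std::vector<T> &src, std::size_t ent) {
  for (std::size_t i = 0; i < ent; i++)
    dst[shift_table[i].first] = src[shift_table[i].second];
}

int main() {
  std::vector<double> src = {1, 2, 3, 4}, dst(4, 0);
  std::size_t ent = 0;
  if (shift_table.size() < src.size()) shift_table.resize(src.size()); // grow only
  for (int i = 0; i < 4; i++) shift_table[ent++] = {i, 3 - i};         // toy "shift": reversal
  apply_table(dst, src, ent);                                          // dst = {4,3,2,1}
}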
@@ -1,10 +1,186 @@
 #include <Grid/GridCore.h>
 
 NAMESPACE_BEGIN(Grid);
-uint32_t accelerator_threads;
+uint32_t accelerator_threads=8;
 uint32_t acceleratorThreads(void) {return accelerator_threads;};
 void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
-#ifdef GRID_SYCL
-cl::sycl::queue *theGridAccelerator;
+
+#ifdef GRID_CUDA
+cudaDeviceProp *gpu_props;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  cudaGetDeviceCount(&nDevices);
+  gpu_props = new cudaDeviceProp[nDevices];
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+
+      GPU_PROP(managedMemory);
+      GPU_PROP(isMultiGpuBoard);
+      GPU_PROP(warpSize);
+      // GPU_PROP(unifiedAddressing);
+      // GPU_PROP(l2CacheSize);
+      // GPU_PROP(singleToDoublePrecisionPerfRatio);
+    }
+  }
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+#else
+  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
+  cudaSetDevice(rank);
 #endif
+  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
+}
+#endif
+
+#ifdef GRID_HIP
+hipDeviceProp_t *gpu_props;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  hipGetDeviceCount(&nDevices);
+  gpu_props = new hipDeviceProp_t[nDevices];
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    hipGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      hipDeviceProp_t prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+
+      // GPU_PROP(managedMemory);
+      GPU_PROP(isMultiGpuBoard);
+      GPU_PROP(warpSize);
+      // GPU_PROP(unifiedAddressing);
+      // GPU_PROP(l2CacheSize);
+      // GPU_PROP(singleToDoublePrecisionPerfRatio);
+    }
+  }
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+#else
+  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
+  cudaSetDevice(rank);
+#endif
+  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
+}
+#endif
+
+#ifdef GRID_SYCL
+
+cl::sycl::queue *theGridAccelerator;
+
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theGridAccelerator = new sycl::queue (selectedDevice);
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  if ( world_rank == 0 ) {
+    GridBanner();
+  }
+  /*
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+    }
+  }
+  */
+  if ( world_rank == 0 ) {
+    printf("GpuInit: ================================================\n");
+  }
+}
+#endif
+
+#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
+void acceleratorInit(void){}
+#endif
+
 NAMESPACE_END(Grid);
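The new acceleratorInit reads the MPI launcher's per-node ("local") rank from the environment before MPI itself is initialised, and uses it to bind each process to a device (except on Summit-like systems, where jsrun already handles the mapping). A standalone sketch of that binding logic, with illustrative names (not the Grid API):

#include <cstdio>
#include <cstdlib>

// Return the node-local rank advertised by common MPI launchers, or 0 if none.
int local_rank_from_env() {
  const char *keys[] = {"OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"};
  for (const char *k : keys) {
    if (const char *v = std::getenv(k)) return std::atoi(v);
  }
  return 0;  // single-process fallback
}

int main() {
  int rank = local_rank_from_env();
  std::printf("would bind this process to device %d (e.g. cudaSetDevice)\n", rank);
}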
@@ -51,6 +51,7 @@ NAMESPACE_BEGIN(Grid);
 //
 // Warp control and info:
 //
+// acceleratorInit;
 // void acceleratorSynchronise(void); // synch warp etc..
 // int acceleratorSIMTlane(int Nsimd);
 //
@@ -69,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
 
 uint32_t acceleratorThreads(void);
 void acceleratorThreads(uint32_t);
+void acceleratorInit(void);
 
 //////////////////////////////////////////////
 // CUDA acceleration
@@ -83,6 +85,32 @@ void acceleratorThreads(uint32_t);
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.x; } // CUDA specific
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
+  { \
+    typedef uint64_t Iterator; \
+    auto lambda = [=] accelerator \
+      (Iterator lane,Iterator iter1,Iterator iter2) mutable { \
+      __VA_ARGS__; \
+    }; \
+    int nt=acceleratorThreads(); \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
+    dim3 cu_blocks (1,(num1+nt-1)/nt,num2); \
+    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num1,num2,lambda); \
+  }
+
+template<typename lambda> __global__
+void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
+{
+  uint64_t x = threadIdx.x;//+ blockDim.x*blockIdx.x;
+  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
+  uint64_t z = threadIdx.z + blockDim.z*blockIdx.z;
+  if ( (x < num1) && (y<num2) && (z<num3) ) {
+    Lambda(x,y,z);
+  }
+}
+
 #define accelerator_barrier(dummy) \
 { \
   cudaDeviceSynchronize(); \
@@ -91,25 +119,9 @@ void acceleratorThreads(uint32_t);
     printf("Cuda error %s \n", cudaGetErrorString( err )); \
     puts(__FILE__); \
     printf("Line %d\n",__LINE__); \
-    exit(0); \
   } \
 }
 
-#define accelerator_forNB( iterator, num, nsimd, ... ) \
-  { \
-    typedef uint64_t Iterator; \
-    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
-      __VA_ARGS__; \
-    }; \
-    dim3 cu_threads(acceleratorThreads(),nsimd); \
-    dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
-    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
-  }
-
-#define accelerator_for( iterator, num, nsimd, ... ) \
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
-  accelerator_barrier(dummy);
-
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -133,15 +145,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 
-template<typename lambda> __global__
-void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
-{
-  uint64_t isite = threadIdx.y;
-  uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
-  if ( (osite <Osites) && (isite<Isites) ) {
-    Lambda(isite,osite);
-  }
-}
-
 #endif
 
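The new CUDA path launches a single three-index LambdaApply kernel: threadIdx.x carries the SIMD lane and the y/z block dimensions carry the two loop indices, with a bounds guard for partially filled blocks. A minimal standalone CUDA sketch of that launch pattern (illustrative names, not Grid's macros; device lambdas in host code assume nvcc with --extended-lambda):

#include <cstdint>
#include <cstdio>

// Generic kernel: x = SIMD lane, y/z = the two loop indices.
template <typename L>
__global__ void lambda_apply(uint64_t nx, uint64_t ny, uint64_t nz, L lambda) {
  uint64_t x = threadIdx.x;
  uint64_t y = threadIdx.y + blockDim.y * blockIdx.y;
  uint64_t z = threadIdx.z + blockDim.z * blockIdx.z;
  if (x < nx && y < ny && z < nz) lambda(x, y, z);
}

int main() {
  const uint64_t nsimd = 4, n1 = 1000, n2 = 3;
  const int nt = 8;                                 // y-threads per block
  dim3 threads(nsimd, nt, 1);
  dim3 blocks(1, (n1 + nt - 1) / nt, n2);
  auto body = [=] __device__(uint64_t lane, uint64_t i, uint64_t j) {
    // loop body goes here; indices map exactly as in accelerator_for2dNB
  };
  lambda_apply<<<blocks, threads>>>(nsimd, n1, n2, body);
  cudaDeviceSynchronize();                          // the accelerator_barrier step
  std::printf("done\n");
}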
@@ -164,25 +167,29 @@ extern cl::sycl::queue *theGridAccelerator;
 #define accelerator
 #define accelerator_inline strong_inline
 
-#define accelerator_forNB(iterator,num,nsimd, ... ) \
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[0]; } // SYCL specific
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
-    cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; \
-    cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
+    int nt=acceleratorThreads(); \
+    unsigned long unum1 = num1; \
+    unsigned long unum2 = num2; \
+    cl::sycl::range<3> local {nsimd,nt,1}; \
+    cl::sycl::range<3> global{nsimd,unum1,unum2}; \
     cgh.parallel_for<class dslash>( \
      cl::sycl::nd_range<3>(global,local), \
      [=] (cl::sycl::nd_item<3> item) mutable { \
-      auto iterator = item.get_global_id(0); \
-      auto lane = item.get_global_id(2); \
+      auto lane = item.get_global_id(0); \
+      auto iter1 = item.get_global_id(1); \
+      auto iter2 = item.get_global_id(2); \
      { __VA_ARGS__ }; \
     }); \
    });
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
+    dim3 cu_blocks (1,(num1+nt-1)/n,num2); \
 
 #define accelerator_barrier(dummy) theGridAccelerator->wait();
 
-#define accelerator_for( iterator, num, nsimd, ... ) \
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
-  accelerator_barrier(dummy);
-
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
@@ -204,33 +211,49 @@ NAMESPACE_BEGIN(Grid);
 
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 
+/*These routines define mapping from thread grid to loop & vector lane indexing */
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_x; } // HIP specific
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
+  { \
+    typedef uint64_t Iterator; \
+    auto lambda = [=] accelerator \
+      (Iterator lane,Iterator iter1,Iterator iter2 ) mutable { \
+      { __VA_ARGS__;} \
+    }; \
+    int nt=acceleratorThreads(); \
+    dim3 hip_threads(nsimd,nt,1); \
+    dim3 hip_blocks (1,(num1+nt-1)/nt,num2); \
+    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
+                       0,0, \
+                       nsimd,num1,num2,lambda); \
+  }
+
+
+template<typename lambda> __global__
+void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
+{
+  uint64_t x = hipThreadIdx_x;//+ hipBlockDim_x*hipBlockIdx_x;
+  uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
+  uint64_t z = hipThreadIdx_z + hipBlockDim_z*hipBlockIdx_z;
+  if ( (x < numx) && (y<numy) && (z<numz) ) {
+    Lambda(x,y,z);
+  }
+}
+
 #define accelerator_barrier(dummy) \
 { \
   hipDeviceSynchronize(); \
   auto err = hipGetLastError(); \
   if ( err != hipSuccess ) { \
-    printf("HIP error %s \n", hipGetErrorString( err )); \
+    printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
     puts(__FILE__); \
     printf("Line %d\n",__LINE__); \
     exit(0); \
   } \
 }
 
-#define accelerator_forNB( iterator, num, nsimd, ... ) \
-  { \
-    typedef uint64_t Iterator; \
-    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
-      __VA_ARGS__; \
-    }; \
-    dim3 hip_threads(acceleratorThreads(),nsimd); \
-    dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
-    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,nsimd,lambda);\
-  }
-
-#define accelerator_for( iterator, num, nsimd, ... ) \
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
-  accelerator_barrier(dummy);
-
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -241,6 +264,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   }
   return ptr;
 };
+
 inline void *acceleratorAllocDevice(size_t bytes)
 {
   void *ptr=NULL;
@@ -251,18 +275,25 @@ inline void *acceleratorAllocDevice(size_t bytes)
   }
   return ptr;
 };
 
 inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 
-template<typename lambda> __global__
-void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
-{
-  uint64_t isite = hipThreadIdx_y;
-  uint64_t osite = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
-  if ( (osite <Osites) && (isite<Isites) ) {
-    Lambda(isite,osite);
-  }
-}
+#endif
+
+//////////////////////////////////////////////
+// Common on all GPU targets
+//////////////////////////////////////////////
+#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
+#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
+
+#define accelerator_for( iter, num, nsimd, ... ) \
+  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
+  accelerator_barrier(dummy);
+
+#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
+  accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
+  accelerator_barrier(dummy);
+
 #endif
 
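With the per-backend macros reduced to accelerator_for2dNB plus accelerator_barrier, the common block above builds everything else by composition: a 1D loop is the 2D loop with a second extent of 1, and the blocking forms are the non-blocking launch followed by the barrier. A host-side model of that layering, with illustrative names (not Grid's macros):

#include <cstdint>
#include <functional>
#include <iostream>

// Non-blocking 2D iteration; on a real backend this would be a kernel launch.
void for2d_nb(uint64_t n1, uint64_t n2,
              const std::function<void(uint64_t, uint64_t)> &body) {
  for (uint64_t i = 0; i < n1; i++)
    for (uint64_t j = 0; j < n2; j++) body(i, j);
}

// 1D blocking loop = 2D loop with unit second extent, then a barrier.
void for1d(uint64_t n, const std::function<void(uint64_t)> &body) {
  for2d_nb(n, 1, [&](uint64_t i, uint64_t) { body(i); });
  // accelerator_barrier() would go here on a device backend
}

int main() {
  for1d(3, [](uint64_t i) { std::cout << "iter " << i << "\n"; });
}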
@@ -280,6 +311,9 @@ void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
 #define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
 #define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
 #define accelerator_barrier(dummy)
+#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
+
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
@@ -303,7 +337,6 @@ inline void acceleratorFreeShared(void *ptr){free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr);};
 #endif
 
-
 #endif // CPU target
 
 ///////////////////////////////////////////////////
@@ -325,25 +358,4 @@ accelerator_inline void acceleratorSynchronise(void)
   return;
 }
 
-////////////////////////////////////////////////////
-// Address subvectors on accelerators
-////////////////////////////////////////////////////
-#ifdef GRID_SIMT
-
-#ifdef GRID_CUDA
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
-#endif
-#ifdef GRID_SYCL
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
-#endif
-#ifdef GRID_HIP
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
-#endif
-
-#else
-
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-
-#endif
-
 NAMESPACE_END(Grid);
@@ -58,6 +58,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #endif
 
 #define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for2d( i1, n1,i2,n2, ... ) \
+  DO_PRAGMA(omp parallel for collapse(2)) \
+  for ( uint64_t i1=0;i1<n1;i1++) { \
+  for ( uint64_t i2=0;i2<n2;i2++) { \
+  { __VA_ARGS__ } ; \
+  }}
 #define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
 #define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 #define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
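On the CPU target, accelerator_for2d falls back to the new thread_for2d, i.e. an OpenMP collapse(2) loop over both indices. A self-contained sketch of what that expansion looks like written out by hand (illustrative, not the macro itself):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t n1 = 4, n2 = 8;
  std::vector<double> plane(n1 * n2, 0.0);
  // Both loop levels are collapsed into one parallel iteration space.
  #pragma omp parallel for collapse(2)
  for (uint64_t i1 = 0; i1 < n1; i1++) {
    for (uint64_t i2 = 0; i2 < n2; i2++) {
      plane[i1 * n2 + i2] = 1.0;   // the loop body (__VA_ARGS__) goes here
    }
  }
  std::printf("filled %zu entries\n", plane.size());
}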
@@ -73,12 +73,6 @@ feenableexcept (unsigned int excepts)
 }
 #endif
 
-uint32_t gpu_threads=8;
-#ifdef GRID_SYCL
-cl::sycl::queue *theGridAccelerator;
-#endif
-
-
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////
@@ -196,16 +190,12 @@ void GridParseLayout(char **argv,int argc,
     assert(ompthreads.size()==1);
     GridThread::SetThreads(ompthreads[0]);
   }
-  if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
+  if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
     std::vector<int> gputhreads(0);
-#ifndef GRID_CUDA
-    std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
-              << " not compiled with GPU support" << std::endl;
-#endif
-    arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads");
+    arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
     GridCmdOptionIntVector(arg,gputhreads);
     assert(gputhreads.size()==1);
-    gpu_threads=gputhreads[0];
+    acceleratorThreads(gputhreads[0]);
   }
 
   if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
@@ -245,8 +235,6 @@ static int Grid_is_initialised;
 /////////////////////////////////////////////////////////
 void GridBanner(void)
 {
-  static int printed =0;
-  if( !printed ) {
   std::cout <<std::endl;
   std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
   std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
@@ -282,125 +270,7 @@ void GridBanner(void)
   std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
 #endif
   std::cout << std::endl;
-  printed=1;
-  }
 }
-#ifdef GRID_CUDA
-cudaDeviceProp *gpu_props;
-void GridGpuInit(void)
-{
-  int nDevices = 1;
-  cudaGetDeviceCount(&nDevices);
-  gpu_props = new cudaDeviceProp[nDevices];
-
-  char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
-  // We extract the local rank initialization using an environment variable
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
-  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
-
-  if ( world_rank == 0 ) {
-    GridBanner();
-  }
-
-  for (int i = 0; i < nDevices; i++) {
-
-#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
-#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
-
-    cudaGetDeviceProperties(&gpu_props[i], i);
-    if ( world_rank == 0) {
-      cudaDeviceProp prop;
-      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
-
-      GPU_PROP(managedMemory);
-      GPU_PROP(isMultiGpuBoard);
-      GPU_PROP(warpSize);
-      // GPU_PROP(unifiedAddressing);
-      // GPU_PROP(l2CacheSize);
-      // GPU_PROP(singleToDoublePrecisionPerfRatio);
-    }
-  }
-#ifdef GRID_IBM_SUMMIT
-  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
-#else
-  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
-  cudaSetDevice(rank);
-#endif
-  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
-}
-#endif
-#ifdef GRID_SYCL
-void GridGpuInit(void)
-{
-  int nDevices = 1;
-  cl::sycl::gpu_selector selector;
-  cl::sycl::device selectedDevice { selector };
-  theGridAccelerator = new sycl::queue (selectedDevice);
-
-  char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
-  // We extract the local rank initialization using an environment variable
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
-  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
-
-  if ( world_rank == 0 ) {
-    GridBanner();
-  }
-  /*
-  for (int i = 0; i < nDevices; i++) {
-
-#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
-#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
-
-    cudaGetDeviceProperties(&gpu_props[i], i);
-    if ( world_rank == 0) {
-      cudaDeviceProp prop;
-      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
-    }
-  }
-  */
-  if ( world_rank == 0 ) {
-    printf("GpuInit: ================================================\n");
-  }
-}
-#endif
-#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
-void GridGpuInit(void){}
-#endif
 
 void Grid_init(int *argc,char ***argv)
 {
@@ -414,7 +284,7 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   // Early intialisation necessities without rank knowledge
   //////////////////////////////////////////////////////////
-  GridGpuInit(); // Must come first to set device prior to MPI init
+  acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
@@ -483,7 +353,6 @@ void Grid_init(int *argc,char ***argv)
     std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
   }
 
-
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
     MemoryProfiler::debug = true;
    MemoryProfiler::stats = &dbgMemStats;
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
   pickCheckerboard(Odd,Uo,U);
 
   // std::cout<<GridLogMessage << U<<std::endl;
-  std::cout<<GridLogMessage<< U <<std::endl;
+  // std::cout<<GridLogMessage<< U <<std::endl;
   std::cout<<GridLogMessage << "U " <<norm2(U)<<std::endl;
   std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
   std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;
@@ -69,6 +69,7 @@ int main (int argc, char ** argv)
 
   ShiftU = Cshift(U,dir,shift); // Shift everything
 
+  std::cout<<GridLogMessage<<"Shifted by "<<shift<<" in direction"<<dir<<" checking the AE35 unit" <<std::endl;
   /*
   std::cout << "U[0]" << U[0]<<std::endl;
   std::cout << "U[1]" << U[1]<<std::endl;
@@ -73,7 +73,7 @@ int main(int argc, char **argv) {
   omp_set_num_threads(omp);
 #endif
 
-  for (int lat = 8; lat <= 16; lat += 40) {
+  for (int lat = 16; lat <= 16; lat += 40) {
     std::cout << "Lat " << lat << std::endl;
 
     latt_size[0] = lat;
@@ -159,15 +159,17 @@ int main(int argc, char **argv) {
   LatticeColourMatrix newFoo = Foo;
   // confirm correctness of copy constructor
   Bar = Foo - newFoo;
-  std::cout << "Copy constructor diff check: ";
+  std::cout << "Copy constructor diff check: \n";
   double test_cc = norm2(Bar);
   if (test_cc < 1e-5){
     std::cout << "OK\n";
-  }
-  else{
+  } else{
+    std::cout << "Foo\n"<<Foo<<std::endl;
+    std::cout << "newFoo\n"<<newFoo<<std::endl;
+    std::cout << "Bar\n"<<Bar<<std::endl;
     std::cout << "fail\n";
     abort();
   }
 
   // Norm2 check
   LatticeReal BarReal(&Fine);