Systematise the accelerator primitives and move them to Grid/threads/Accelerator.h and Accelerator.cc

Aim to reduce the amount of CUDA and other code variations floating around all over the place. Will move GpuInit into Accelerator.cc from Init.cc. Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows.
2025-07-26 01:17:06 +01:00 · 2020-05-08 06:23:55 -07:00
parent 28a1fcaaff
commit f8b8e00090
13 changed files with 557 additions and 718 deletions
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@@ -49,7 +49,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/log/Log.h>
 #include <Grid/allocator/AlignedAllocator.h>
 #include <Grid/simd/Simd.h>
-#include <Grid/threads/Threads.h>
+#include <Grid/threads/ThreadReduction.h>
 #include <Grid/serialisation/Serialisation.h>
 #include <Grid/util/Sha.h>
 #include <Grid/communicator/Communicator.h> 
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -1,14 +1,3 @@
-    // blockZaxpy in bockPromote - 3s, 5%
-    // noncoalesced linalg in Preconditionoer ~ 3s 5%
-    // Lancos tuning or replace 10-20s ~ 25%, open ended
-    // setup tuning   5s  ~  8%
-    //    -- e.g. ordermin, orderstep tunables.
-    // MdagM path without norm in LinOp code.     few seconds
-
-    // Mdir calc blocking kernels
-    // Fuse kernels in blockMaskedInnerProduct
-    // preallocate Vectors in Cayley 5D ~ few percent few seconds
-
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -91,34 +80,7 @@ public:
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
-      
-    //// report back
-    std::cout<<GridLogMessage<<"directions    :";
-    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
-    std::cout<<std::endl;
-    std::cout<<GridLogMessage<<"displacements :";
-    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
-    std::cout<<std::endl;
  }
-  
-  /*
-  // Original cleaner code
-  Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
-  for(int d=0;d<dimension;d++){
-  directions[2*d  ] = d;
-  directions[2*d+1] = d;
-  displacements[2*d  ] = +1;
-  displacements[2*d+1] = -1;
-  }
-  directions   [2*dimension]=0;
-  displacements[2*dimension]=0;
-  }
-  std::vector<int> GetDelta(int point) {
-  std::vector<int> delta(dimension,0);
-  delta[directions[point]] = displacements[point];
-  return delta;
-  };
-  */    

 };
  
@@ -149,25 +111,7 @@ public:
    CoarseScalar InnerProd(CoarseGrid); 
    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
-    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
-    //    blockOrthogonalise(InnerProd,subspace);
-    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
-    //      CheckOrthogonal();
  } 
-  void CheckOrthogonal(void){
-    CoarseVector iProj(CoarseGrid); 
-    CoarseVector eProj(CoarseGrid); 
-    for(int i=0;i<nbasis;i++){
-      blockProject(iProj,subspace[i],subspace);
-      eProj=Zero(); 
-      accelerator_for(ss, CoarseGrid->oSites(),1,{
-	eProj[ss](i)=CComplex(1.0);
-      });
-      eProj=eProj - iProj;
-      std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
-    }
-    std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
-  }
  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
    blockProject(CoarseVec,FineVec,subspace);
  }
@@ -175,50 +119,12 @@ public:
    FineVec.Checkerboard() = subspace[0].Checkerboard();
    blockPromote(CoarseVec,FineVec,subspace);
  }
-  void CreateSubspaceRandom(GridParallelRNG &RNG){
-    for(int i=0;i<nbasis;i++){
-      random(RNG,subspace[i]);
-    }
-  }
-
-  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
-
-    RealD scale;
-
-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-
-    for(int b=0;b<nn;b++){
-	
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-	
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      for(int i=0;i<1;i++){
-
-	CG(hermop,noise,subspace[b]);
-
-	noise = subspace[b];
-	scale = std::pow(norm2(noise),-0.5); 
-	noise=noise*scale;
-
-      }
-
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-      subspace[b]   = noise;
-
-    }
-  }

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
  ////////////////////////////////////////////////////////////////////////////////////////////////
-#if 1
+
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
@@ -313,201 +219,6 @@ public:
    }
    assert(b==nn);
  }
-#endif
-#if 0
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-    FineField combined(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-#define FILTERb(llo,hhi,oorder)						\
-    {									\
-      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
-      Cheb(hermop,noise,Mn);						\
-      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
-      subspace[b]   = Mn;						\
-      hermop.Op(Mn,tmp);						\
-      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
-      b++;								\
-    }									
-
-    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\
-
-    RealD alpha=-0.8;
-    RealD beta =-0.8;
-#define FILTER(llo,hhi,oorder)						\
-    {									\
-      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
-      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
-      Cheb(hermop,noise,Mn);						\
-      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
-      subspace[b]   = Mn;						\
-      hermop.Op(Mn,tmp);						\
-      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
-      b++;								\
-    }									
-    
-#define FILTERc(llo,hhi,oorder)				\
-    {							\
-      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
-      Cheb(hermop,noise,combined);			\
-    }									
-
-    double node = 0.000;
-    FILTERb(lo,hi,orderfilter);// 0
-    //    FILTERc(node,hi,51);// 0
-    noise = Mn;
-    int base = 0;
-    int mult = 100;
-    FILTER(node,hi,base+1*mult);
-    FILTER(node,hi,base+2*mult);
-    FILTER(node,hi,base+3*mult);
-    FILTER(node,hi,base+4*mult);
-    FILTER(node,hi,base+5*mult);
-    FILTER(node,hi,base+6*mult);
-    FILTER(node,hi,base+7*mult);
-    FILTER(node,hi,base+8*mult);
-    FILTER(node,hi,base+9*mult);
-    FILTER(node,hi,base+10*mult);
-    FILTER(node,hi,base+11*mult);
-    FILTER(node,hi,base+12*mult);
-    FILTER(node,hi,base+13*mult);
-    FILTER(node,hi,base+14*mult);
-    FILTER(node,hi,base+15*mult);
-    assert(b==nn);
-  }
-#endif
-
-#if 0
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-    FineField combined(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-    {						
-      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
-      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
-      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
-      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
-      JacobiPoly(hermop,noise,Mn);
-      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
-      b++;
-      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
-      //      subspace[b]   = tmp;      b++;
-      //    }									
-    }									
-
-#define FILTER(lambda)						\
-    {								\
-      hermop.HermOp(subspace[0],tmp);				\
-      tmp = tmp - lambda *subspace[0];				\
-      scale = std::pow(norm2(tmp),-0.5);			\
-      tmp=tmp*scale;							\
-      subspace[b]   = tmp;						\
-      hermop.Op(subspace[b],tmp);					\
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
-      b++;								\
-    }									
-    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
-    //      subspace[b]   = tmp;      b++;
-    //    }									
-
-    FILTER(2.0e-5);
-    FILTER(2.0e-4);
-    FILTER(4.0e-4);
-    FILTER(8.0e-4);
-    FILTER(8.0e-4);
-
-    FILTER(2.0e-3);
-    FILTER(3.0e-3);
-    FILTER(4.0e-3);
-    FILTER(5.0e-3);
-    FILTER(6.0e-3);
-
-    FILTER(2.5e-3);
-    FILTER(3.5e-3);
-    FILTER(4.5e-3);
-    FILTER(5.5e-3);
-    FILTER(6.5e-3);
-
-    //    FILTER(6.0e-5);//6
-    //    FILTER(7.0e-5);//8
-    //    FILTER(8.0e-5);//9
-    //    FILTER(9.0e-5);//3
-
-    /*
-    //    FILTER(1.0e-4);//10
-    FILTER(2.0e-4);//11
-    //   FILTER(3.0e-4);//12
-    //    FILTER(4.0e-4);//13
-    FILTER(5.0e-4);//14
-
-    FILTER(6.0e-3);//4
-    FILTER(7.0e-4);//1
-    FILTER(8.0e-4);//7
-    FILTER(9.0e-4);//15
-    FILTER(1.0e-3);//2
-
-    FILTER(2.0e-3);//2
-    FILTER(3.0e-3);//2
-    FILTER(4.0e-3);//2
-    FILTER(5.0e-3);//2
-    FILTER(6.0e-3);//2
-
-    FILTER(7.0e-3);//2
-    FILTER(8.0e-3);//2
-    FILTER(1.0e-2);//2
-    */
-    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
-    assert(b==nn);
-  }
-#endif
-

 };

@@ -580,23 +291,22 @@ public:
      int ptype;
      StencilEntry *SE;

-      int lane=SIMTlane(Nsimd);
      for(int point=0;point<geom.npoint;point++){

 	SE=Stencil.GetEntry(ptype,point,ss);
 	  
 	if(SE->_is_local) { 
-	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
 	}
-	synchronise();
+	acceleratorSynchronise();

 	for(int bb=0;bb<nbasis;bb++) {
 	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
      }
-      coalescedWrite(out_v[ss](b),res,lane);
+      coalescedWrite(out_v[ss](b),res);
    });
    usecs +=usecond();

@@ -604,13 +314,6 @@ public:
    RealD Nout= norm2(out);
    nrm_usec+=usecond();

-    /*
-        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
-        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
-        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
-    */
    return Nout;
  };

@@ -658,45 +361,20 @@ public:
      int ptype;
      StencilEntry *SE;

-      int lane=SIMTlane(Nsimd);
      SE=Stencil.GetEntry(ptype,point,ss);
 	  
      if(SE->_is_local) { 
-	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
      } else {
-	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
      }
-      synchronise();
+      acceleratorSynchronise();

      for(int bb=0;bb<nbasis;bb++) {
 	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
      }
-      coalescedWrite(out_v[ss](b),res,lane);
+      coalescedWrite(out_v[ss](b),res);
    });
-#if 0
-    accelerator_for(ss,Grid()->oSites(),1,{
-
-      siteVector res = Zero();
-      siteVector nbr;
-      int ptype;
-      StencilEntry *SE;
-      
-      SE=Stencil.GetEntry(ptype,point,ss);
-      
-      if(SE->_is_local&&SE->_permute) {
-	permute(nbr,in_v[SE->_offset],ptype);
-      } else if(SE->_is_local) {
-	nbr = in_v[SE->_offset];
-      } else {
-	nbr = Stencil.CommBuf()[SE->_offset];
-      }
-      synchronise();
-
-      res = res + Aview_p[point][ss]*nbr;
-      
-      out_v[ss]=res;
-    });
-#endif
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
@@ -912,33 +590,8 @@ public:
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
      ForceHermitian();
    }
-      // AssertHermitian();
-      // ForceDiagonal();
  }

-#if 0
-    ///////////////////////////
-    // test code worth preserving in if block
-    ///////////////////////////
-    std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
-    for(int p=0;p<geom.npoint;p++){
-      std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
-      std::cout<<GridLogMessage<< A[p] << std::endl;
-    }
-    std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
-
-    phi=Subspace.subspace[0];
-    std::vector<int> bc(FineGrid->_ndimension,0);
-
-    blockPick(Grid(),phi,tmp,bc);      // Pick out a block
-    linop.Op(tmp,Mphi);                // Apply big dop
-    blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
-    std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
-    std::cout<<GridLogMessage<< iProj <<std::endl;
-    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
-#endif
-
-
  void ForceHermitian(void) {
    CoarseMatrix Diff  (Grid());
    for(int p=0;p<geom.npoint;p++){
@@ -958,27 +611,6 @@ public:
      }
    }
  }
-  void AssertHermitian(void) {
-    CoarseMatrix AA    (Grid());
-    CoarseMatrix AAc   (Grid());
-    CoarseMatrix Diff  (Grid());
-    for(int d=0;d<4;d++){
-	
-      int dd=d+1;
-      AAc = Cshift(A[2*d+1],dd,1);
-      AA  = A[2*d];
-	
-      Diff = AA - adj(AAc);
-
-      std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
-      std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
-	  
-    }
-    Diff = A[8] - adj(A[8]);
-    std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
-    std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
-  }
-    
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -60,6 +60,7 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
  typedef decltype(basis[0].View()) View;
  auto tmp_v = basis[0].View();
  Vector<View> basis_v(basis.size(),tmp_v);
+  View *basis_vp = &basis_v[0];
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();

@@ -129,7 +130,7 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i

      for(int k=k0; k<k1; ++k){
 	auto tmp = coalescedRead(Bp[ss*nrot+j]);
-	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
+	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
      }
    });

@@ -138,7 +139,7 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
      int jj  =j0+j;
      int ss =sj/nrot;
      int sss=ss+s;
-      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+      coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
    });
  }
 #endif
@@ -155,6 +156,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
  result.Checkerboard() = basis[0].Checkerboard();
  auto result_v=result.View();
  Vector<View> basis_v(basis.size(),result_v);
+  View * basis_vp = &basis_v[0];
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
@@ -162,10 +164,10 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    auto B=coalescedRead(basis_v[k0][ss]);
+    auto B=coalescedRead(basis_vp[k0][ss]);
    B=Zero();
    for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
+      B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
    }
    coalescedWrite(result_v[ss], B);
  });
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -29,28 +29,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H

-#ifdef HAVE_MALLOC_MALLOC_H
-#include <malloc/malloc.h>
-#endif
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
-
-#ifdef HAVE_MM_MALLOC_H
-#include <mm_malloc.h>
-#endif
-
-#define POINTER_CACHE
-#define GRID_ALLOC_ALIGN (2*1024*1024)

 NAMESPACE_BEGIN(Grid);

-// Move control to configure.ac and Config.h?
+/*Move control to configure.ac and Config.h*/
+#define POINTER_CACHE
+/*Pinning pages is costly*/
+/*Could maintain separate large and small allocation caches*/
 #ifdef POINTER_CACHE
 class PointerCache {
 private:
-/*Pinning pages is costly*/
-/*Could maintain separate large and small allocation caches*/

  static const int Ncache=128;
  static int victim;
@@ -159,44 +147,16 @@ public:
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);

-
 #ifdef POINTER_CACHE
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
 #else
    pointer ptr = nullptr;
 #endif

-#ifdef GRID_CUDA
-    ////////////////////////////////////
-    // Unified (managed) memory
-    ////////////////////////////////////
-    if ( ptr == (_Tp *) NULL ) {
-      //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
-      auto err = cudaMallocManaged((void **)&ptr,bytes);
-      if( err != cudaSuccess ) {
-	ptr = (_Tp *) NULL;
-	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
-	assert(0);
-      }
-    } 
-    assert( ptr != (_Tp *)NULL);
-#endif
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) acceleratorAllocShared(bytes);

-#ifdef GRID_SYCL
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) malloc_shared(bytes,*theGridAccelerator);
-#endif    
-
-#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
-    //////////////////////////////////////////////////////////////////////////////////////////
-    // 2MB align; could make option probably doesn't need configurability
-    //////////////////////////////////////////////////////////////////////////////////////////
-  #ifdef HAVE_MM_MALLOC_H
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
-  #else
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
-  #endif
    assert( ptr != (_Tp *)NULL);
-#endif
+
    return ptr;
  }

@@ -211,20 +171,7 @@ public:
    pointer __freeme = __p;
 #endif

-#ifdef GRID_CUDA
-    if ( __freeme ) cudaFree((void *)__freeme);
-#endif
-#ifdef GRID_SYCL
-    if ( __freeme ) free((void *)__freeme,*theGridAccelerator);
-#endif    
-
-#if ( !defined(GRID_CUDA)) && (!defined(GRID_SYCL))
-  #ifdef HAVE_MM_MALLOC_H
-    if ( __freeme ) _mm_free((void *)__freeme); 
-  #else
-    if ( __freeme ) free((void *)__freeme);
-  #endif
-#endif
+    if ( __freeme ) acceleratorFreeShared((void *)__freeme);
  }

  // FIXME: hack for the copy constructor, eventually it must be avoided
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -461,8 +461,8 @@ public:
    }

    {
-      // Obtain one reseeded generator per thread
-      int Nthread = GridThread::GetThreads();
+      // Obtain one reseeded generator per thread      
+      int Nthread = 32; // Hardwire a good level of parallelism
      std::vector<RngEngine> seeders(Nthread);
      for(int t=0;t<Nthread;t++){
 	seeders[t] = Reseed(master_engine);
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -100,7 +100,7 @@ public:
    _Spinor tmp;

    const int Nsimd =SiteDoubledGaugeField::Nsimd();
-    int s = SIMTlane(Nsimd);
+    int s = acceleratorSIMTlane(Nsimd);
    St.iCoorFromIindex(icoor,s);

    int mmu = mu % Nd;
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  synchronise();						\
+  acceleratorSynchronise();						\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);
  
@@ -76,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else if ( st.same_node[Dir] ) {				\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  synchronise();						\
+  acceleratorSynchronise();						\
  if (SE->_is_local || st.same_node[Dir] ) {			\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
  }								\
-  synchronise();						
+  acceleratorSynchronise();						

 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
@@ -91,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    Recon(result, Uchi);					\
    nmu++;							\
  }								\
-  synchronise();						
+  acceleratorSynchronise();						

 #define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
    if (SE->_is_local ) {					\
@@ -101,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    } else {							\
      chi = coalescedRead(buf[SE->_offset],lane);		\
    }								\
-    synchronise();						\
+    acceleratorSynchronise();					\
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
    Recon(result, Uchi);					

@@ -128,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
@@ -155,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
  int ptype;

  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
@@ -183,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);

  result=Zero();
  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
@@ -205,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);

  calcHalfSpinor chi;
  //  calcHalfSpinor *chi_p;
@@ -241,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
  int ptype;
  int nmu=0;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  result=Zero();
  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
@@ -272,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  int ptype;
  int nmu=0;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  result=Zero();
  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
@@ -302,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  StencilEntry *SE;							\
  int ptype;								\
  const int Nsimd = SiteHalfSpinor::Nsimd();				\
-  const int lane=SIMTlane(Nsimd);					\
+  const int lane=acceleratorSIMTlane(Nsimd);					\
 									\
  SE = st.GetEntry(ptype, dir, sF);					\
  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
@@ -330,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);

  SE = st.GetEntry(ptype, dir, sF);
  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -31,24 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-//accelerator_inline void SIMTsynchronise(void) 
-accelerator_inline void synchronise(void) 
-{
-#ifdef GRID_SIMT
-#ifdef GRID_CUDA
-//  __syncthreads();
-  __syncwarp();
-#endif
-#endif
-  return;
-}

 #ifndef GRID_SIMT
 //////////////////////////////////////////
 // Trivial mapping of vectors on host
 //////////////////////////////////////////
-accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific
-
 template<class vobj> accelerator_inline
 vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
 {
@@ -68,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
 template<class vobj> accelerator_inline
 void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 {
-  //  vstream(vec, extracted);
  vec = extracted;
 }
 template<class vobj> accelerator_inline
@@ -77,31 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
  vstream(vec, extracted);
 }
 #else
-#ifdef GRID_CUDA
-accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
-#endif
-#ifdef GRID_SYCL
-//accelerator_inline int SIMTlane(int Nsimd) { return __spirv_BuiltInGlobalInvocationId[2]; } //SYCL specific
-accelerator_inline int SIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
-#endif

 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd()))
+typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  return extractLane(lane,vec);
 }
 template<class vobj> accelerator_inline
-typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd()))
+typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  int mask = vobj::Nsimd() >> (ptype + 1);		
  int plane= doperm ? lane ^ mask : lane;
  return extractLane(plane,vec);
 }
 template<class vobj> accelerator_inline
-void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd()))
+void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 {
  insertLane(lane,vec,extracted);
 }
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -0,0 +1,10 @@
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+uint32_t accelerator_threads;
+uint32_t acceleratorThreads(void)       {return accelerator_threads;};
+void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
+#ifdef GRID_SYCL
+cl::sycl::queue *theGridAccelerator;
+#endif
+NAMESPACE_END(Grid);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -0,0 +1,345 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Accelerator.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////////////////////
+// Accelerator primitives; fall back to threading if not CUDA, SYCL or HIP
+//////////////////////////////////////////////////////////////////////////////////
+//
+// Function attributes
+//
+//    accelerator
+//    accelerator_inline
+//
+// Parallel looping
+// 
+//    accelerator_for
+//    accelerator_forNB 
+//    accelerator_barrier(dummy);             // device synchronise (macro; yields no value)
+//
+// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
+//
+//    uint32_t acceleratorThreads(void);   
+//    void     acceleratorThreads(uint32_t);
+//
+// Warp control and info:
+//
+//    void     acceleratorSynchronise(void); // synch warp etc..
+//    int      acceleratorSIMTlane(int Nsimd);
+//
+// Memory management:
+//
+//    void *acceleratorAllocShared(size_t bytes);
+//    void acceleratorFreeShared(void *ptr);
+//
+//    void *acceleratorAllocDevice(size_t bytes);
+//    void acceleratorFreeDevice(void *ptr);
+//
+//    void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);
+//    void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
+//
+//////////////////////////////////////////////////////////////////////////////////
+
+uint32_t acceleratorThreads(void);   
+void     acceleratorThreads(uint32_t);
+
+//////////////////////////////////////////////
+// CUDA acceleration
+//////////////////////////////////////////////
+#ifdef GRID_CUDA
+
+#ifdef __CUDA_ARCH__
+#define GRID_SIMT
+#endif
+
+#define accelerator        __host__ __device__
+#define accelerator_inline __host__ __device__ inline
+
+#define accelerator_barrier(dummy) /* synchronise the device, then terminate on any pending CUDA error */ \
+  {									\
+    cudaDeviceSynchronize();						\
+    cudaError err = cudaGetLastError();					\
+    if ( cudaSuccess != err ) {						\
+      printf("Cuda error %s \n", cudaGetErrorString( err ));		\
+      puts(__FILE__);							\
+      printf("Line %d\n",__LINE__);					\
+      exit(1); /* non-zero status: a device failure must not be reported as success */ \
+    }									\
+  }
+
+#define accelerator_forNB( iterator, num, nsimd, ... ) /* launch body over num x nsimd; issues no barrier itself */ \
+  {									\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_threads(acceleratorThreads(),nsimd); /* block shape: x = acceleratorThreads(), y = nsimd */ \
+    dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); /* ceil(num / acceleratorThreads()) */ \
+    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); /* Isites = nsimd, Osites = num */ \
+  }
+
+#define accelerator_for( iterator, num, nsimd, ... ) /* accelerator_forNB followed by accelerator_barrier */ \
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy);
+
+inline void *acceleratorAllocShared(size_t bytes) // cudaMallocManaged wrapper; prints a message and returns NULL on failure
+{
+  void *ptr=NULL;
+  auto err = cudaMallocManaged((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = NULL; // was (_Tp *) NULL, but no _Tp is declared in this non-template function
+    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err)); // %zu: bytes is a size_t
+  }
+  return ptr;
+};
+inline void *acceleratorAllocDevice(size_t bytes) // cudaMalloc wrapper; prints a message and returns NULL on failure
+{
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = NULL; // was (_Tp *) NULL, but no _Tp is declared in this non-template function
+    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err)); // %zu: bytes is a size_t
+  }
+  return ptr;
+};
+inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; // counterpart of acceleratorAllocShared; cudaFree status is discarded
+inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; // counterpart of acceleratorAllocDevice; cudaFree status is discarded
+
+template<typename lambda>  __global__
+void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
+{
+  // One thread per (inner,outer) pair: y thread index is the inner site, x index across blocks is the outer site
+  const uint64_t inner = threadIdx.y;
+  const uint64_t outer = threadIdx.x+blockDim.x*blockIdx.x;
+  if ( (outer >= Osites) || (inner >= Isites) ) return; // thread lies outside the requested iteration space
+  Lambda(inner,outer);
+}
+
+#endif
+
+//////////////////////////////////////////////
+// SyCL acceleration
+//////////////////////////////////////////////
+
+#ifdef GRID_SYCL
+NAMESPACE_END(Grid);
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
+NAMESPACE_BEGIN(Grid);
+
+extern cl::sycl::queue *theGridAccelerator;
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define GRID_SIMT
+#endif
+
+#define accelerator 
+#define accelerator_inline strong_inline
+
+#define accelerator_forNB(iterator,num,nsimd, ... ) /* submit body to theGridAccelerator; no wait() is issued here */ \
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
+      cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; /* work-group shape */ \
+      cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; /* NOTE(review): confirm num is a multiple of acceleratorThreads(), nd_range presumably requires it */ \
+      cgh.parallel_for<class dslash>(					\
+      cl::sycl::nd_range<3>(global,local),            \
+      [=] (cl::sycl::nd_item<3> item) mutable {       \
+      auto iterator = item.get_global_id(0); /* outer index, dimension 0 */ \
+      auto lane     = item.get_global_id(2); /* SIMD lane, dimension 2 */ \
+      { __VA_ARGS__ };				      \
+     });	   			              \
+    });
+
+#define accelerator_barrier(dummy) theGridAccelerator->wait(); /* calls wait() on the global queue; the dummy argument is unused */
+
+#define accelerator_for( iterator, num, nsimd, ... ) /* accelerator_forNB followed by accelerator_barrier */ \
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy);
+
+inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; // malloc_shared against the global queue
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; // malloc_device against the global queue
+inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; // two-argument free taking the queue, not ::free(void*)
+inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; // two-argument free taking the queue, not ::free(void*)
+
+#endif
+
+//////////////////////////////////////////////
+// HIP acceleration
+//////////////////////////////////////////////
+#ifdef GRID_HIP
+
+#ifdef __HIP_DEVICE_COMPILE__
+#define GRID_SIMT
+#endif
+
+#define accelerator        __host__ __device__
+#define accelerator_inline __host__ __device__ inline
+#define accelerator_barrier(dummy) /* synchronise the device, then terminate on any pending HIP error */ \
+  {								\
+    hipDeviceSynchronize();					\
+    auto err = hipGetLastError();				\
+    if ( err != hipSuccess ) {					\
+      printf("HIP error %s \n", hipGetErrorString( err )); \
+      puts(__FILE__); \
+      printf("Line %d\n",__LINE__);					\
+      exit(1); /* non-zero status: a device failure must not be reported as success */ \
+    }								\
+  }
+
+#define accelerator_forNB( iterator, num, nsimd, ... ) /* launch body over num x nsimd; issues no barrier itself */ \
+  {									\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
+      __VA_ARGS__;							\
+    };									\
+    dim3 hip_threads(acceleratorThreads(),nsimd); /* block shape: x = acceleratorThreads(), y = nsimd */ \
+    dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); /* ceil(num / acceleratorThreads()) */ \
+    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,nsimd,num,lambda); /* Isites = nsimd, Osites = num; was (num,simd): undeclared name, swapped order */ \
+  }
+
+#define accelerator_for( iterator, num, nsimd, ... ) /* accelerator_forNB followed by accelerator_barrier */ \
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
+  accelerator_barrier(dummy);
+
+inline void *acceleratorAllocShared(size_t bytes) // hipMallocManaged wrapper; prints a message and returns NULL on failure
+{
+  void *ptr=NULL;
+  auto err = hipMallocManaged((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = NULL; // was (_Tp *) NULL, but no _Tp is declared in this non-template function
+    printf(" hipMallocManaged failed for %zu %s \n",bytes,hipGetErrorString(err)); // %zu: bytes is a size_t
+  }
+  return ptr;
+};
+inline void *acceleratorAllocDevice(size_t bytes) // hipMalloc wrapper; prints a message and returns NULL on failure
+{
+  void *ptr=NULL;
+  auto err = hipMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = NULL; // was (_Tp *) NULL, but no _Tp is declared in this non-template function
+    printf(" hipMalloc failed for %zu %s \n",bytes,hipGetErrorString(err)); // %zu: bytes is a size_t
+  }
+  return ptr;
+};
+inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; // counterpart of acceleratorAllocShared; hipFree status is discarded
+inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; // counterpart of acceleratorAllocDevice; hipFree status is discarded
+
+template<typename lambda>  __global__
+void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
+{
+  // One thread per (inner,outer) pair: y thread index is the inner site, x index across blocks is the outer site
+  const uint64_t inner = hipThreadIdx_y;
+  const uint64_t outer = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
+  if ( (outer >= Osites) || (inner >= Isites) ) return; // thread lies outside the requested iteration space
+  Lambda(inner,outer);
+}
+
+#endif
+
+//////////////////////////////////////////////
+// CPU Target - No accelerator just thread instead
+//////////////////////////////////////////////
+#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
+
+#undef GRID_SIMT
+
+#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned 
+
+#define accelerator /* no function attribute needed on the CPU target */
+#define accelerator_inline strong_inline
+#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ }); /* nsimd is ignored; plain thread_for over num */
+#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); /* same expansion as accelerator_for on this target */
+#define accelerator_barrier(dummy) /* expands to nothing on the CPU target */
+
+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#endif
+
+#ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; // _mm_malloc takes (size, alignment)
+inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; // same allocator as Shared on the CPU target
+inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
+inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
+#else
+inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; // memalign takes (alignment, size)
+inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; // same allocator as Shared on the CPU target
+inline void acceleratorFreeShared(void *ptr){free(ptr);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+#endif
+
+
+#endif // CPU target
+
+///////////////////////////////////////////////////
+// Synchronise across local threads for divergence resynch
+///////////////////////////////////////////////////
+accelerator_inline void acceleratorSynchronise(void) // does nothing unless GRID_SIMT is defined
+{
+#ifdef GRID_SIMT
+#ifdef GRID_CUDA
+  __syncwarp(); // CUDA: warp-level synchronisation
+#endif
+#ifdef GRID_SYCL
+  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
+#endif
+#ifdef GRID_HIP
+  __syncthreads(); // NOTE(review): presumably wider in scope than the CUDA __syncwarp above - confirm this is intended
+#endif
+#endif
+  return;
+}
+
+////////////////////////////////////////////////////
+// Address subvectors on accelerators
+////////////////////////////////////////////////////
+#ifdef GRID_SIMT
+
+#ifdef GRID_CUDA
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
+#endif
+#ifdef GRID_SYCL
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
+#endif
+#ifdef GRID_HIP
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
+#endif
+
+#else
+
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // GRID_SIMT not defined: always lane 0, Nsimd is unused
+
+#endif
+
+NAMESPACE_END(Grid);
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -2,7 +2,7 @@

    Grid physics library, www.github.com/paboyle/Grid 

-    Source file: ./lib/Threads.h
+    Source file: ./lib/Pragmas.h

    Copyright (C) 2015

@@ -28,148 +28,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once

-#ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif
-
-#define strong_inline     __attribute__((always_inline)) inline
-#define UNROLL  _Pragma("unroll")
-
-//////////////////////////////////////////////////////////////////////////////////
-// New primitives; explicit host thread calls, and accelerator data parallel calls
-//////////////////////////////////////////////////////////////////////////////////
-
-#ifdef _OPENMP
-#define GRID_OMP
-#include <omp.h>
-#endif
-
-#ifdef GRID_OMP
-#define DO_PRAGMA_(x) _Pragma (#x)
-#define DO_PRAGMA(x) DO_PRAGMA_(x)
-#define thread_num(a) omp_get_thread_num()
-#define thread_max(a) omp_get_max_threads()
-#else 
-#define DO_PRAGMA_(x) 
-#define DO_PRAGMA(x) 
-#define thread_num(a) (0)
-#define thread_max(a) (1)
-#endif
-
-#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
-#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define thread_region                                       DO_PRAGMA(omp parallel)
-#define thread_critical                                     DO_PRAGMA(omp critical)
-
-
-//////////////////////////////////////////////////////////////////////////////////
-// Accelerator primitives; fall back to threading if not CUDA or SYCL
-//////////////////////////////////////////////////////////////////////////////////
-
-#ifdef GRID_CUDA
-
-extern uint32_t gpu_threads;
-
-#ifdef __CUDA_ARCH__
-#define GRID_SIMT
-#endif
-
-#define accelerator        __host__ __device__
-#define accelerator_inline __host__ __device__ inline
-
-template<typename lambda>  __global__
-void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
-{
-  uint64_t isite = threadIdx.y;
-  uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
-  if ( (osite <Osites) && (isite<Isites) ) {
-    Lambda(isite,osite);
-  }
-}
-
-/////////////////////////////////////////////////////////////////
-// Internal only really... but need to call when 
-/////////////////////////////////////////////////////////////////
-#define accelerator_barrier(dummy)				\
-  {								\
-    cudaDeviceSynchronize();					\
-    cudaError err = cudaGetLastError();				\
-    if ( cudaSuccess != err ) {					\
-      printf("Cuda error %s \n", cudaGetErrorString( err )); \
-      puts(__FILE__); \
-      printf("Line %d\n",__LINE__);					\
-      exit(0);							\
-    }								\
-  }
-
-// Copy the for_each_n style ; Non-blocking variant
-#define accelerator_forNB( iterator, num, nsimd, ... )			\
-  {									\
-    typedef uint64_t Iterator;						\
-    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
-      __VA_ARGS__;							\
-    };									\
-    dim3 cu_threads(gpu_threads,nsimd);					\
-    dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads);			\
-    LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda);	\
-  }
-
-// Copy the for_each_n style ; Non-blocking variant (default
-#define accelerator_for( iterator, num, nsimd, ... )		\
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
-  accelerator_barrier(dummy);
-
-#endif
-
-#ifdef GRID_SYCL
-
-#ifdef __SYCL_DEVICE_ONLY__
-#define GRID_SIMT
-#endif
-
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-
-extern cl::sycl::queue *theGridAccelerator;
-
-extern uint32_t gpu_threads;
-
-#define accelerator 
-#define accelerator_inline strong_inline
-
-#define accelerator_forNB(iterator,num,nsimd, ... )			\
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      cl::sycl::range<3> local {gpu_threads,1,nsimd};			\
-      cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
-      cgh.parallel_for<class dslash>(					\
-      cl::sycl::nd_range<3>(global,local),            \
-      [=] (cl::sycl::nd_item<3> item) mutable {       \
-      auto iterator = item.get_global_id(0);	      \
-      auto lane     = item.get_global_id(2);	      \
-      { __VA_ARGS__ };				      \
-     });	   			              \
-    });
-
-#define accelerator_barrier(dummy) theGridAccelerator->wait();
-
-#define accelerator_for( iterator, num, nsimd, ... )		\
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
-  accelerator_barrier(dummy);
-
-
-#endif
-
-#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
-
-#define accelerator 
-#define accelerator_inline strong_inline
-#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
-#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
-#define accelerator_barrier(dummy) 
-
-#endif
+#include <Grid/threads/Threads.h>
+#include <Grid/threads/Accelerator.h>
--- a/Grid/threads/ThreadReduction.h
+++ b/Grid/threads/ThreadReduction.h
@@ -0,0 +1,127 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/ThreadReduction.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 
+
+// Introduce a class to gain deterministic bit reproducible reduction.
+// make static; perhaps just a namespace is required.
+NAMESPACE_BEGIN(Grid);
+
+class GridThread { // static-only holder of host thread/core counts plus work-splitting and reduction helpers
+public:
+  static int _threads;
+  static int _hyperthreads;
+  static int _cores;
+
+  static void SetCores(int cr) { // records cr under GRID_OMP, otherwise forces 1
+#ifdef GRID_OMP
+    _cores = cr;
+#else 
+    _cores = 1;
+#endif
+  }
+  static void SetThreads(int thr) { // clamps thr to omp_get_max_threads() and applies it; 1 without OpenMP
+#ifdef GRID_OMP
+    _threads = MIN(thr,omp_get_max_threads()) ;
+    omp_set_num_threads(_threads);
+#else 
+    _threads = 1;
+#endif
+  };
+  static void SetMaxThreads(void) { // adopts omp_get_max_threads(); 1 without OpenMP
+#ifdef GRID_OMP
+    _threads = omp_get_max_threads();
+    omp_set_num_threads(_threads);
+#else 
+    _threads = 1;
+#endif
+  };
+  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
+  static int GetCores(void)   { return _cores; };
+  static int GetThreads(void) { return _threads; };
+  static int SumArraySize(void) {return _threads;};
+
+  static void GetWork(int nwork, int me, int & mywork, int & myoff){ // split across _threads workers
+    GetWork(nwork,me,mywork,myoff,_threads);
+  }
+  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){ // worker me gets [myoff, myoff+mywork)
+    int basework = nwork/units;
+    int backfill = units-(nwork%units); // workers with index >= backfill receive one extra item
+    if ( me >= units ) { 
+      mywork = myoff = 0;
+    } else { 
+      mywork = (nwork+me)/units;
+      myoff  = basework * me;
+      if ( me > backfill ) 
+	myoff+= (me-backfill); // skip the extra items taken by workers backfill..me-1
+    }
+    return;
+  };
+
+  static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){ // barrier, then GetWork for the calling thread
+    me     = ThreadBarrier();
+    GetWork(nwork,me,mywork,myoff);
+  };
+
+  static int  ThreadBarrier(void) { // OpenMP barrier; returns the caller's thread number (0 without OpenMP)
+#ifdef GRID_OMP
+#pragma omp barrier
+    return omp_get_thread_num();
+#else
+    return 0;
+#endif
+  };
+  
+  template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
+    sum_array[me] = val; // publish this caller's contribution in its own slot
+    val=Zero();
+    ThreadBarrier();     // all slots written before anyone reads
+    for(int i=0;i<_threads;i++) val+= sum_array[i]; // every caller adds the slots in the same order
+    ThreadBarrier();     // nobody overwrites a slot while others still read
+  }
+
+  static void bcopy(const void *src, void *dst, size_t len) { // copies len bytes src -> dst, split across threads under GRID_OMP
+#ifdef GRID_OMP
+#pragma omp parallel 
+    {
+      const char *c_src =(const char *) src;
+      char *c_dest=(char *) dst;
+      int me,mywork,myoff; // NOTE(review): len is narrowed to int by GetWorkBarrier - confirm callers stay below INT_MAX bytes
+      GridThread::GetWorkBarrier(len,me, mywork,myoff);
+      ::bcopy(&c_src[myoff],&c_dest[myoff],mywork); // '::' selects the library bcopy; unqualified it names this member and recurses
+    }
+#else 
+    ::bcopy(src,dst,len); // '::' selects the library bcopy; unqualified it names this member and recurses without end
+#endif
+  }
+
+
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -28,101 +28,41 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once 

+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#define MIN(x,y) ((x)>(y)?(y):(x))
+#endif

-// Introduce a class to gain deterministic bit reproducible reduction.
-// make static; perhaps just a namespace is required.
-NAMESPACE_BEGIN(Grid);
+#define strong_inline     __attribute__((always_inline)) inline
+#define UNROLL  _Pragma("unroll")

-class GridThread {
-public:
-  static int _threads;
-  static int _hyperthreads;
-  static int _cores;
+//////////////////////////////////////////////////////////////////////////////////
+// New primitives; explicit host thread calls, and accelerator data parallel calls
+//////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _OPENMP
+#define GRID_OMP
+#include <omp.h>
+#endif

-  static void SetCores(int cr) { 
 #ifdef GRID_OMP
-    _cores = cr;
+#define DO_PRAGMA_(x) _Pragma (#x)
+#define DO_PRAGMA(x) DO_PRAGMA_(x)
+#define thread_num(a) omp_get_thread_num()
+#define thread_max(a) omp_get_max_threads()
 #else 
-    _cores = 1;
+#define DO_PRAGMA_(x) 
+#define DO_PRAGMA(x) 
+#define thread_num(a) (0)
+#define thread_max(a) (1)
 #endif
-  }
-  static void SetThreads(int thr) { 
-#ifdef GRID_OMP
-    _threads = MIN(thr,omp_get_max_threads()) ;
-    omp_set_num_threads(_threads);
-#else 
-    _threads = 1;
-#endif
-  };
-  static void SetMaxThreads(void) { 
-#ifdef GRID_OMP
-    _threads = omp_get_max_threads();
-    omp_set_num_threads(_threads);
-#else 
-    _threads = 1;
-#endif
-  };
-  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
-  static int GetCores(void)   { return _cores; };
-  static int GetThreads(void) { return _threads; };
-  static int SumArraySize(void) {return _threads;};

-  static void GetWork(int nwork, int me, int & mywork, int & myoff){
-    GetWork(nwork,me,mywork,myoff,_threads);
-  }
-  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
-    int basework = nwork/units;
-    int backfill = units-(nwork%units);
-    if ( me >= units ) { 
-      mywork = myoff = 0;
-    } else { 
-      mywork = (nwork+me)/units;
-      myoff  = basework * me;
-      if ( me > backfill ) 
-	myoff+= (me-backfill);
-    }
-    return;
-  };
-
-  static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
-    me     = ThreadBarrier();
-    GetWork(nwork,me,mywork,myoff);
-  };
-
-  static int  ThreadBarrier(void) {
-#ifdef GRID_OMP
-#pragma omp barrier
-    return omp_get_thread_num();
-#else
-    return 0;
-#endif
-  };
-  
-  template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
-    sum_array[me] = val;
-    val=Zero();
-    ThreadBarrier();
-    for(int i=0;i<_threads;i++) val+= sum_array[i];
-    ThreadBarrier();
-  }
-
-  static void bcopy(const void *src, void *dst, size_t len) {
-#ifdef GRID_OMP
-#pragma omp parallel 
-    {
-      const char *c_src =(char *) src;
-      char *c_dest=(char *) dst;
-      int me,mywork,myoff;
-      GridThread::GetWorkBarrier(len,me, mywork,myoff);
-      bcopy(&c_src[myoff],&c_dest[myoff],mywork);
-    }
-#else 
-    bcopy(src,dst,len);
-#endif
-  }
-
-
-};
-
-NAMESPACE_END(Grid);
+#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
+#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_region                                       DO_PRAGMA(omp parallel)
+#define thread_critical                                     DO_PRAGMA(omp critical)