Merge branch 'develop' of github.com:paboyle/Grid into develop

2025-09-19 09:41:05 +01:00 · 2021-10-29 13:01:34 +01:00
parent b27b12828e fe9edf8526
commit a65a497bae
64 changed files with 1580 additions and 302 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
      
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){

 	SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;

    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];

 	SE=Stencil_v.GetEntry(ptype,point,ss);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
+  virtual ~LinearOperatorBase(){};
 };


@@ -507,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
      
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -598,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
      
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+template<class Field> using Preconditioner =  LinearFunction<Field> ;
+
+/*
+template<class Field> class Preconditioner :  public LinearFunction<Field> {
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
+*/

 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
+  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
+  virtual ~SparseMatrixBase() {};
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
-  public:                                                
+  public:
+    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
-  public:                                                
+  public:
+    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -33,16 +33,19 @@ namespace Grid {
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 };
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) {  };
 };
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) { guess = src; };
 };

@@ -57,6 +60,7 @@ private:
  const unsigned int       N;

 public:
+  using LinearFunction<Field>::operator();

  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -87,6 +91,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  
+  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -67,6 +67,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -97,6 +98,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
+    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;

@@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
    
@@ -170,7 +171,7 @@ public:

    LinalgTimer.Start();

-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);

    //p[0],q[0],qq[0] 
@@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);

      LinalgTimer.Start();
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -159,7 +159,6 @@ void MemoryManager::Init(void)

  char * str;
  int Nc;
-  int NcS;
  
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -389,7 +389,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
-    acceleratorCopySynchronise(); // MPI prob slower
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -405,6 +404,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
  if (nreq==0) return;

  std::vector<MPI_Status> status(nreq);
+  acceleratorCopySynchronise(); 
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
-  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  
  if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
+  if ( threads*sizeofsobj > sharedMemPerBlock ) {
+    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    exit(EXIT_FAILURE);
+  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }

+template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
+{ // Copy into `half` every site of `full` whose coordinate parity equals cb; checker_dim_half names the dimension whose coordinate is halved when indexing `half`
+  half.Checkerboard() = cb; // label the destination field with the parity being extracted
+  autoView(half_v, half, AcceleratorWrite);
+  autoView(full_v, full, AcceleratorRead);
+  Coordinate rdim_full             = full.Grid()->_rdimensions;
+  Coordinate rdim_half             = half.Grid()->_rdimensions;
+  unsigned long ndim_half          = half.Grid()->_ndimension;
+  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
+  Coordinate ostride_half          = half.Grid()->_ostride;
+  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ // ss runs over full.Grid()->oSites()
+    
+    Coordinate coor;
+    int cbos;
+    int linear=0;
+
+    Lexicographic::CoorFromIndex(coor,ss,rdim_full); // decode ss into a coordinate using the full grid's _rdimensions
+    assert(coor.size()==ndim_half);
+
+    for(int d=0;d<ndim_half;d++){ // sum the coordinates of the dimensions flagged in the half grid's checker mask
+      if(checker_dim_mask_half[d]) linear += coor[d];
+    }
+    cbos = (linear&0x1); // parity (0 or 1) of that sum
+
+    if (cbos==cb) { // only sites of the requested parity are copied
+      int ssh=0; // destination index in half_v, accumulated from the half grid's _ostride
+      for(int d=0;d<ndim_half;d++) {
+        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]); // halve the coordinate in checker_dim_half before reducing modulo rdim_half[d]
+        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
+      }
+      coalescedWrite(half_v[ssh],full_v(ss));
+    }
+  });
+}
+template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
+{ // Write the sites of `half` into the sites of `full` whose coordinate parity equals half.Checkerboard(); other sites of `full` are not written
+  int cb = half.Checkerboard(); // parity is taken from the source field rather than passed in
+  autoView(half_v , half, AcceleratorRead);
+  autoView(full_v , full, AcceleratorWrite);
+  Coordinate rdim_full             = full.Grid()->_rdimensions;
+  Coordinate rdim_half             = half.Grid()->_rdimensions;
+  unsigned long ndim_half          = half.Grid()->_ndimension;
+  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
+  Coordinate ostride_half          = half.Grid()->_ostride;
+  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ // ss runs over full.Grid()->oSites()
+
+    Coordinate coor;
+    int cbos;
+    int linear=0;
+  
+    Lexicographic::CoorFromIndex(coor,ss,rdim_full); // decode ss into a coordinate using the full grid's _rdimensions
+    assert(coor.size()==ndim_half);
+
+    for(int d=0;d<ndim_half;d++){ // sum the coordinates of the dimensions flagged in the half grid's checker mask
+      if(checker_dim_mask_half[d]) linear += coor[d];
+    }
+    cbos = (linear&0x1); // parity (0 or 1) of that sum
+
+    if (cbos==cb) { // only sites of the matching parity are overwritten
+      int ssh=0; // source index in half_v, accumulated from the half grid's _ostride
+      for(int d=0;d<ndim_half;d++){
+        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]); // halve the coordinate in checker_dim_half before reducing modulo rdim_half[d]
+        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
+      }
+      coalescedWrite(full_v[ss],half_v(ssh));
+    }
+
+  });
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
+  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
+    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
+      unsigned int t0 = 0;
+      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
+    } else {
+      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
+    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated

    InsertSlice(L_Q, q_out, s , 0);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER

 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_32=ref()(3)(2);}

 #define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);	\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);	\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
+  permute##dir(Chi_00,Chi_00);			\
+  permute##dir(Chi_01,Chi_01);			\
+  permute##dir(Chi_02,Chi_02);			\
+  permute##dir(Chi_10,Chi_10);			\
+  permute##dir(Chi_11,Chi_11);			\
+  permute##dir(Chi_12,Chi_12);

 #endif

@@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;

 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU(PERM);				\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI;					\
-  }						\
-  acceleratorSynchronise();			\
-  MULT_2SPIN(DIR);				\
-  RECON;					
+  {int ptype;					\
+   SE=st.GetEntry(ptype,DIR,ss);		\
+   auto offset = SE->_offset;			\
+   auto local  = SE->_is_local;			\
+   auto perm   = SE->_permute;			\
+   if ( local ) {				\
+     LOAD_CHIMU(PERM);				\
+     PROJ;					\
+     if ( perm) {				\
+       PERMUTE_DIR(PERM);			\
+     }						\
+   } else {					\
+     LOAD_CHI;					\
+   }						\
+   acceleratorSynchronise();			\
+   MULT_2SPIN(DIR);				\
+   RECON;					}

-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
-  SE=&st_p[DIR+8*ss];				\
-  ptype=st_perm[DIR];				\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU(PERM);				\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI;					\
-  }						\
-  acceleratorSynchronise();			\
-  MULT_2SPIN(DIR);				\
-  RECON;					
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
+  { SE=&st_p[DIR+8*ss];						\
+  auto ptype=st_perm[DIR];					\
+  auto offset = SE->_offset;					\
+  auto local  = SE->_is_local;					\
+  auto perm   = SE->_permute;					\
+  if ( local ) {						\
+    LOAD_CHIMU(PERM);						\
+    PROJ;							\
+    if ( perm) {						\
+      PERMUTE_DIR(PERM);					\
+    }								\
+  } else {							\
+    LOAD_CHI;							\
+  }								\
+  acceleratorSynchronise();					\
+  MULT_2SPIN(DIR);						\
+  RECON;					}

 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
- /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
-  perm   = SE->_permute;			\
-  LOAD_CHIMU(PERM);				\
-  PROJ;						\
-  MULT_2SPIN(DIR);				\
-  RECON;					
+  { SE=&st_p[DIR+8*ss];							\
+    auto ptype=st_perm[DIR];						\
+    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+    auto offset = SE->_offset;						\
+    auto perm   = SE->_permute;						\
+    LOAD_CHIMU(PERM);							\
+    PROJ;								\
+    MULT_2SPIN(DIR);							\
+    RECON;					}

 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU(PERM);				\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI;					\
-  }						\
-  acceleratorSynchronise();			\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-  }						\
-  acceleratorSynchronise();			
+  { int ptype;						\
+  SE=st.GetEntry(ptype,DIR,ss);				\
+  auto offset = SE->_offset;					\
+  auto local  = SE->_is_local;					\
+  auto perm   = SE->_permute;					\
+  if ( local ) {						\
+    LOAD_CHIMU(PERM);						\
+    PROJ;							\
+    if ( perm) {						\
+      PERMUTE_DIR(PERM);					\
+    }								\
+  } else if ( st.same_node[DIR] ) {				\
+    LOAD_CHI;							\
+  }								\
+  acceleratorSynchronise();					\
+  if (local || st.same_node[DIR] ) {				\
+    MULT_2SPIN(DIR);						\
+    RECON;							\
+  }								\
+  acceleratorSynchronise();			}

 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI;					\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-    nmu++;					\
-  }						\
-  acceleratorSynchronise();			
+  { int ptype;						\
+  SE=st.GetEntry(ptype,DIR,ss);				\
+  auto offset = SE->_offset;				\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
+    LOAD_CHI;						\
+    MULT_2SPIN(DIR);					\
+    RECON;						\
+    nmu++;						\
+  }							\
+  acceleratorSynchronise();			}

-#define HAND_RESULT(ss)				\
-  {						\
-    SiteSpinor & ref (out[ss]);			\
+#define HAND_RESULT(ss)					\
+  {							\
+    SiteSpinor & ref (out[ss]);				\
    coalescedWrite(ref()(0)(0),result_00,lane);		\
    coalescedWrite(ref()(0)(1),result_01,lane);		\
    coalescedWrite(ref()(0)(2),result_02,lane);		\
@@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,

  HAND_DECLARATIONS(Simt);

-  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site

  HAND_DECLARATIONS(Simt);

-  int offset,local,perm, ptype;
  StencilEntry *SE;
-
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si

  HAND_DECLARATIONS(Simt);

-  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si

  HAND_DECLARATIONS(Simt);

-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -78,6 +78,8 @@ public:
  typedef Lattice<SiteLink>    LinkField; 
  typedef Lattice<SiteField>   Field;

+  typedef SU<Nrepresentation> Group;
+
  // Guido: we can probably separate the types from the HMC functions
  // this will create 2 kind of implementations
  // probably confusing the users
@@ -118,7 +120,7 @@ public:
    LinkField Pmu(P.Grid());
    Pmu = Zero();
    for (int mu = 0; mu < Nd; mu++) {
-      SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
      Pmu = Pmu*scale;
      PokeIndex<LorentzIndex>(P, Pmu, mu);
@@ -159,15 +161,15 @@ public:
  }

  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::HotConfiguration(pRNG, U);
+    Group::HotConfiguration(pRNG, U);
  }

  static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::TepidConfiguration(pRNG, U);
+    Group::TepidConfiguration(pRNG, U);
  }

  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::ColdConfiguration(pRNG, U);
+    Group::ColdConfiguration(pRNG, U);
  }
 };

--- a/Grid/qcd/hmc/UsingHMC.md
+++ b/Grid/qcd/hmc/UsingHMC.md
@@ -1,61 +1,63 @@
-Using HMC in Grid version 0.5.1
+# Using HMC in Grid

-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
+Disclaimer: Grid is still under active development so any information here can be changed in future releases.


-Command line options
-===================
-(relevant file GenericHMCrunner.h)
+## Command line options
+
+(relevant file `GenericHMCrunner.h`)
 The initial configuration can be changed at the command line using 
--StartType <your choice>
-valid choices, one among these
-HotStart, ColdStart, TepidStart, CheckpointStart
-default: HotStart
+`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
+`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
+Default: `--StartingType HotStart`

-example
-./My_hmc_exec  --StartType HotStart
+Example:
+```
+./My_hmc_exec  --StartingType HotStart
+```

-The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
-default: 0
+The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
+Default: `--StartingTrajectory 0`

 The number of trajectories for a specific run are specified at command line by
--Trajectories <integer>
-default: 1
+`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
+Default: `--Trajectories 1`

 The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
-default: 10
-
+`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
+Default: `--Thermalizations 10`

 Any other parameter is defined in the source for the executable.

-HMC controls
-===========
+## HMC controls

 The lines 

+```
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
  std::vector<int> ParSeed({6, 7, 8, 9, 10});
+```

 define the seeds for the serial and the parallel RNG.

 The line 

+```
  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
+```

 declares the number of molecular dynamics steps and the total trajectory length.


-Actions
-======
+## Actions

-Action names are defined in the file
-lib/qcd/Actions.h
+Action names are defined in the directory `Grid/qcd/action`.

-Gauge actions list:
+Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):

+```
 WilsonGaugeActionR;
 WilsonGaugeActionF;
 WilsonGaugeActionD;
@@ -68,8 +70,9 @@ IwasakiGaugeActionD;
 SymanzikGaugeActionR;
 SymanzikGaugeActionF;
 SymanzikGaugeActionD;
+```

-
+```
 ConjugateWilsonGaugeActionR;
 ConjugateWilsonGaugeActionF;
 ConjugateWilsonGaugeActionD;
@@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
 ConjugateSymanzikGaugeActionR;
 ConjugateSymanzikGaugeActionF;
 ConjugateSymanzikGaugeActionD;
+```

+Each of these actions accepts a single parameter at creation time (beta).
+Example for creating a Symanzik action with beta=4.0:

+```
+  SymanzikGaugeActionR(4.0)
+```
+
+Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
+
+```
 ScalarActionR;
 ScalarActionF;
 ScalarActionD;
+```

-
-each of these action accept one single parameter at creation time (beta).
-Example for creating a Symanzik action with beta=4.0
-
-	SymanzikGaugeActionR(4.0)
-
-The suffixes R,F,D in the action names refer to the Real
-(the precision is defined at compile time by the --enable-precision flag in the configure),
-Float and Double, that force the precision of the action to be 32, 64 bit respectively.
-
-
-
-
-
-
-
-
+The suffixes `R`, `F`, `D` in the action names refer to `Real`
+(whose precision is defined at compile time by the `--enable-precision` flag of `configure`),
+`Float` and `Double`; the latter two force the precision of the action to be 32 and 64 bit respectively.
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -322,8 +322,8 @@ public:
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;

-    int recv_from_rank;
-    int xmit_to_rank;
+    //    int recv_from_rank;
+    //    int xmit_to_rank;

    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
  class TypePair {
  public:
    T _internal[2];
-    TypePair<T>& operator=(const Grid::Zero& o) {
+    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
      _internal[0] = Zero();
      _internal[1] = Zero();
      return *this;
    }

-    TypePair<T> operator+(const TypePair<T>& o) const {
+    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
      TypePair<T> r;
      r._internal[0] = _internal[0] + o._internal[0];
      r._internal[1] = _internal[1] + o._internal[1];
      return r;
    }

-    TypePair<T>& operator+=(const TypePair<T>& o) {
+    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
      _internal[0] += o._internal[0];
      _internal[1] += o._internal[1];
      return *this;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -95,7 +95,7 @@ void acceleratorInit(void)
 #endif

  cudaSetDevice(device);
-
+  cudaStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////

 #ifdef GRID_CUDA
+
 #include <cuda.h>

 #ifdef __CUDA_ARCH__
@@ -133,11 +134,7 @@ inline void cuda_mem(void)
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    std::cout << "========================== CUDA KERNEL CALL\n";	\
-    cuda_mem();								\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
-    cuda_mem();								\
-    std::cout << "========================== CUDA KERNEL DONE\n";	\
  }

 #define accelerator_for6dNB(iter1, num1,				\
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -88,7 +88,7 @@ public:
 // Coordinate class, maxdims = 8 for now.
 ////////////////////////////////////////////////////////////////
 #define GRID_MAX_LATTICE_DIMENSION (8)
-#define GRID_MAX_SIMD              (16)
+#define GRID_MAX_SIMD              (32)

 static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;