Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2025-11-23 07:59:32 +00:00 · 2021-12-07 09:06:42 -08:00
parent 2bf3b4d576 f34d34bd17
commit 7f7d06d963
52 changed files with 850 additions and 88 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
      
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){

 	SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;

    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];

 	SE=Stencil_v.GetEntry(ptype,point,ss);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -508,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@@ -586,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
      
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -599,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
      
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+template<class Field> using Preconditioner =  LinearFunction<Field> ;
+
+/*
+template<class Field> class Preconditioner :  public LinearFunction<Field> {
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
+*/

 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
+  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
-  public:                                                
+  public:
+    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -67,6 +67,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -97,6 +98,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
+    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;

@@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
    
@@ -170,7 +171,7 @@ public:

    LinalgTimer.Start();

-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);

    //p[0],q[0],qq[0] 
@@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);

      LinalgTimer.Start();
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -170,6 +170,7 @@ private:

 public:
  static void Print(void);
+  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -474,6 +474,32 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
  }
 }

+void MemoryManager::PrintState(void* _CpuPtr)
+{
+  uint64_t CpuPtr = (uint64_t)_CpuPtr;
+
+  if ( EntryPresent(CpuPtr) ){
+    auto AccCacheIterator = EntryLookup(CpuPtr);
+    auto & AccCache = AccCacheIterator->second;
+    std::string str;
+    if ( AccCache.state==Empty    ) str = std::string("Empty");
+    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
+    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
+    if ( AccCache.state==Consistent)str = std::string("Consistent");
+    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
+
+    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
+    << "\t" << AccCache.cpuLock
+    << "\t" << AccCache.accLock
+    << "\t" << AccCache.LRU_valid<<std::endl;
+
+  } else {
+    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
+  }
+}
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -16,6 +16,10 @@ uint64_t  MemoryManager::DeviceToHostXfer;
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
+void  MemoryManager::PrintState(void* CpuPtr)
+{
+std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
+};
 void  MemoryManager::Print(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};

--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -88,6 +88,13 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
+
+  // Helper function to print the state of this object in the AccCache
+  void PrintCacheState(void)
+  {
+    MemoryManager::PrintState(this->_odata);
+  }
+
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -576,7 +576,8 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
-  skipPastObjectRecord(std::string(GRID_FIELD_NORM));
+  // in principle should do the line below, but that breaks backard compatibility with old data
+  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
+  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
+    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
+      unsigned int t0 = 0;
+      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
+    } else {
+      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
+    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated

    InsertSlice(L_Q, q_out, s , 0);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
  class TypePair {
  public:
    T _internal[2];
-    TypePair<T>& operator=(const Grid::Zero& o) {
+    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
      _internal[0] = Zero();
      _internal[1] = Zero();
      return *this;
    }

-    TypePair<T> operator+(const TypePair<T>& o) const {
+    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
      TypePair<T> r;
      r._internal[0] = _internal[0] + o._internal[0];
      r._internal[1] = _internal[1] + o._internal[1];
      return r;
    }

-    TypePair<T>& operator+=(const TypePair<T>& o) {
+    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
      _internal[0] += o._internal[0];
      _internal[1] += o._internal[1];
      return *this;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -84,7 +84,8 @@ void acceleratorInit(void)
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
-    printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
+    printf("AcceleratorCudaInit: assume user either uses\n");
+    printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
@@ -109,6 +110,7 @@ void acceleratorInit(void)

 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
+hipStream_t copyStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@@ -166,16 +168,25 @@ void acceleratorInit(void)
 #ifdef GRID_DEFAULT_GPU
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: using default device \n");
-    printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
-    printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+    printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
  }
+  int device = 0;
 #else
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
-    printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
  }
-  hipSetDevice(rank);
+  int device = rank;
 #endif
+  hipSetDevice(device);
+  hipStreamCreate(&copyStream);
+  const int len=64;
+  char busid[len];
+  if( rank == world_rank ) { 
+    hipDeviceGetPCIBusId(busid, len, device);
+    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
+  }
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 }
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -230,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  //  int uvm=0;
@@ -337,6 +338,7 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline

+extern hipStream_t copyStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@@ -411,10 +413,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-inline void acceleratorCopySynchronise(void) {  }
+//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+//inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}

+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+{
+  hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
+}
+inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
+
 #endif

 //////////////////////////////////////////////
@@ -485,18 +493,12 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
 ///////////////////////////////////////////////////
-accelerator_inline void acceleratorSynchronise(void) 
+accelerator_inline void acceleratorSynchronise(void)  // Only Nvidia needs 
 {
 #ifdef GRID_SIMT
 #ifdef GRID_CUDA
  __syncwarp();
 #endif
-#ifdef GRID_SYCL
-  //cl::sycl::detail::workGroupBarrier();
-#endif
-#ifdef GRID_HIP
-  __syncthreads();
-#endif
 #endif
  return;
 }