Memory Vector UVM and Lattice alignedAllocator separate

2025-11-05 06:19:31 +00:00 · 2020-06-22 20:21:53 -04:00
parent 6c5fa8dcd8
commit c48da35921
4 changed files with 350 additions and 30 deletions
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -0,0 +1,241 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_PREC_GCR_NON_HERM_H
+#define GRID_PREC_GCR_NON_HERM_H
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+//VPGCR Abe and Zhang, 2005.
+//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
+//Computing and Information Volume 2, Number 2, Pages 147-161
+//NB. Likely not original reference since they are focussing on a preconditioner variant.
+//    but VPGCR was nicely written up in their paper
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+NAMESPACE_BEGIN(Grid);
+
+#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" " // log-line prefix indented by `level` tabs; needs a variable named `level` in scope wherever it is used
+
+// Flexible (variable-preconditioner) GCR for a non-Hermitian Linop: solves Linop.Op(psi) = src starting from psi = 0,
+// restarting every nstep inner steps and orthogonalising each new direction against at most mmax-1 stored ones.
+template<class Field>
+class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
+public:
+  RealD   Tolerance;      // relative target: converged once |r|^2 < Tolerance^2 * |src|^2
+  Integer MaxIterations;  // maximum number of restart cycles (calls to GCRnStep), not of inner steps
+  int verbose;            // set to 1 by the constructor; not read anywhere in this class
+  int mmax;               // length of the circular p/q/qq history buffers
+  int nstep;              // inner steps per restart cycle
+  int steps;              // inner steps taken so far in the current solve
+  int level;              // nesting level, used only by GCRLogLevel for log indentation
+  GridStopWatch PrecTimer;
+  GridStopWatch MatTimer;
+  GridStopWatch LinalgTimer;
+
+  LinearFunction<Field>     &Preconditioner;
+  LinearOperatorBase<Field> &Linop;
+
+  void Level(int lv) { level=lv; };
+
+  // The initialiser list follows the member declaration order, which is the order C++ uses regardless of how it is written.
+  PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+    Tolerance(tol), 
+    MaxIterations(maxit),
+    mmax(_mmax),
+    nstep(_nstep),
+    Preconditioner(Prec),
+    Linop(_Linop)
+  { 
+    level=1;
+    verbose=1;
+    steps=0;   // GCRnStep is public and increments steps, so give it a defined value from the start
+  };
+
+  // Solve Linop.Op(psi) = src.  psi is overwritten: it is zeroed first, so no initial guess is taken from the caller.
+  void operator() (const Field &src, Field &psi){
+    psi=Zero();
+    RealD cp, ssq,rsq;
+    ssq=norm2(src);
+    rsq=Tolerance*Tolerance*ssq;
+
+    // Zero source: psi = 0 is already the answer.  Without this guard cp<rsq can never hold (0<0) and,
+    // for a linear Linop/Preconditioner, the first inner step would compute a = 0/0.
+    if ( ssq == 0.0 ) return;
+
+    Field r(src.Grid());
+
+    PrecTimer.Reset();
+    MatTimer.Reset();
+    LinalgTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    steps=0;
+    for(int k=0;k<MaxIterations;k++){
+
+      cp=GCRnStep(src,psi,rsq);
+
+      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
+
+      if(cp<rsq) {
+	SolverTimer.Stop();
+	// Recompute the residual from psi so the log can compare it with the recursively updated value cp.
+	Linop.Op(psi,r);
+	axpy(r,-1.0,src,r);
+	RealD tr = norm2(r);
+	GCRLogLevel<<"PGCR: Converged on iteration " <<steps
+		 << " computed residual "<<sqrt(cp/ssq)
+		 << " true residual "    <<sqrt(tr/ssq)
+		 << " target "           <<Tolerance <<std::endl;
+
+	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+	return;
+      }
+
+    }
+    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
+    //    assert(0);
+  }
+
+  // One restart cycle: at most nstep flexible-GCR steps starting from the current psi, which is updated in place.
+  // Returns the recursively updated |r|^2, either after nstep steps or as soon as it drops below rsq.
+  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
+    assert(mmax>0);   // the history buffers are sized by mmax and indexed modulo mmax
+    assert(nstep>0);  // otherwise the step loop is skipped and the trailing assert(0) is reached
+
+    RealD cp;
+    ComplexD a, b;
+    RealD zAAz;
+    ComplexD rq;
+
+    GridBase *grid = src.Grid();
+
+    Field r(grid);
+    Field z(grid);
+    Field Az(grid);
+
+    ////////////////////////////////
+    // history for flexible orthog
+    ////////////////////////////////
+    std::vector<Field> q(mmax,grid);
+    std::vector<Field> p(mmax,grid);
+    std::vector<RealD> qq(mmax);
+      
+    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
+
+    //////////////////////////////////
+    // initial guess x0 = psi may be nonzero (it is on every restart cycle after the first)
+    // r0 = src - A x0
+    //////////////////////////////////
+    MatTimer.Start();
+    Linop.Op(psi,Az);
+    MatTimer.Stop();
+
+    LinalgTimer.Start();
+    r=src-Az;
+    LinalgTimer.Stop();
+    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
+    
+    /////////////////////
+    // p = Prec(r)
+    /////////////////////
+
+    PrecTimer.Start();
+    Preconditioner(r,z);
+    PrecTimer.Stop();
+
+    MatTimer.Start();
+    Linop.Op(z,Az);
+    MatTimer.Stop();
+
+    LinalgTimer.Start();
+
+    zAAz= norm2(Az);
+
+    //p[0],q[0],qq[0] 
+    p[0]= z;
+    q[0]= Az;
+    qq[0]= zAAz;
+    
+    cp =norm2(r);
+    LinalgTimer.Stop();
+
+    for(int k=0;k<nstep;k++){
+
+      steps++;
+
+      int kp     = k+1;
+      int peri_k = k %mmax;
+      int peri_kp= kp%mmax;
+
+      LinalgTimer.Start();
+      rq= innerProduct(q[peri_k],r); // complex in general, so the step length a is kept as ComplexD
+      a = rq/qq[peri_k];
+
+      axpy(psi,a,p[peri_k],psi);         
+
+      cp = axpy_norm(r,-a,q[peri_k],r);
+      LinalgTimer.Stop();
+
+      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
+
+      if((k==nstep-1)||(cp<rsq)){
+	return cp;
+      }
+
+      PrecTimer.Start();
+      Preconditioner(r,z);// solve Az = r
+      PrecTimer.Stop();
+
+      MatTimer.Start();
+      Linop.Op(z,Az);
+      MatTimer.Stop();
+
+      LinalgTimer.Start();
+
+      q[peri_kp]=Az;
+      p[peri_kp]=z;
+
+      int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
+      for(int back=0;back<northog;back++){
+	int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
+
+	// NOTE(review): only the real part of <q_back,Az> is projected out; confirm this is intended for complex fields.
+	b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
+	p[peri_kp]=p[peri_kp]+b*p[peri_back];
+	q[peri_kp]=q[peri_kp]+b*q[peri_back];
+      }
+      qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
+      LinalgTimer.Stop();
+    }
+    assert(0); // never reached
+    return cp;
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -52,41 +52,79 @@ public:
  pointer allocate(size_type __n, const void* _p= 0)
  { 
    size_type bytes = __n*sizeof(_Tp);
-
    profilerAllocate(bytes);
-
    _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
-    
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
-
    return ptr;
  }

  void deallocate(pointer __p, size_type __n) 
  { 
    size_type bytes = __n * sizeof(_Tp);
-
    profilerFree(bytes);
-
    MemoryManager::CpuFree((void *)__p,bytes);
  }

+  // FIXME: hack for the copy constructor, eventually it must be avoided
+  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
+  void construct(pointer __p, const _Tp& __val) { assert(0);}; // copy-construction is trapped: aborts when asserts are enabled, does nothing under NDEBUG
+  void construct(pointer __p) { };  // no-op: the storage at __p is left as allocated
+  void destroy(pointer __p) { };    // no-op: no destructor is run on *__p
+};
+template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }  // any two alignedAllocator<_Tp> compare equal
+template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } // ... and therefore never compare unequal
+
+template<typename _Tp>
+class uvmAllocator { // allocator with the classic std::allocator-style interface; storage comes from MemoryManager::SharedAllocate/SharedFree
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef uvmAllocator<_Tp1> other; };
+  uvmAllocator() throw() { }
+  uvmAllocator(const uvmAllocator&) throw() { }
+  template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
+  ~uvmAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+  // Obtain room for __n objects of _Tp from MemoryManager::SharedAllocate; the hint _p is not used.
+  pointer allocate(size_type __n, const void* _p= 0)
+  { 
+    size_type bytes = __n*sizeof(_Tp);
+    profilerAllocate(bytes);
+    _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); // a null result is only caught when asserts are enabled
+    return ptr;
+  }
+  // Hand the block back through MemoryManager::SharedFree; the byte count is recomputed from __n.
+  void deallocate(pointer __p, size_type __n) 
+  { 
+    size_type bytes = __n * sizeof(_Tp);
+    profilerFree(bytes);
+    MemoryManager::SharedFree((void *)__p,bytes);
+  }
+
  // FIXME: hack for the copy constructor, eventually it must be avoided
  void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
  //void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
-template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }  // any two uvmAllocator<_Tp> compare equal
+template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } // ... and therefore never compare unequal

 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using commAllocator = alignedAllocator<T>;
-template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
-template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
+template<class T> using commAllocator = uvmAllocator<T>;  // commAllocator now names uvmAllocator rather than alignedAllocator
+template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           // std::vector allocating through uvmAllocator
+template<class T> using commVector = std::vector<T,uvmAllocator<T> >;  // same type as Vector<T>
+//template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;  (alias disabled here)

 NAMESPACE_END(Grid);

--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -7,6 +7,17 @@ NAMESPACE_BEGIN(Grid);
 #define CpuSmall (1)
 #define Acc      (2)
 #define AccSmall (3)
+#define Shared   (4)     // allocation-type index used with Lookup/Insert and Ncache[] for shared buffers
+#define SharedSmall (5)  // companion index, given its own Ncache[] entry in Init
+uint64_t total_shared;   // running byte counter updated by SharedAllocate / SharedFree
+uint64_t total_device;   // running byte counter updated by AcceleratorAllocate / AcceleratorFree
+uint64_t total_host;;    // running byte counter updated by CpuAllocate / CpuFree (the second ';' is an empty declaration)
+void MemoryManager::PrintBytes(void) // writes the three running byte counters (shared, accelerator, cpu) to std::cout
+{
+  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
+  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
+  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+}

 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
@@ -21,39 +32,63 @@ int MemoryManager::Ncache[MemoryManager::NallocType];
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
  void *ptr = (void *) Lookup(bytes,Acc);
-
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
-    //    std::cout <<"AcceleratorAllocate: allocated Accelerator pointer "<<std::hex<<ptr<<std::endl;
+    total_device+=bytes; // counted only when Lookup returned NULL and fresh device memory was obtained
+    //    std::cout <<"AcceleratorAllocate: allocated Accelerator pointer "<<std::hex<<ptr<<std::dec<<std::endl;
+    //    PrintBytes();
  }
-
  return ptr;
 }
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
  void *__freeme = Insert(ptr,bytes,Acc);
-
-  if ( __freeme ) acceleratorFreeDevice(__freeme);
+  if ( __freeme ) { // Insert handed back a pointer that must really be released
+    acceleratorFreeDevice(__freeme);
+    total_device-=bytes; // NOTE(review): uses the caller's byte count, not the size of __freeme; confirm the two always agree
+    //    PrintBytes();
+  }
+}
+void *MemoryManager::SharedAllocate(size_t bytes) // returns the pointer from Lookup(bytes,Shared) if non-NULL, else a fresh acceleratorAllocShared(bytes) buffer
+{
+  void *ptr = (void *) Lookup(bytes,Shared);
+  if ( ptr == (void *) NULL ) {
+    ptr = (void *) acceleratorAllocShared(bytes);
+    total_shared+=bytes; // only fresh allocations are counted, not pointers handed back by Lookup
+    //    std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
+    //    PrintBytes();
+  }
+  return ptr;
+}
+void  MemoryManager::SharedFree    (void *ptr,size_t bytes) // passes ptr to Insert(...,Shared) and frees whatever pointer Insert hands back, if any
+{
+  void *__freeme = Insert(ptr,bytes,Shared);
+  if ( __freeme ) {
+    acceleratorFreeShared(__freeme);
+    total_shared-=bytes; // NOTE(review): uses the caller's byte count, not the size of __freeme; confirm the two always agree
+    //    PrintBytes();
+  }
 }
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  void *ptr = (void *) Lookup(bytes,Cpu);
-
  if ( ptr == (void *) NULL ) {
-    ptr = (void *) acceleratorAllocShared(bytes);
-    //    std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::endl;
+    ptr = (void *) acceleratorAllocCpu(bytes); // Cpu allocations now call acceleratorAllocCpu instead of acceleratorAllocShared
+    total_host+=bytes; // only fresh allocations are counted, not pointers handed back by Lookup
+    //    std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
+    //    PrintBytes();
  }
-
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  NotifyDeletion(_ptr);
-
-  // If present remove entry and free accelerator too.
-  // Can we ever hit a free event with a view still in scope?
  void *__freeme = Insert(_ptr,bytes,Cpu);
-  if ( __freeme ) acceleratorFreeShared(__freeme);
+  if ( __freeme ) { 
+    acceleratorFreeCpu(__freeme); // release call paired with the acceleratorAllocCpu used in CpuAllocate
+    total_host-=bytes; // NOTE(review): uses the caller's byte count, not the size of __freeme; confirm the two always agree
+    //    PrintBytes();
+  }
 }
 //////////////////////////////////////////
 // call only once
@@ -62,8 +97,10 @@ void MemoryManager::Init(void)
 {
  Ncache[Cpu] = 8;
  Ncache[Acc] = 8;
+  Ncache[Shared] = 8;
  Ncache[CpuSmall] = 32;
  Ncache[AccSmall] = 32;
+  Ncache[SharedSmall] = 32;

  char * str;
  int Nc;
@@ -75,6 +112,7 @@ void MemoryManager::Init(void)
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[Cpu]=Nc;
      Ncache[Acc]=Nc;
+      Ncache[Shared]=Nc;
    }
  }

@@ -84,6 +122,7 @@ void MemoryManager::Init(void)
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[CpuSmall]=Nc;
      Ncache[AccSmall]=Nc;
+      Ncache[SharedSmall]=Nc;
    }
  }
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -44,14 +44,14 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////
 enum ViewAdvise {
 AdviseDefault       = 0x0,    // Regular data
- AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
+ AdviseInfrequentUse = 0x1     // Advise that the data is used infrequently.  This can
                               // significantly influence performance of bulk storage.
 
- AdviseTransient      = 0x2,   // Data will mostly be read.  On some architectures
+ // AdviseTransient      = 0x2,   // (disabled) Data will mostly be read.  On some architectures
                               // enables read-only copies of memory to be kept on
                               // host and device.

- AdviseAcceleratorWriteDiscard = 0x4  // Field will be written in entirety on device
+ // AdviseAcceleratorWriteDiscard = 0x4  // (disabled) Field will be written in entirety on device

 };

@@ -80,7 +80,7 @@ private:
  } AllocationCacheEntry;

  static const int NallocCacheMax=128; 
-  static const int NallocType=4;
+  static const int NallocType=6;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
@@ -95,9 +95,11 @@ private:

  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
-
+  static void PrintBytes(void);
 public:
  static void Init(void);
+  static void *SharedAllocate(size_t bytes);
+  static void  SharedFree    (void *ptr,size_t bytes);
  static void *CpuAllocate(size_t bytes);
  static void  CpuFree    (void *ptr,size_t bytes);