mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts: systems/PVC/benchmarks/run-2tile-mpi.sh systems/PVC/config-command
This commit is contained in:
		@@ -45,7 +45,7 @@ directory
 | 
			
		||||
 //disables nvcc specific warning in json.hpp
 | 
			
		||||
#pragma clang diagnostic ignored "-Wdeprecated-register"
 | 
			
		||||
 | 
			
		||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 | 
			
		||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 | 
			
		||||
 //disables nvcc specific warning in json.hpp
 | 
			
		||||
#pragma nv_diag_suppress unsigned_compare_with_zero
 | 
			
		||||
#pragma nv_diag_suppress cast_to_qualified_type
 | 
			
		||||
 
 | 
			
		||||
@@ -14,7 +14,7 @@
 | 
			
		||||
/* NVCC save and restore compile environment*/
 | 
			
		||||
#ifdef __NVCC__
 | 
			
		||||
#pragma push
 | 
			
		||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 | 
			
		||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 | 
			
		||||
#pragma nv_diag_suppress code_is_unreachable
 | 
			
		||||
#else
 | 
			
		||||
#pragma diag_suppress code_is_unreachable
 | 
			
		||||
 
 | 
			
		||||
@@ -109,6 +109,9 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
    
 | 
			
		||||
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
 | 
			
		||||
 | 
			
		||||
    precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
 | 
			
		||||
    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
 | 
			
		||||
    
 | 
			
		||||
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
 | 
			
		||||
      //Compute double precision rsd and also new RHS vector.
 | 
			
		||||
      Linop_d.HermOp(sol_d, tmp_d);
 | 
			
		||||
@@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 | 
			
		||||
 | 
			
		||||
      PrecChangeTimer.Start();
 | 
			
		||||
      precisionChange(src_f, src_d);
 | 
			
		||||
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
 | 
			
		||||
      PrecChangeTimer.Stop();
 | 
			
		||||
      
 | 
			
		||||
      sol_f = Zero();
 | 
			
		||||
@@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
      
 | 
			
		||||
      //Convert sol back to double and add to double prec solution
 | 
			
		||||
      PrecChangeTimer.Start();
 | 
			
		||||
      precisionChange(tmp_d, sol_f);
 | 
			
		||||
      precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
 | 
			
		||||
      PrecChangeTimer.Stop();
 | 
			
		||||
      
 | 
			
		||||
      axpy(sol_d, 1.0, tmp_d, sol_d);
 | 
			
		||||
 
 | 
			
		||||
@@ -131,6 +131,9 @@ public:
 | 
			
		||||
    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
 | 
			
		||||
    GridBase *DoublePrecGrid = src_d.Grid();
 | 
			
		||||
 | 
			
		||||
    precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
 | 
			
		||||
    precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
 | 
			
		||||
    
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Convenience references to the info stored in "MultiShiftFunction"
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -201,10 +204,10 @@ public:
 | 
			
		||||
    r_d = p_d;
 | 
			
		||||
    
 | 
			
		||||
    //MdagM+m[0]
 | 
			
		||||
    precisionChangeFast(p_f,p_d);
 | 
			
		||||
    precisionChange(p_f, p_d, pc_wk_d_to_s);
 | 
			
		||||
 | 
			
		||||
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
 | 
			
		||||
    precisionChangeFast(tmp_d,mmp_f);
 | 
			
		||||
    precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
 | 
			
		||||
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
 | 
			
		||||
    tmp_d = tmp_d - mmp_d;
 | 
			
		||||
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
 | 
			
		||||
@@ -264,7 +267,7 @@ public:
 | 
			
		||||
      AXPYTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      PrecChangeTimer.Start();
 | 
			
		||||
      precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
 | 
			
		||||
      precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
 | 
			
		||||
      PrecChangeTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      cp=c;
 | 
			
		||||
@@ -273,7 +276,7 @@ public:
 | 
			
		||||
      MatrixTimer.Stop();  
 | 
			
		||||
 | 
			
		||||
      PrecChangeTimer.Start();
 | 
			
		||||
      precisionChangeFast(mmp_d, mmp_f); // From Float to Double
 | 
			
		||||
      precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
 | 
			
		||||
      PrecChangeTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      AXPYTimer.Start();
 | 
			
		||||
 
 | 
			
		||||
@@ -48,7 +48,7 @@ public:
 | 
			
		||||
  LinearOperatorBase<FieldF> &Linop_f;
 | 
			
		||||
  LinearOperatorBase<FieldD> &Linop_d;
 | 
			
		||||
  GridBase* SinglePrecGrid;
 | 
			
		||||
  RealD Delta; //reliable update parameter
 | 
			
		||||
  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
 | 
			
		||||
 | 
			
		||||
  //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
 | 
			
		||||
  LinearOperatorBase<FieldF> *Linop_fallback;
 | 
			
		||||
@@ -65,7 +65,9 @@ public:
 | 
			
		||||
      ErrorOnNoConverge(err_on_no_conv),
 | 
			
		||||
      DoFinalCleanup(true),
 | 
			
		||||
      Linop_fallback(NULL)
 | 
			
		||||
  {};
 | 
			
		||||
  {
 | 
			
		||||
    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
 | 
			
		||||
    Linop_fallback = &_Linop_fallback;
 | 
			
		||||
@@ -116,9 +118,12 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //Single prec initialization
 | 
			
		||||
    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
 | 
			
		||||
    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
 | 
			
		||||
    
 | 
			
		||||
    FieldF r_f(SinglePrecGrid);
 | 
			
		||||
    r_f.Checkerboard() = r.Checkerboard();
 | 
			
		||||
    precisionChange(r_f, r);
 | 
			
		||||
    precisionChange(r_f, r, pc_wk_dp_to_sp);
 | 
			
		||||
 | 
			
		||||
    FieldF psi_f(r_f);
 | 
			
		||||
    psi_f = Zero();
 | 
			
		||||
@@ -134,6 +139,7 @@ public:
 | 
			
		||||
    GridStopWatch LinalgTimer;
 | 
			
		||||
    GridStopWatch MatrixTimer;
 | 
			
		||||
    GridStopWatch SolverTimer;
 | 
			
		||||
    GridStopWatch PrecChangeTimer;
 | 
			
		||||
    
 | 
			
		||||
    SolverTimer.Start();
 | 
			
		||||
    int k = 0;
 | 
			
		||||
@@ -173,7 +179,9 @@ public:
 | 
			
		||||
      // Stopping condition
 | 
			
		||||
      if (cp <= rsq) {
 | 
			
		||||
	//Although not written in the paper, I assume that I have to add on the final solution
 | 
			
		||||
	precisionChange(mmp, psi_f);
 | 
			
		||||
	PrecChangeTimer.Start();
 | 
			
		||||
	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
 | 
			
		||||
	PrecChangeTimer.Stop();
 | 
			
		||||
	psi = psi + mmp;
 | 
			
		||||
	
 | 
			
		||||
	
 | 
			
		||||
@@ -194,6 +202,9 @@ public:
 | 
			
		||||
	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
 | 
			
		||||
 | 
			
		||||
	
 | 
			
		||||
	IterationsToComplete = k;	
 | 
			
		||||
	ReliableUpdatesPerformed = l;
 | 
			
		||||
@@ -214,14 +225,21 @@ public:
 | 
			
		||||
      else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 | 
			
		||||
	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 | 
			
		||||
		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
 | 
			
		||||
	precisionChange(mmp, psi_f);
 | 
			
		||||
	PrecChangeTimer.Start();
 | 
			
		||||
	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
 | 
			
		||||
	PrecChangeTimer.Stop();
 | 
			
		||||
	psi = psi + mmp;
 | 
			
		||||
 | 
			
		||||
	MatrixTimer.Start();
 | 
			
		||||
	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 | 
			
		||||
	MatrixTimer.Stop();
 | 
			
		||||
	
 | 
			
		||||
	r = src - mmp;
 | 
			
		||||
 | 
			
		||||
	psi_f = Zero();
 | 
			
		||||
	precisionChange(r_f, r);
 | 
			
		||||
	PrecChangeTimer.Start();
 | 
			
		||||
	precisionChange(r_f, r, pc_wk_dp_to_sp);
 | 
			
		||||
	PrecChangeTimer.Stop();
 | 
			
		||||
	cp = norm2(r);
 | 
			
		||||
	MaxResidSinceLastRelUp = cp;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -97,8 +97,8 @@ private:
 | 
			
		||||
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
 | 
			
		||||
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
 | 
			
		||||
 | 
			
		||||
  static void PrintBytes(void);
 | 
			
		||||
 public:
 | 
			
		||||
  static void PrintBytes(void);
 | 
			
		||||
  static void Audit(std::string s);
 | 
			
		||||
  static void Init(void);
 | 
			
		||||
  static void InitMessage(void);
 | 
			
		||||
@@ -119,6 +119,8 @@ private:
 | 
			
		||||
  static uint64_t     DeviceToHostBytes;
 | 
			
		||||
  static uint64_t     HostToDeviceXfer;
 | 
			
		||||
  static uint64_t     DeviceToHostXfer;
 | 
			
		||||
  static uint64_t     DeviceEvictions;
 | 
			
		||||
  static uint64_t     DeviceDestroy;
 | 
			
		||||
 
 | 
			
		||||
 private:
 | 
			
		||||
#ifndef GRID_UVM
 | 
			
		||||
@@ -176,6 +178,7 @@ private:
 | 
			
		||||
 | 
			
		||||
 public:
 | 
			
		||||
  static void Print(void);
 | 
			
		||||
  static void PrintAll(void);
 | 
			
		||||
  static void PrintState( void* CpuPtr);
 | 
			
		||||
  static int   isOpen   (void* CpuPtr);
 | 
			
		||||
  static void  ViewClose(void* CpuPtr,ViewMode mode);
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,8 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
			
		||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceEvictions;
 | 
			
		||||
uint64_t  MemoryManager::DeviceDestroy;
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////
 | 
			
		||||
// Priority ordering for unlocked entries
 | 
			
		||||
@@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 | 
			
		||||
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
 | 
			
		||||
  if(AccCache.AccPtr) {
 | 
			
		||||
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
 | 
			
		||||
    DeviceDestroy++;
 | 
			
		||||
    DeviceBytes   -=AccCache.bytes;
 | 
			
		||||
    LRUremove(AccCache);
 | 
			
		||||
    AccCache.AccPtr=(uint64_t) NULL;
 | 
			
		||||
    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
 | 
			
		||||
  }
 | 
			
		||||
  uint64_t CpuPtr = AccCache.CpuPtr;
 | 
			
		||||
@@ -126,8 +130,14 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 | 
			
		||||
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 | 
			
		||||
{
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Make CPU consistent, remove from Accelerator, remove entry
 | 
			
		||||
  // Cannot be locked. If allocated must be in LRU pool.
 | 
			
		||||
  // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
 | 
			
		||||
  // Cannot be acclocked. If allocated must be in LRU pool.
 | 
			
		||||
  //
 | 
			
		||||
  // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
 | 
			
		||||
  //                          and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
 | 
			
		||||
  //                          but there is a weakness where erasing CpuLock'ed entries is attempted
 | 
			
		||||
  //                          Take these OUT of the LRU queue when CPU locked?
 | 
			
		||||
  //                          Cannot take them out of the table, as the cpuLock data is important.
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  assert(AccCache.state!=Empty);
 | 
			
		||||
  
 | 
			
		||||
@@ -139,15 +149,17 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 | 
			
		||||
  if(AccCache.state==AccDirty) {
 | 
			
		||||
    Flush(AccCache);
 | 
			
		||||
  }
 | 
			
		||||
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
 | 
			
		||||
  if(AccCache.AccPtr) {
 | 
			
		||||
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
 | 
			
		||||
    DeviceBytes   -=AccCache.bytes;
 | 
			
		||||
    LRUremove(AccCache);
 | 
			
		||||
    AccCache.AccPtr=(uint64_t)NULL;
 | 
			
		||||
    AccCache.state=CpuDirty; // CPU primary now
 | 
			
		||||
    DeviceBytes   -=AccCache.bytes;
 | 
			
		||||
    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
 | 
			
		||||
  }
 | 
			
		||||
  uint64_t CpuPtr = AccCache.CpuPtr;
 | 
			
		||||
  EntryErase(CpuPtr);
 | 
			
		||||
  //  uint64_t CpuPtr = AccCache.CpuPtr;
 | 
			
		||||
  DeviceEvictions++;
 | 
			
		||||
  //  EntryErase(CpuPtr);
 | 
			
		||||
}
 | 
			
		||||
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 | 
			
		||||
{
 | 
			
		||||
@@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::EvictVictims(uint64_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  assert(bytes<DeviceMaxBytes);
 | 
			
		||||
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
 | 
			
		||||
    if ( DeviceLRUBytes > 0){
 | 
			
		||||
      assert(LRU.size()>0);
 | 
			
		||||
      uint64_t victim = LRU.back();
 | 
			
		||||
      uint64_t victim = LRU.back(); // From the LRU
 | 
			
		||||
      auto AccCacheIterator = EntryLookup(victim);
 | 
			
		||||
      auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
      Evict(AccCache);
 | 
			
		||||
    } else {
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // If view is opened on device remove from LRU
 | 
			
		||||
  assert(AccCache.accLock>0);
 | 
			
		||||
  // If view is opened on device must remove from LRU
 | 
			
		||||
  if(AccCache.LRU_valid==1){
 | 
			
		||||
    // the entry may still be in the LRU; it must be removed as it is now locked on GPU
 | 
			
		||||
    dprintf("AccCache entry removed from LRU \n");
 | 
			
		||||
@@ -388,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
 | 
			
		||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
  auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
 | 
			
		||||
  if (!AccCache.AccPtr) {
 | 
			
		||||
     EvictVictims(bytes);
 | 
			
		||||
  }
 | 
			
		||||
  // CPU doesn't need to free space
 | 
			
		||||
  //  if (!AccCache.AccPtr) {
 | 
			
		||||
  //    EvictVictims(bytes);
 | 
			
		||||
  //  }
 | 
			
		||||
 | 
			
		||||
  assert((mode==CpuRead)||(mode==CpuWrite));
 | 
			
		||||
  assert(AccCache.accLock==0);  // Programming error
 | 
			
		||||
@@ -444,20 +461,28 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 | 
			
		||||
void  MemoryManager::Print(void)
 | 
			
		||||
{
 | 
			
		||||
  PrintBytes();
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Memory Manager                             " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceBytes   << " bytes allocated on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device       " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << HostToDeviceXfer << " transfers        to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceToHostXfer << " transfers        from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceEvictions  << " Evictions from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceDestroy    << " Destroyed vectors on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::PrintAll(void)
 | 
			
		||||
{
 | 
			
		||||
  Print();
 | 
			
		||||
  std::cout << GridLogMessage << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
 | 
			
		||||
    auto &AccCache = it->second;
 | 
			
		||||
    
 | 
			
		||||
@@ -467,13 +492,13 @@ void  MemoryManager::Print(void)
 | 
			
		||||
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 | 
			
		||||
    if ( AccCache.state==Consistent)str = std::string("Consistent");
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
	      << "\t" << AccCache.cpuLock
 | 
			
		||||
	      << "\t" << AccCache.accLock
 | 
			
		||||
	      << "\t" << AccCache.LRU_valid<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
int   MemoryManager::isOpen   (void* _CpuPtr) 
 | 
			
		||||
@@ -489,6 +514,25 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
 | 
			
		||||
}
 | 
			
		||||
void MemoryManager::Audit(std::string s)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t CpuBytes=0;
 | 
			
		||||
  uint64_t AccBytes=0;
 | 
			
		||||
  uint64_t LruBytes1=0;
 | 
			
		||||
  uint64_t LruBytes2=0;
 | 
			
		||||
  uint64_t LruCnt=0;
 | 
			
		||||
  uint64_t LockedBytes=0;
 | 
			
		||||
  
 | 
			
		||||
  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
 | 
			
		||||
  for(auto it=LRU.begin();it!=LRU.end();it++){
 | 
			
		||||
    uint64_t cpuPtr = *it;
 | 
			
		||||
    assert(EntryPresent(cpuPtr));
 | 
			
		||||
    auto AccCacheIterator = EntryLookup(cpuPtr);
 | 
			
		||||
    auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
    LruBytes2+=AccCache.bytes;
 | 
			
		||||
    assert(AccCache.LRU_valid==1);
 | 
			
		||||
    assert(AccCache.LRU_entry==it);
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
 | 
			
		||||
    auto &AccCache = it->second;
 | 
			
		||||
    
 | 
			
		||||
@@ -498,7 +542,14 @@ void MemoryManager::Audit(std::string s)
 | 
			
		||||
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 | 
			
		||||
    if ( AccCache.state==Consistent)str = std::string("Consistent");
 | 
			
		||||
 | 
			
		||||
    CpuBytes+=AccCache.bytes;
 | 
			
		||||
    if( AccCache.AccPtr )    AccBytes+=AccCache.bytes;
 | 
			
		||||
    if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
 | 
			
		||||
    if( AccCache.LRU_valid ) LruCnt++;
 | 
			
		||||
    
 | 
			
		||||
    if ( AccCache.cpuLock || AccCache.accLock ) {
 | 
			
		||||
      assert(AccCache.LRU_valid==0);
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
		<< "\t cpuLock  " << AccCache.cpuLock
 | 
			
		||||
@@ -509,6 +560,15 @@ void MemoryManager::Audit(std::string s)
 | 
			
		||||
    assert( AccCache.cpuLock== 0 ) ;
 | 
			
		||||
    assert( AccCache.accLock== 0 ) ;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
 | 
			
		||||
  assert(LruBytes1==LruBytes2);
 | 
			
		||||
  assert(LruBytes1==DeviceLRUBytes);
 | 
			
		||||
  std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
 | 
			
		||||
  assert(AccBytes==DeviceBytes);
 | 
			
		||||
  std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
 | 
			
		||||
  assert(LruCnt == LRU.size());
 | 
			
		||||
  std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void MemoryManager::PrintState(void* _CpuPtr)
 | 
			
		||||
@@ -526,8 +586,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
 | 
			
		||||
    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
    std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
    << "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
    << "\t" << AccCache.cpuLock
 | 
			
		||||
    << "\t" << AccCache.accLock
 | 
			
		||||
    << "\t" << AccCache.LRU_valid<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -12,6 +12,8 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
			
		||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceEvictions;
 | 
			
		||||
uint64_t  MemoryManager::DeviceDestroy;
 | 
			
		||||
 | 
			
		||||
void  MemoryManager::Audit(std::string s){};
 | 
			
		||||
void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 | 
			
		||||
@@ -22,6 +24,7 @@ void  MemoryManager::PrintState(void* CpuPtr)
 | 
			
		||||
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 | 
			
		||||
};
 | 
			
		||||
void  MemoryManager::Print(void){};
 | 
			
		||||
void  MemoryManager::PrintAll(void){};
 | 
			
		||||
void  MemoryManager::NotifyDeletion(void *ptr){};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
@@ -291,8 +291,8 @@ public:
 | 
			
		||||
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    this->checkerboard = r.Checkerboard();
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    auto him= r.View(AcceleratorRead);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
      coalescedWrite(me[ss],him(ss));
 | 
			
		||||
    });
 | 
			
		||||
@@ -306,8 +306,8 @@ public:
 | 
			
		||||
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
 | 
			
		||||
    this->checkerboard = r.Checkerboard();
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    auto him= r.View(AcceleratorRead);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
      coalescedWrite(me[ss],him(ss));
 | 
			
		||||
    });
 | 
			
		||||
 
 | 
			
		||||
@@ -211,13 +211,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
 | 
			
		||||
  assert(ok);
 | 
			
		||||
 | 
			
		||||
  Integer smemSize = numThreads * sizeof(sobj);
 | 
			
		||||
 | 
			
		||||
  Vector<sobj> buffer(numBlocks);
 | 
			
		||||
  // UVM seems to be buggy under later CUDA drivers
 | 
			
		||||
  // This fails on A100 with driver 530.30.02 / CUDA 12.1
 | 
			
		||||
  // Fails with multiple NVCC versions back to 11.4,
 | 
			
		||||
  // which worked with earlier drivers.
 | 
			
		||||
  // Not sure which driver version first failed; this bears checking
 | 
			
		||||
  // This is awkward to check, as it requires installing multiple driver versions
 | 
			
		||||
#undef UVM_BLOCK_BUFFER  
 | 
			
		||||
#ifndef UVM_BLOCK_BUFFER  
 | 
			
		||||
  commVector<sobj> buffer(numBlocks);
 | 
			
		||||
  sobj *buffer_v = &buffer[0];
 | 
			
		||||
  
 | 
			
		||||
  sobj result;
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
 | 
			
		||||
  accelerator_barrier();
 | 
			
		||||
  auto result = buffer_v[0];
 | 
			
		||||
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 | 
			
		||||
#else
 | 
			
		||||
  Vector<sobj> buffer(numBlocks);
 | 
			
		||||
  sobj *buffer_v = &buffer[0];
 | 
			
		||||
  sobj result;
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
 | 
			
		||||
  accelerator_barrier();
 | 
			
		||||
  result = *buffer_v;
 | 
			
		||||
#endif
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1080,6 +1080,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
 | 
			
		||||
template<class VobjOut, class VobjIn>
 | 
			
		||||
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 | 
			
		||||
{
 | 
			
		||||
@@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 | 
			
		||||
      precisionChange(vout,vin,N);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
//Convert a Lattice from one precision to another
 | 
			
		||||
//Convert a Lattice from one precision to another (original, slow implementation)
 | 
			
		||||
template<class VobjOut, class VobjIn>
 | 
			
		||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 | 
			
		||||
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 | 
			
		||||
{
 | 
			
		||||
  assert(out.Grid()->Nd() == in.Grid()->Nd());
 | 
			
		||||
  for(int d=0;d<out.Grid()->Nd();d++){
 | 
			
		||||
@@ -1145,6 +1146,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
 | 
			
		||||
class precisionChangeWorkspace{
 | 
			
		||||
  std::pair<Integer,Integer>* fmap_device; //device pointer
 | 
			
		||||
  //maintain grids for checking
 | 
			
		||||
  GridBase* _out_grid;
 | 
			
		||||
  GridBase* _in_grid;
 | 
			
		||||
public:
 | 
			
		||||
  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
 | 
			
		||||
    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
 | 
			
		||||
    assert(out_grid->Nd() == in_grid->Nd());
 | 
			
		||||
    for(int d=0;d<out_grid->Nd();d++){
 | 
			
		||||
      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
 | 
			
		||||
    }
 | 
			
		||||
    int Nsimd_out = out_grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
 | 
			
		||||
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
 | 
			
		||||
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
 | 
			
		||||
  
 | 
			
		||||
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
 | 
			
		||||
    thread_for(out_oidx,out_grid->oSites(),{
 | 
			
		||||
	Coordinate out_ocorr; 
 | 
			
		||||
	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 | 
			
		||||
      
 | 
			
		||||
	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 | 
			
		||||
	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 | 
			
		||||
	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 | 
			
		||||
	
 | 
			
		||||
	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 | 
			
		||||
	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 | 
			
		||||
	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 | 
			
		||||
	  int in_oidx = 0, in_lane = 0;
 | 
			
		||||
	  for(int d=0;d<in_grid->_ndimension;d++){
 | 
			
		||||
	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 | 
			
		||||
	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 | 
			
		||||
	  }
 | 
			
		||||
	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 | 
			
		||||
	}
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
 | 
			
		||||
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
 | 
			
		||||
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
 | 
			
		||||
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //Prevent moving or copying
 | 
			
		||||
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
 | 
			
		||||
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
 | 
			
		||||
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
 | 
			
		||||
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
 | 
			
		||||
  
 | 
			
		||||
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
 | 
			
		||||
 | 
			
		||||
  void checkGrids(GridBase* out, GridBase* in) const{
 | 
			
		||||
    conformable(out, _out_grid);
 | 
			
		||||
    conformable(in, _in_grid);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  ~precisionChangeWorkspace(){
 | 
			
		||||
    acceleratorFreeDevice(fmap_device);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
 | 
			
		||||
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
 | 
			
		||||
template<class VobjOut, class VobjIn>
 | 
			
		||||
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
 | 
			
		||||
  if(out.Grid() == in.Grid()){
 | 
			
		||||
    precisionChangeFast(out,in);
 | 
			
		||||
    return 1;
 | 
			
		||||
  }else{
 | 
			
		||||
    return 0;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
template<class VobjOut, class VobjIn>
 | 
			
		||||
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
 | 
			
		||||
//which contains the mapping data.
 | 
			
		||||
template<class VobjOut, class VobjIn>
 | 
			
		||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
 | 
			
		||||
  if(_precisionChangeFastWrap(out,in,0)) return;
 | 
			
		||||
  
 | 
			
		||||
  static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
 | 
			
		||||
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  constexpr int Nsimd_out = VobjOut::Nsimd();
 | 
			
		||||
 | 
			
		||||
  workspace.checkGrids(out.Grid(),in.Grid());
 | 
			
		||||
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
 | 
			
		||||
 | 
			
		||||
  //Do the copy/precision change
 | 
			
		||||
  autoView( out_v , out, AcceleratorWrite);
 | 
			
		||||
  autoView( in_v , in, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
 | 
			
		||||
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
 | 
			
		||||
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 | 
			
		||||
	int in_oidx = fmap_osite[out_lane].first;
 | 
			
		||||
	int in_lane = fmap_osite[out_lane].second;
 | 
			
		||||
	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
 | 
			
		||||
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
 | 
			
		||||
template<class VobjOut, class VobjIn>
 | 
			
		||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 | 
			
		||||
  if(_precisionChangeFastWrap(out,in,0)) return;   
 | 
			
		||||
  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
 | 
			
		||||
  precisionChange(out, in, workspace);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Communicate between grids
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifndef GRID_PERFCOUNT_H
 | 
			
		||||
#define GRID_PERFCOUNT_H
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifndef __SSC_START
 | 
			
		||||
#define __SSC_START
 | 
			
		||||
#define __SSC_STOP
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <sys/time.h>
 | 
			
		||||
#include <ctime>
 | 
			
		||||
#include <chrono>
 | 
			
		||||
 
 | 
			
		||||
@@ -16,7 +16,7 @@
 | 
			
		||||
 | 
			
		||||
#ifdef __NVCC__
 | 
			
		||||
#pragma push
 | 
			
		||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 | 
			
		||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 | 
			
		||||
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 | 
			
		||||
#else
 | 
			
		||||
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 | 
			
		||||
 
 | 
			
		||||
@@ -226,7 +226,7 @@ template<class vobjOut, class vobjIn>
 | 
			
		||||
accelerator_inline 
 | 
			
		||||
void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 | 
			
		||||
{
 | 
			
		||||
  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
 | 
			
		||||
  static_assert( std::is_same<typename vobjOut::scalar_typeD, typename vobjIn::scalar_typeD>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
 | 
			
		||||
 | 
			
		||||
  typedef typename vobjOut::vector_type ovector_type;  
 | 
			
		||||
  typedef typename vobjIn::vector_type ivector_type;  
 | 
			
		||||
@@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
 | 
			
		||||
  ovector_type * __restrict__ op = (ovector_type *)&vecOut;
 | 
			
		||||
  ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
 | 
			
		||||
  for(int w=0;w<owords;w++){
 | 
			
		||||
    itmp = ip[iNsimd*w].getlane(lane_in);
 | 
			
		||||
    itmp = ip[w].getlane(lane_in);
 | 
			
		||||
    otmp = itmp; //potential precision change
 | 
			
		||||
    op[oNsimd*w].putlane(otmp,lane_out);
 | 
			
		||||
    op[w].putlane(otmp,lane_out);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										8
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								TODO
									
									
									
									
									
								
							@@ -1,3 +1,10 @@
 | 
			
		||||
- - Slice sum optimisation & A2A - atomic addition
 | 
			
		||||
- - Also faster non-atomic reduction
 | 
			
		||||
- - Remaining PRs
 | 
			
		||||
- - DDHMC
 | 
			
		||||
 | 
			
		||||
=================
 | 
			
		||||
=================
 | 
			
		||||
Lattice_basis.h -- > HIP and SYCL GPU code
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -8,6 +15,7 @@ DDHMC
 | 
			
		||||
-- Multishift Mixed Precision - DONE
 | 
			
		||||
-- Pole dependent residual  - DONE
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
=======
 | 
			
		||||
-- comms threads issue??
 | 
			
		||||
-- Part done: Staggered kernel performance on GPU
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										189
									
								
								benchmarks/Benchmark_prec_change.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										189
									
								
								benchmarks/Benchmark_prec_change.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,189 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./benchmarks/Benchmark_prec_change.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Christopher Kelly <ckelly@bnl.gov>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  int Ls = 12;
 | 
			
		||||
  Coordinate latt4 = GridDefaultLatt();
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGridD   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
 | 
			
		||||
  GridCartesian         * FGridD   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
 | 
			
		||||
  GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
 | 
			
		||||
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
 | 
			
		||||
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
  
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG4(UGridD);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG5(FGridD);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD field_d(FGridD), tmp_d(FGridD);
 | 
			
		||||
  random(RNG5,field_d); tmp_d = field_d;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
 | 
			
		||||
  precisionChange(field_d2, field_d); tmp_d2 = field_d2;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionF field_f(FGridF), tmp_f(FGridF);
 | 
			
		||||
  precisionChange(field_f, field_d); tmp_f = field_f;
 | 
			
		||||
 | 
			
		||||
  int N = 500;
 | 
			
		||||
 | 
			
		||||
  double time_ds = 0, time_sd = 0;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    //We want to benchmark the typical scenario of both fields being device resident
 | 
			
		||||
    //To do this, invoke an operation that will open a device view and touch all sites
 | 
			
		||||
    //with a write operation that invalidates the CPU copy
 | 
			
		||||
    field_d = tmp_d;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    double start=usecond();
 | 
			
		||||
    precisionChangeOrig(field_d,field_f);
 | 
			
		||||
    double stop=usecond();
 | 
			
		||||
    time_sd += stop - start;
 | 
			
		||||
 | 
			
		||||
    field_d = tmp_d;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    start=usecond();
 | 
			
		||||
    precisionChangeOrig(field_f,field_d);
 | 
			
		||||
    stop=usecond();
 | 
			
		||||
    time_ds += stop - start;   
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
 | 
			
		||||
  precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
 | 
			
		||||
  
 | 
			
		||||
  std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;
 | 
			
		||||
  time_sd = time_ds = 0;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    field_d = tmp_d;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    double start=usecond();
 | 
			
		||||
    precisionChange(field_d,field_f, wk_sp_to_dp);
 | 
			
		||||
    double stop=usecond();
 | 
			
		||||
    time_sd += stop - start;
 | 
			
		||||
 | 
			
		||||
    field_d = tmp_d;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    start=usecond();
 | 
			
		||||
    precisionChange(field_f,field_d, wk_dp_to_sp);
 | 
			
		||||
    stop=usecond();
 | 
			
		||||
    time_ds += stop - start;   
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
 | 
			
		||||
  time_sd = time_ds = 0;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    field_d = tmp_d;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    double start=usecond();
 | 
			
		||||
    precisionChange(field_d,field_f);
 | 
			
		||||
    double stop=usecond();
 | 
			
		||||
    time_sd += stop - start;
 | 
			
		||||
 | 
			
		||||
    field_d = tmp_d;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    start=usecond();
 | 
			
		||||
    precisionChange(field_f,field_d);
 | 
			
		||||
    stop=usecond();
 | 
			
		||||
    time_ds += stop - start;
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;
 | 
			
		||||
  time_sd = time_ds = 0;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    field_d2 = tmp_d2;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    double start=usecond();
 | 
			
		||||
    precisionChangeFast(field_d2,field_f);
 | 
			
		||||
    double stop=usecond();
 | 
			
		||||
    time_sd += stop - start;
 | 
			
		||||
 | 
			
		||||
    field_d2 = tmp_d2;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    start=usecond();
 | 
			
		||||
    precisionChangeFast(field_f,field_d2);
 | 
			
		||||
    stop=usecond();
 | 
			
		||||
    time_ds += stop - start;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
 | 
			
		||||
  time_sd = time_ds = 0;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    field_d2 = tmp_d2;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    double start=usecond();
 | 
			
		||||
    precisionChange(field_d2,field_f);
 | 
			
		||||
    double stop=usecond();
 | 
			
		||||
    time_sd += stop - start;
 | 
			
		||||
 | 
			
		||||
    field_d2 = tmp_d2;
 | 
			
		||||
    field_f = tmp_f;
 | 
			
		||||
 | 
			
		||||
    start=usecond();
 | 
			
		||||
    precisionChange(field_f,field_d2);
 | 
			
		||||
    stop=usecond();
 | 
			
		||||
    time_ds += stop - start;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -1,3 +1,2 @@
 | 
			
		||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
 | 
			
		||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU-RRII --enable-comms=mpi
 | 
			
		||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU --enable-debug --enable-comms=mpi
 | 
			
		||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -101,7 +101,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
 | 
			
		||||
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
 | 
			
		||||
  int iters;
 | 
			
		||||
  for(int i=0;i<200;i++){
 | 
			
		||||
  for(int i=0;i<10;i++){
 | 
			
		||||
    result_o = Zero();
 | 
			
		||||
    t1=usecond();
 | 
			
		||||
    mCG(src_o,result_o);
 | 
			
		||||
 
 | 
			
		||||
@@ -1,35 +1,12 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_cshift.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 ;
 | 
			
		||||
// Gamma matrix identifiers in the order X, Y, Z, T, followed by Gamma5
Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT,
  Gamma::Algebra::Gamma5
};
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
@@ -49,22 +26,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian RBGRID(&GRID);
 | 
			
		||||
 | 
			
		||||
  LatticeComplexD     one(&GRID);
 | 
			
		||||
  LatticeComplexD      zz(&GRID);
 | 
			
		||||
  LatticeComplexD       C(&GRID);
 | 
			
		||||
  LatticeComplexD  Ctilde(&GRID);
 | 
			
		||||
  LatticeComplexD  Cref  (&GRID);
 | 
			
		||||
  LatticeComplexD  Csav  (&GRID);
 | 
			
		||||
  LatticeComplexD    coor(&GRID);
 | 
			
		||||
 | 
			
		||||
  LatticeSpinMatrixD    S(&GRID);
 | 
			
		||||
  LatticeSpinMatrixD    Stilde(&GRID);
 | 
			
		||||
  
 | 
			
		||||
  Coordinate p({1,3,2,3});
 | 
			
		||||
 | 
			
		||||
  one = ComplexD(1.0,0.0);
 | 
			
		||||
  zz  = ComplexD(0.0,0.0);
 | 
			
		||||
 | 
			
		||||
  ComplexD ci(0.0,1.0);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
@@ -73,7 +36,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeFieldD Umu(&GRID);
 | 
			
		||||
 | 
			
		||||
  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
@@ -81,16 +43,78 @@ int main (int argc, char ** argv)
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
 | 
			
		||||
    LatticeFermionD    src_p(&GRID);
 | 
			
		||||
    LatticeFermionD    tmp(&GRID);
 | 
			
		||||
    LatticeFermionD    ref(&GRID);
 | 
			
		||||
    LatticeFermionD    result(&GRID);
 | 
			
		||||
    
 | 
			
		||||
    RealD mass=0.01;
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
 | 
			
		||||
    
 | 
			
		||||
    Dw.M(src,tmp);
 | 
			
		||||
    Dw.M(src,ref);
 | 
			
		||||
    std::cout << "Norm src "<<norm2(src)<<std::endl;
 | 
			
		||||
    std::cout << "Norm Dw x src "<<norm2(ref)<<std::endl;
 | 
			
		||||
    {
 | 
			
		||||
      FFT theFFT(&GRID);
 | 
			
		||||
 | 
			
		||||
      ////////////////
 | 
			
		||||
      // operator in Fourier space
 | 
			
		||||
      ////////////////
 | 
			
		||||
      tmp =ref;
 | 
			
		||||
      theFFT.FFT_all_dim(result,tmp,FFT::forward);
 | 
			
		||||
      std::cout<<"FFT[ Dw x src ]  "<< norm2(result)<<std::endl;    
 | 
			
		||||
 | 
			
		||||
      tmp = src;
 | 
			
		||||
      theFFT.FFT_all_dim(src_p,tmp,FFT::forward);
 | 
			
		||||
      std::cout<<"FFT[ src      ]  "<< norm2(src_p)<<std::endl;
 | 
			
		||||
      
 | 
			
		||||
      /////////////////////////////////////////////////////////////////
 | 
			
		||||
      // work out the predicted FT from Fourier
 | 
			
		||||
      /////////////////////////////////////////////////////////////////
 | 
			
		||||
      auto FGrid = &GRID;
 | 
			
		||||
      LatticeFermionD    Kinetic(FGrid); Kinetic = Zero();
 | 
			
		||||
      LatticeComplexD    kmu(FGrid); 
 | 
			
		||||
      LatticeInteger     scoor(FGrid); 
 | 
			
		||||
      LatticeComplexD    sk (FGrid); sk = Zero();
 | 
			
		||||
      LatticeComplexD    sk2(FGrid); sk2= Zero();
 | 
			
		||||
      LatticeComplexD    W(FGrid); W= Zero();
 | 
			
		||||
      LatticeComplexD    one(FGrid); one =ComplexD(1.0,0.0);
 | 
			
		||||
      ComplexD ci(0.0,1.0);
 | 
			
		||||
    
 | 
			
		||||
      for(int mu=0;mu<Nd;mu++) {
 | 
			
		||||
	
 | 
			
		||||
	RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 | 
			
		||||
 | 
			
		||||
	LatticeCoordinate(kmu,mu);
 | 
			
		||||
 | 
			
		||||
	kmu = TwoPiL * kmu;
 | 
			
		||||
      
 | 
			
		||||
	sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
 | 
			
		||||
	sk  = sk  +     sin(kmu)    *sin(kmu); 
 | 
			
		||||
      
 | 
			
		||||
	// -1/2 Dw ->  1/2 gmu (eip - emip) = i sinp gmu
 | 
			
		||||
	Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p);
 | 
			
		||||
	
 | 
			
		||||
      }
 | 
			
		||||
    
 | 
			
		||||
      W = mass + sk2; 
 | 
			
		||||
      Kinetic = Kinetic + W * src_p;
 | 
			
		||||
    
 | 
			
		||||
      std::cout<<"Momentum space src         "<< norm2(src_p)<<std::endl;
 | 
			
		||||
      std::cout<<"Momentum space Dw x src    "<< norm2(Kinetic)<<std::endl;
 | 
			
		||||
      std::cout<<"FT[Coordinate space Dw]    "<< norm2(result)<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
      result = result - Kinetic;
 | 
			
		||||
      std::cout<<"diff "<< norm2(result)<<std::endl;
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    std::cout << " =======================================" <<std::endl;
 | 
			
		||||
    std::cout << " Checking FourierFreePropagator x Dw = 1" <<std::endl;
 | 
			
		||||
    std::cout << " =======================================" <<std::endl;
 | 
			
		||||
    std::cout << "Dw src = " <<norm2(src)<<std::endl;
 | 
			
		||||
    std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
 | 
			
		||||
    Dw.M(src,tmp);
 | 
			
		||||
 | 
			
		||||
    Dw.FreePropagator(tmp,ref,mass);
 | 
			
		||||
 | 
			
		||||
@@ -122,7 +146,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
    ferm()(0)(0) = ComplexD(1.0);
 | 
			
		||||
    pokeSite(ferm,src,point);
 | 
			
		||||
 | 
			
		||||
    RealD mass=0.01;
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
 | 
			
		||||
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
 | 
			
		||||
 | 
			
		||||
    // Momentum space prop
 | 
			
		||||
@@ -155,6 +180,65 @@ int main (int argc, char ** argv)
 | 
			
		||||
    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  //Gauge invariance test
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
    std::cout<<"****************************************"<<std::endl;
 | 
			
		||||
    std::cout << "Gauge invariance test \n";
 | 
			
		||||
    std::cout<<"****************************************"<<std::endl;
 | 
			
		||||
    LatticeGaugeField     U_GT(&GRID); // Gauge transformed field
 | 
			
		||||
    LatticeColourMatrix   g(&GRID);    // local Gauge xform matrix
 | 
			
		||||
    U_GT = Umu;
 | 
			
		||||
    // Make a random xform to the gauge field
 | 
			
		||||
    SU<Nc>::RandomGaugeTransform(pRNG,U_GT,g); // Unit gauge
 | 
			
		||||
 | 
			
		||||
    LatticeFermionD    src(&GRID);
 | 
			
		||||
    LatticeFermionD    tmp(&GRID);
 | 
			
		||||
    LatticeFermionD    ref(&GRID);
 | 
			
		||||
    LatticeFermionD    diff(&GRID);
 | 
			
		||||
 | 
			
		||||
    // could loop over colors
 | 
			
		||||
    src=Zero();
 | 
			
		||||
    Coordinate point(4,0); // 0,0,0,0
 | 
			
		||||
    SpinColourVectorD ferm;
 | 
			
		||||
    ferm=Zero();
 | 
			
		||||
    ferm()(0)(0) = ComplexD(1.0);
 | 
			
		||||
    pokeSite(ferm,src,point);
 | 
			
		||||
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
    WilsonFermionD Dw(U_GT,GRID,RBGRID,mass);
 | 
			
		||||
 | 
			
		||||
    // Momentum space prop
 | 
			
		||||
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
 | 
			
		||||
    Dw.FreePropagator(src,ref,mass) ;
 | 
			
		||||
 | 
			
		||||
    Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
 | 
			
		||||
    LatticeFermionD    result(&GRID); 
 | 
			
		||||
    const int sdir=0;
 | 
			
		||||
    
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Conjugate gradient on normal equations system
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
 | 
			
		||||
    Dw.Mdag(src,tmp);
 | 
			
		||||
    src=tmp;
 | 
			
		||||
    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
 | 
			
		||||
    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
 | 
			
		||||
    CG(HermOp,src,result);
 | 
			
		||||
    
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    std::cout << " Taking difference" <<std::endl;
 | 
			
		||||
    std::cout << "Dw result "<<norm2(result)<<std::endl;
 | 
			
		||||
    std::cout << "Dw ref     "<<norm2(ref)<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    diff = ref - result;
 | 
			
		||||
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
 | 
			
		||||
 | 
			
		||||
    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										110
									
								
								tests/core/Test_memory_manager.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										110
									
								
								tests/core/Test_memory_manager.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,110 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/core/Test_memory_manager.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2022
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pboyle@bnl.gov>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
void  MemoryTest(GridCartesian         * FGrid,int N);
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
 | 
			
		||||
  int N=100;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    std::cout << "============================"<<std::endl;
 | 
			
		||||
    std::cout << "Epoch "<<i<<"/"<<N<<std::endl;
 | 
			
		||||
    std::cout << "============================"<<std::endl;
 | 
			
		||||
    MemoryTest(UGrid,256);
 | 
			
		||||
    MemoryManager::Print();
 | 
			
		||||
    AUDIT();
 | 
			
		||||
  }
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Randomised consistency test of lattice data accessed through host and
// accelerator views.
//
// FGrid : grid on which the N test lattices are allocated.
// N     : number of lattice vectors (and of host reference scalars).
//
// A[v] holds the lattice fields and B[v] the value every checked element of
// A[v] is expected to hold. Each of the 10000 steps picks a random vector and
// either
//   * "writes": adds the running counter to both B[v] and A[v], the latter via
//     a lattice expression, an accelerator_for loop or a thread_for loop; or
//   * "reads" : evaluates a no-op lattice expression, or asserts inside an
//     accelerator_for / thread_for loop that lane 0 of every outer site of
//     A[v] equals B[v].
void  MemoryTest(GridCartesian         * FGrid, int N)
{
  LatticeComplexD zero(FGrid); zero=Zero();
  std::vector<LatticeComplexD> A(N,zero);//FGrid);

  std::vector<ComplexD> B(N,ComplexD(0.0)); // Update sequentially on host

  for(int v=0;v<N;v++) A[v] = Zero();

  uint64_t counter = 0;
  for(int epoch = 0;epoch<10000;epoch++){

    int v  = random() %N; // Which vec
    int w  = random() %2; // Write or read
    int e  = random() %3; // expression or for loop
    int dev= random() %2; // On device?
    //    int e=1;
    ComplexD zc = counter++;

    if ( w ) {
      // Write: keep the host reference in step with the lattice update.
      B[v] = B[v] + zc;
      if ( e == 0 ) {
        // Lattice expression; the extra "- A[v] + A[v]" terms leave the
        // net update at A[v] + zc.
        A[v] = A[v] + zc - A[v] + A[v];
      } else {
        if ( dev ) {
          autoView(A_v,A[v],AcceleratorWrite);
          accelerator_for(ss,FGrid->oSites(),1,{
              A_v[ss] = A_v[ss] + zc;
            });
        } else {
          autoView(A_v,A[v],CpuWrite);
          thread_for(ss,FGrid->oSites(),{
              A_v[ss] = A_v[ss] + zc;
            });
        }
      }
    } else {
      if ( e == 0 ) {
        // Lattice expression that reads A[v] and writes the same value back.
        A[v] = A[v] + A[v] - A[v];
      } else {
        // Copy the reference value into a scalar before launching the check:
        // the loop bodies then use a plain value instead of reaching into the
        // host std::vector B, whose storage is not guaranteed to be
        // addressable from accelerator code.
        ComplexD expected = B[v];
        if ( dev ) {
          autoView(A_v,A[v],AcceleratorRead);
          accelerator_for(ss,FGrid->oSites(),1,{
              assert(expected==A_v[ss]()()().getlane(0));
            });
          //	std::cout << "["<<v<<"] checked on GPU"<<B[v]<<std::endl;
        } else {
          autoView(A_v,A[v],CpuRead);
          thread_for(ss,FGrid->oSites(),{
              assert(expected==A_v[ss]()()().getlane(0));
            });
          //	std::cout << "["<<v<<"] checked on CPU"<<B[v]<<std::endl;
        }
      }
    }
  }

}
 | 
			
		||||
							
								
								
									
										124
									
								
								tests/core/Test_prec_change.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										124
									
								
								tests/core/Test_prec_change.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,124 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/core/Test_prec_change.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Christopher Kelly <ckelly@bnl.gov>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  int Ls = 12;
 | 
			
		||||
  Coordinate latt4 = GridDefaultLatt();
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGridD   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
 | 
			
		||||
  GridCartesian         * FGridD   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
 | 
			
		||||
  GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
 | 
			
		||||
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
 | 
			
		||||
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
  
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG5(FGridD);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  GridParallelRNG          RNG5F(FGridF);  RNG5F.SeedFixedIntegers(seeds5);
 | 
			
		||||
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD field_d(FGridD), tmp_d(FGridD);
 | 
			
		||||
  random(RNG5,field_d);
 | 
			
		||||
  RealD norm2_field_d = norm2(field_d);
 | 
			
		||||
  
 | 
			
		||||
  LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
 | 
			
		||||
  random(RNG5F,field_d2);
 | 
			
		||||
  RealD norm2_field_d2 = norm2(field_d2);
 | 
			
		||||
  
 | 
			
		||||
  LatticeFermionF field_f(FGridF);
 | 
			
		||||
  
 | 
			
		||||
  //Test original implementation
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << GridLogMessage << "Testing original implementation" << std::endl;
 | 
			
		||||
    field_f = Zero();
 | 
			
		||||
    precisionChangeOrig(field_f,field_d);
 | 
			
		||||
    RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
    tmp_d = Zero();
 | 
			
		||||
    precisionChangeOrig(tmp_d, field_f);
 | 
			
		||||
    Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  //Test new implementation with pregenerated workspace
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl;
 | 
			
		||||
    precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
 | 
			
		||||
    precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
 | 
			
		||||
    
 | 
			
		||||
    field_f = Zero();
 | 
			
		||||
    precisionChange(field_f,field_d,wk_dp_to_sp);
 | 
			
		||||
    RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
    tmp_d = Zero();
 | 
			
		||||
    precisionChange(tmp_d, field_f,wk_sp_to_dp);
 | 
			
		||||
    Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  //Test new implementation without pregenerated workspace
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl;
 | 
			
		||||
    field_f = Zero();
 | 
			
		||||
    precisionChange(field_f,field_d);
 | 
			
		||||
    RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
    tmp_d = Zero();
 | 
			
		||||
    precisionChange(tmp_d, field_f);
 | 
			
		||||
    Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  //Test fast implementation
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl;
 | 
			
		||||
    field_f = Zero();
 | 
			
		||||
    precisionChangeFast(field_f,field_d2);
 | 
			
		||||
    RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
    tmp_d2 = Zero();
 | 
			
		||||
    precisionChangeFast(tmp_d2, field_f);
 | 
			
		||||
    Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2;
 | 
			
		||||
    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << "Done" << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										122
									
								
								tests/solver/Test_dwf_mixedcg_prec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								tests/solver/Test_dwf_mixedcg_prec.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,122 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/solver/Test_dwf_mixedcg_prec.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
//using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  const int Ls=12;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid_f   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
 | 
			
		||||
  GridCartesian         * FGrid_f   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD    src(FGrid); random(RNG5,src);
 | 
			
		||||
  LatticeFermionD result(FGrid); result=Zero();
 | 
			
		||||
  LatticeGaugeFieldD Umu(UGrid);
 | 
			
		||||
  LatticeGaugeFieldF Umu_f(UGrid_f); 
 | 
			
		||||
  
 | 
			
		||||
  SU<Nc>::HotConfiguration(RNG4,Umu);
 | 
			
		||||
 | 
			
		||||
  precisionChange(Umu_f,Umu);
 | 
			
		||||
  
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5=1.8;
 | 
			
		||||
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
  DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5);
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD    src_o(FrbGrid);
 | 
			
		||||
  LatticeFermionD result_o(FrbGrid);
 | 
			
		||||
  LatticeFermionD result_o_2(FrbGrid);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
  result_o.Checkerboard() = Odd;
 | 
			
		||||
  result_o = Zero();
 | 
			
		||||
  result_o_2.Checkerboard() = Odd;
 | 
			
		||||
  result_o_2 = Zero();
 | 
			
		||||
 | 
			
		||||
  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
			
		||||
  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
 | 
			
		||||
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
 | 
			
		||||
  double t1,t2,flops;
 | 
			
		||||
  double MdagMsiteflops = 1452; // Mobius (real coeffs)
 | 
			
		||||
  // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
 | 
			
		||||
  double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
 | 
			
		||||
  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
 | 
			
		||||
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
 | 
			
		||||
 | 
			
		||||
  result_o = Zero();
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mCG(src_o,result_o);
 | 
			
		||||
  t2=usecond();
 | 
			
		||||
  int iters = mCG.TotalInnerIterations; //Number of inner CG iterations
 | 
			
		||||
  flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
 | 
			
		||||
  flops+= CGsiteflops*FrbGrid->gSites()*iters;
 | 
			
		||||
  std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
 | 
			
		||||
  std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
 | 
			
		||||
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
 | 
			
		||||
  result_o_2 = Zero();
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  CG(HermOpEO,src_o,result_o_2);
 | 
			
		||||
  t2=usecond();
 | 
			
		||||
  iters = CG.IterationsToComplete;
 | 
			
		||||
  flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
 | 
			
		||||
  flops+= CGsiteflops*FrbGrid->gSites()*iters;
 | 
			
		||||
  
 | 
			
		||||
  std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
 | 
			
		||||
  std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD diff_o(FrbGrid);
 | 
			
		||||
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
 | 
			
		||||
 | 
			
		||||
  MemoryManager::Print();
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										143
									
								
								tests/solver/Test_dwf_relupcg_prec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										143
									
								
								tests/solver/Test_dwf_relupcg_prec.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,143 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/solver/Test_dwf_relupcg_prec.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Christopher Kelly <ckelly@bnl.gov>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  double relup_delta = 0.2;
 | 
			
		||||
  for(int i=1;i<argc-1;i++){
 | 
			
		||||
    std::string sarg = argv[i];
 | 
			
		||||
    if(sarg == "--relup_delta"){
 | 
			
		||||
      std::stringstream ss; ss << argv[i+1]; ss >> relup_delta;
 | 
			
		||||
      std::cout << GridLogMessage << "Set reliable update Delta to " << relup_delta << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
  }   
 | 
			
		||||
  
 | 
			
		||||
  const int Ls=12;
 | 
			
		||||
 | 
			
		||||
  { 
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid_f   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
 | 
			
		||||
  GridCartesian         * FGrid_f   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD    src(FGrid); random(RNG5,src);
 | 
			
		||||
  LatticeFermionD result(FGrid); result=Zero();
 | 
			
		||||
  LatticeGaugeFieldD Umu(UGrid);
 | 
			
		||||
  LatticeGaugeFieldF Umu_f(UGrid_f); 
 | 
			
		||||
  
 | 
			
		||||
  SU<Nc>::HotConfiguration(RNG4,Umu);
 | 
			
		||||
 | 
			
		||||
  precisionChange(Umu_f,Umu);
 | 
			
		||||
  
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5=1.8;
 | 
			
		||||
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
  DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5);
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD    src_o(FrbGrid);
 | 
			
		||||
  LatticeFermionD result_o(FrbGrid);
 | 
			
		||||
  LatticeFermionD result_o_2(FrbGrid);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
  result_o.Checkerboard() = Odd;
 | 
			
		||||
  result_o = Zero();
 | 
			
		||||
  result_o_2.Checkerboard() = Odd;
 | 
			
		||||
  result_o_2 = Zero();
 | 
			
		||||
 | 
			
		||||
  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
			
		||||
  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
 | 
			
		||||
  ConjugateGradientReliableUpdate<LatticeFermionD,LatticeFermionF> mCG(1e-8, 10000, relup_delta, FrbGrid_f, HermOpEO_f, HermOpEO);
 | 
			
		||||
  double t1,t2,flops;
 | 
			
		||||
  double MdagMsiteflops = 1452; // Mobius (real coeffs)
 | 
			
		||||
  // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
 | 
			
		||||
  double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
 | 
			
		||||
  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
 | 
			
		||||
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
 | 
			
		||||
  int iters, iters_cleanup, relups, tot_iters;
 | 
			
		||||
  for(int i=0;i<10;i++){
 | 
			
		||||
    result_o = Zero();
 | 
			
		||||
    t1=usecond();
 | 
			
		||||
    mCG(src_o,result_o);
 | 
			
		||||
    t2=usecond();
 | 
			
		||||
    iters = mCG.IterationsToComplete; //Number of single prec CG iterations
 | 
			
		||||
    iters_cleanup = mCG.IterationsToCleanup;
 | 
			
		||||
    relups = mCG.ReliableUpdatesPerformed;
 | 
			
		||||
    tot_iters  = iters + iters_cleanup + relups; //relup cost MdagM application in double
 | 
			
		||||
    
 | 
			
		||||
    flops = MdagMsiteflops*4*FrbGrid->gSites()*tot_iters;
 | 
			
		||||
    flops+= CGsiteflops*FrbGrid->gSites()*tot_iters;
 | 
			
		||||
    std::cout << " SinglePrecision single prec iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
 | 
			
		||||
    std::cout << " SinglePrecision double prec cleanup iterations/sec "<< iters_cleanup/(t2-t1)*1000.*1000.<<std::endl;
 | 
			
		||||
    std::cout << " SinglePrecision reliable updates/sec "<< relups/(t2-t1)*1000.*1000.<<std::endl;
 | 
			
		||||
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
 | 
			
		||||
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
 | 
			
		||||
  for(int i=0;i<1;i++){
 | 
			
		||||
    result_o_2 = Zero();
 | 
			
		||||
    t1=usecond();
 | 
			
		||||
    CG(HermOpEO,src_o,result_o_2);
 | 
			
		||||
    t2=usecond();
 | 
			
		||||
    iters = CG.IterationsToComplete;
 | 
			
		||||
    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
 | 
			
		||||
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
 | 
			
		||||
    
 | 
			
		||||
    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
 | 
			
		||||
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //  MemoryManager::Print();
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD diff_o(FrbGrid);
 | 
			
		||||
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  MemoryManager::Print();
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user