From 4a359fa9e996bb6c728848f5b4bcd8ad802565e4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:24:08 -0500 Subject: [PATCH] Bug fix --- Grid/allocator/MemoryManagerCache.cc | 116 ++++++++++++++++++++------- 1 file changed, 86 insertions(+), 30 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index f03ee79f..3420c9cc 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -28,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; //////////////////////////////////// // Priority ordering for unlocked entries @@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceDestroy++; DeviceBytes -=AccCache.bytes; LRUremove(AccCache); + AccCache.AccPtr=(uint64_t) NULL; dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; @@ -126,28 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::Evict(AcceleratorViewEntry &AccCache) { /////////////////////////////////////////////////////////////////////////// - // Make CPU consistent, remove from Accelerator, remove entry - // Cannot be locked. If allocated must be in LRU pool. + // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry + // Cannot be acclocked. If allocated must be in LRU pool. + // + // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock. + // and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen + // but there is a weakness where CpuLock entries are attempted for erase + // Take these OUT LRU queue when CPU locked? + // Cannot take out the table as cpuLock data is important. /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); - if (AccCache.accLock!=0) return; - if (AccCache.cpuLock!=0) return; + assert(AccCache.accLock==0); // Cannot evict so logic bomb + assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.state==AccDirty) { Flush(AccCache); } - assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); - DeviceBytes -=AccCache.bytes; LRUremove(AccCache); + AccCache.AccPtr=(uint64_t)NULL; + AccCache.state=CpuDirty; // CPU primary now + DeviceBytes -=AccCache.bytes; dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } - uint64_t CpuPtr = AccCache.CpuPtr; - EntryErase(CpuPtr); + // uint64_t CpuPtr = AccCache.CpuPtr; + DeviceEvictions++; + // EntryErase(CpuPtr); } void MemoryManager::Flush(AcceleratorViewEntry &AccCache) { @@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + assert(bytes DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + uint64_t victim = LRU.back(); // From the LRU auto AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU dprintf("AccCache entry removed from LRU \n"); @@ -389,7 +405,7 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto & AccCache = AccCacheIterator->second; if (!AccCache.AccPtr) { - EvictVictims(bytes); + EvictVictims(bytes); } assert((mode==CpuRead)||(mode==CpuWrite)); @@ -444,20 +460,28 @@ void MemoryManager::NotifyDeletion(void *_ptr) void MemoryManager::Print(void) { PrintBytes(); - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -467,13 +491,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; @@ -498,7 +540,13 @@ void MemoryManager::Audit(std::string s) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - if ( AccCache.cpuLock || AccCache.accLock ) { + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); std::cout << GridLogError << s<< "\n\t 0x"<