diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index ef02f6aa..d055898f 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -40,7 +40,7 @@ void MemoryManager::PrintBytes(void) ////////////////////////////////////////////////////////////////////// MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; int MemoryManager::Victim[MemoryManager::NallocType]; -int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 }; +int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; ////////////////////////////////////////////////////////////////////// // Actual allocation and deallocation utils diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 740d8d92..c22a54f3 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -36,6 +36,11 @@ NAMESPACE_BEGIN(Grid); #define GRID_ALLOC_SMALL_LIMIT (4096) +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__) +#define AUDIT(a) MemoryManager::Audit(FILE_LINE) + /*Pinning pages is costly*/ //////////////////////////////////////////////////////////////////////////// // Advise the LatticeAccelerator class @@ -92,8 +97,9 @@ private: static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ; - static void PrintBytes(void); public: + static void PrintBytes(void); + static void Audit(std::string s); static void Init(void); static void InitMessage(void); static void *AcceleratorAllocate(size_t bytes); @@ -113,6 +119,8 @@ private: static uint64_t DeviceToHostBytes; static uint64_t HostToDeviceXfer; static uint64_t DeviceToHostXfer; + static uint64_t DeviceEvictions; + static uint64_t DeviceDestroy; private: #ifndef GRID_UVM @@ -170,6 +178,7 @@ private: public: static void Print(void); + static void PrintAll(void); static void PrintState( void* CpuPtr); static int isOpen (void* CpuPtr); static void ViewClose(void* CpuPtr,ViewMode mode); diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 04b3fe95..bae184ec 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -3,8 +3,13 @@ #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); -//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); -#define dprintf(...) + +#define MAXLINE 512 +static char print_buffer [ MAXLINE ]; + +#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +//#define dprintf(...) //////////////////////////////////////////////////////////// @@ -23,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; //////////////////////////////////// // Priority ordering for unlocked entries @@ -104,15 +111,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceDestroy++; DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + AccCache.AccPtr=(uint64_t) NULL; + dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -121,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::Evict(AcceleratorViewEntry &AccCache) { /////////////////////////////////////////////////////////////////////////// - // Make CPU consistent, remove from Accelerator, remove entry - // Cannot be locked. If allocated must be in LRU pool. + // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry + // Cannot be acclocked. If allocated must be in LRU pool. + // + // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock. + // and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen + // but there is a weakness where CpuLock entries are attempted for erase + // Take these OUT LRU queue when CPU locked? + // Cannot take out the table as cpuLock data is important. /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); - assert(AccCache.accLock==0); - assert(AccCache.cpuLock==0); + mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", + (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, + (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); + assert(AccCache.accLock==0); // Cannot evict so logic bomb + assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.state==AccDirty) { Flush(AccCache); } - assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); - DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + AccCache.AccPtr=(uint64_t)NULL; + AccCache.state=CpuDirty; // CPU primary now + DeviceBytes -=AccCache.bytes; + dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } - uint64_t CpuPtr = AccCache.CpuPtr; - EntryErase(CpuPtr); + // uint64_t CpuPtr = AccCache.CpuPtr; + DeviceEvictions++; + // EntryErase(CpuPtr); } void MemoryManager::Flush(AcceleratorViewEntry &AccCache) { @@ -150,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -165,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -191,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -202,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -212,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + assert(bytes DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + uint64_t victim = LRU.back(); // From the LRU auto AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -241,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, - (uint64_t)bytes); + (uint64_t)bytes, + (uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -280,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -292,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -334,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -374,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - if (!AccCache.AccPtr) { - EvictVictims(bytes); - } + // CPU doesn't need to free space + // if (!AccCache.AccPtr) { + // EvictVictims(bytes); + // } assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error @@ -430,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr) void MemoryManager::Print(void) { PrintBytes(); - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -453,13 +492,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); + std::cout << GridLogError << s<< "\n\t 0x"<Device memory movement not currently managed by Grid." << std::endl; }; void MemoryManager::Print(void){}; +void MemoryManager::PrintAll(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 9f5f1da7..c4adf86a 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -46,3 +46,4 @@ Author: Peter Boyle #include #include #include +#include diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 9c3d723f..34f13fa6 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -129,7 +129,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -152,7 +152,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -174,7 +174,7 @@ public: this->checkerboard=cb; auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -245,7 +245,7 @@ public: /////////////////////////////////////////// // user defined constructor /////////////////////////////////////////// - Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); @@ -288,7 +288,7 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); @@ -303,7 +303,7 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h new file mode 100644 index 00000000..142e2349 --- /dev/null +++ b/Grid/lattice/Lattice_crc.h @@ -0,0 +1,55 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/lattice/Lattice_crc.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1) +{ + auto ff = localNorm2(f); + if ( mu==-1 ) mu = f.Grid()->Nd()-1; + typedef typename vobj::tensor_reduced normtype; + typedef typename normtype::scalar_object scalar; + std::vector sff; + sliceSum(ff,sff,mu); + for(int t=0;t uint32_t crc(Lattice & buf) +{ + autoView( buf_v , buf, CpuRead); + return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); +} + +#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< &logstreams) { - GridLogError.Active(0); + GridLogError.Active(1); GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on + GridLogMemory.Active(0); + GridLogTracing.Active(0); GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); + GridLogDslash.Active(0); GridLogIntegrator.Active(1); GridLogColours.Active(0); + GridLogHMC.Active(1); for (int i = 0; i < logstreams.size(); i++) { - if (logstreams[i] == std::string("Error")) GridLogError.Active(1); + if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1); + if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); - if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); + if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); + if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); + if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); } } diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 68693647..2d663a3c 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -138,7 +138,8 @@ public: stream << std::setw(log.topWidth); } stream << log.topName << log.background()<< " : "; - stream << log.colour() << std::left; + // stream << log.colour() << std::left; + stream << std::left; if (log.chanWidth > 0) { stream << std::setw(log.chanWidth); @@ -153,9 +154,9 @@ public: stream << log.evidence() << now << log.background() << " : " ; } - stream << log.colour(); + // stream << log.colour(); + stream << std::right; stream.flags(f); - return stream; } else { return devnull; @@ -180,8 +181,12 @@ extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; extern GridLogger GridLogDebug ; extern GridLogger GridLogPerformance; +extern GridLogger GridLogDslash; extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; +extern GridLogger GridLogHMC; +extern GridLogger GridLogMemory; +extern GridLogger GridLogTracing; extern Colours GridLogColours; std::string demangle(const char* name) ; diff --git a/Grid/perfmon/PerfCount.cc b/Grid/perfmon/PerfCount.cc index 2062bb59..114c36a0 100644 --- a/Grid/perfmon/PerfCount.cc +++ b/Grid/perfmon/PerfCount.cc @@ -27,10 +27,13 @@ Author: paboyle /* END LEGAL */ #include -#include +#include +#include NAMESPACE_BEGIN(Grid); +GridTimePoint theProgramStart = GridClock::now(); + #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define RawConfig(A,B) (A<<8|B) const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = { diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h index dd25b41e..62b2a740 100644 --- a/Grid/perfmon/PerfCount.h +++ b/Grid/perfmon/PerfCount.h @@ -30,6 +30,12 @@ Author: paboyle #ifndef GRID_PERFCOUNT_H #define GRID_PERFCOUNT_H + +#ifndef __SSC_START +#define __SSC_START +#define __SSC_STOP +#endif + #include #include #include @@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, inline uint64_t cyclecount(void){ return 0; } -#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx") -#define __SSC_STOP __SSC_MARK(0x110) -#define __SSC_START __SSC_MARK(0x111) - #else -#define __SSC_MARK(mark) -#define __SSC_STOP -#define __SSC_START - /* * cycle counters arch dependent */ diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h index 2a44faee..ba5df85a 100644 --- a/Grid/perfmon/Timer.h +++ b/Grid/perfmon/Timer.h @@ -35,17 +35,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid) -// Dress the output; use std::chrono -// C++11 time facilities better? -inline double usecond(void) { - struct timeval tv; -#ifdef TIMERS_ON - gettimeofday(&tv,NULL); -#endif - return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; -} - -typedef std::chrono::system_clock GridClock; +//typedef std::chrono::system_clock GridClock; +typedef std::chrono::high_resolution_clock GridClock; typedef std::chrono::time_point GridTimePoint; typedef std::chrono::seconds GridSecs; @@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs; typedef std::chrono::microseconds GridUsecs; typedef std::chrono::microseconds GridTime; +extern GridTimePoint theProgramStart; +// Dress the output; use std::chrono +// C++11 time facilities better? +inline double usecond(void) { + auto usecs = std::chrono::duration_cast(GridClock::now()-theProgramStart); + return 1.0*usecs.count(); +} + + inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time) { stream << time.count()<<" s"; diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h new file mode 100644 index 00000000..5000cef4 --- /dev/null +++ b/Grid/perfmon/Tracing.h @@ -0,0 +1,70 @@ +#pragma once + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_TRACING_NVTX +#include +class GridTracer { +public: + GridTracer(const char* name) { + nvtxRangePushA(name); + } + ~GridTracer() { + nvtxRangePop(); + } +}; +inline void tracePush(const char *name) { nvtxRangePushA(name); } +inline void tracePop(const char *name) { nvtxRangePop(); } +inline int traceStart(const char *name) { } +inline void traceStop(int ID) { } +#endif + +#ifdef GRID_TRACING_ROCTX +#include +class GridTracer { + public: + GridTracer(const char* name) { + roctxRangePushA(name); + std::cout << "roctxRangePush "< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, + Gamma::Algebra::Gamma5 +}; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding + GridParallelRNG pRNG(&GRID); + pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeFieldD Umu(&GRID); + SU::ColdConfiguration(pRNG,Umu); // Unit gauge + + //////////////////////////////////////////////////// + // Wilson test + //////////////////////////////////////////////////// + { + LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD src_p(&GRID); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + LatticeFermionD result(&GRID); + + RealD mass=0.1; + WilsonFermionD Dw(Umu,GRID,RBGRID,mass); + + Dw.M(src,ref); + std::cout << "Norm src "< 1/2 gmu (eip - emip) = i sinp gmu + Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p); + + } + + W = mass + sk2; + Kinetic = Kinetic + W * src_p; + + std::cout<<"Momentum space src "<< norm2(src_p)< HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" <::RandomGaugeTransform(pRNG,U_GT,g); // Unit gauge + + LatticeFermionD src(&GRID); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + LatticeFermionD diff(&GRID); + + // could loop over colors + src=Zero(); + Coordinate point(4,0); // 0,0,0,0 + SpinColourVectorD ferm; + ferm=Zero(); + ferm()(0)(0) = ComplexD(1.0); + pokeSite(ferm,src,point); + + RealD mass=0.1; + WilsonFermionD Dw(U_GT,GRID,RBGRID,mass); + + // Momentum space prop + std::cout << " Solving by FFT and Feynman rules" < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" < + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +void MemoryTest(GridCartesian * FGrid,int N); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + + int N=100; + for(int i=0;i A(N,zero);//FGrid); + + std::vector B(N,ComplexD(0.0)); // Update sequentially on host + + for(int v=0;voSites(),1,{ + A_v[ss] = A_v[ss] + zc; + }); + } else { + autoView(A_v,A[v],CpuWrite); + thread_for(ss,FGrid->oSites(),{ + A_v[ss] = A_v[ss] + zc; + }); + } + } + } else { + if ( e == 0 ) { + A[v] = A[v] + A[v] - A[v]; + } else { + if ( dev ) { + autoView(A_v,A[v],AcceleratorRead); + accelerator_for(ss,FGrid->oSites(),1,{ + assert(B[v]==A_v[ss]()()().getlane(0)); + }); + // std::cout << "["<oSites(),{ + assert(B[v]==A_v[ss]()()().getlane(0)); + }); + // std::cout << "["<