From 281f8101fe8c679bfd52d149d53a669fdbd92c0d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 17 Dec 2022 20:35:33 -0500 Subject: [PATCH 01/11] Matt FFT test --- tests/core/Test_fft_matt.cc | 160 ++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 tests/core/Test_fft_matt.cc diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc new file mode 100644 index 00000000..d4455a7e --- /dev/null +++ b/tests/core/Test_fft_matt.cc @@ -0,0 +1,160 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding + GridParallelRNG pRNG(&GRID); + pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeFieldD Umu(&GRID); + + SU::ColdConfiguration(pRNG,Umu); // Unit gauge + + //////////////////////////////////////////////////// + // Wilson test + //////////////////////////////////////////////////// + { + LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + + RealD mass=0.01; + WilsonFermionD Dw(Umu,GRID,RBGRID,mass); + + Dw.M(src,tmp); + + std::cout << "Dw src = " < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" < Date: Sun, 18 Dec 2022 12:05:00 -0500 Subject: [PATCH 02/11] Updated FFT test for PETSc --- Grid/lattice/Lattice.h | 1 + Grid/lattice/Lattice_crc.h | 55 +++++++++++++++++++++ tests/core/Test_fft_matt.cc | 95 ++++++++++++++++++++++++++++--------- 3 files changed, 129 insertions(+), 22 deletions(-) create mode 100644 Grid/lattice/Lattice_crc.h diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 9f5f1da7..c4adf86a 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -46,3 +46,4 @@ Author: Peter Boyle #include #include #include +#include diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h new file mode 100644 index 00000000..142e2349 --- /dev/null +++ b/Grid/lattice/Lattice_crc.h @@ -0,0 +1,55 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/lattice/Lattice_crc.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1) +{ + auto ff = localNorm2(f); + if ( mu==-1 ) mu = f.Grid()->Nd()-1; + typedef typename vobj::tensor_reduced normtype; + typedef typename normtype::scalar_object scalar; + std::vector sff; + sliceSum(ff,sff,mu); + for(int t=0;t uint32_t crc(Lattice & buf) +{ + autoView( buf_v , buf, CpuRead); + return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); +} + +#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< #include using namespace Grid; - ; + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, + Gamma::Algebra::Gamma5 +}; int main (int argc, char ** argv) { @@ -49,22 +55,7 @@ int main (int argc, char ** argv) GridCartesian GRID(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian RBGRID(&GRID); - LatticeComplexD one(&GRID); - LatticeComplexD zz(&GRID); - LatticeComplexD C(&GRID); - LatticeComplexD Ctilde(&GRID); - LatticeComplexD Cref (&GRID); - LatticeComplexD Csav (&GRID); LatticeComplexD coor(&GRID); - - LatticeSpinMatrixD S(&GRID); - LatticeSpinMatrixD Stilde(&GRID); - - Coordinate p({1,3,2,3}); - - one = ComplexD(1.0,0.0); - zz = ComplexD(0.0,0.0); - ComplexD ci(0.0,1.0); std::vector seeds({1,2,3,4}); @@ -73,7 +64,6 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(seeds); LatticeGaugeFieldD Umu(&GRID); - SU::ColdConfiguration(pRNG,Umu); // Unit gauge //////////////////////////////////////////////////// @@ -81,17 +71,78 @@ int main (int argc, char ** argv) //////////////////////////////////////////////////// { LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD src_p(&GRID); LatticeFermionD tmp(&GRID); LatticeFermionD ref(&GRID); + LatticeFermionD result(&GRID); - RealD mass=0.01; + RealD mass=0.1; WilsonFermionD Dw(Umu,GRID,RBGRID,mass); - Dw.M(src,tmp); + Dw.M(src,ref); + std::cout << "Norm src "< 1/2 gmu (eip - emip) = i sinp gmu + Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p); + + } + + W = mass + sk2; + Kinetic = Kinetic + W * src_p; + + std::cout<<"Momentum space src "<< norm2(src_p)< %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -165,7 +172,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -191,6 +198,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -202,6 +210,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -241,11 +250,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, - (uint64_t)bytes); + (uint64_t)bytes, + (uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -280,6 +290,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -292,21 +303,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } @@ -314,6 +325,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // If view is opened on device remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -334,10 +346,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -473,6 +487,29 @@ int MemoryManager::isOpen (void* _CpuPtr) return 0; } } +void MemoryManager::Audit(std::string s) +{ + for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){ + auto &AccCache = it->second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + if ( AccCache.cpuLock || AccCache.accLock ) { + std::cout << GridLogError << s<< "\n\t 0x"< &logstreams) { - GridLogError.Active(0); + GridLogError.Active(1); GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on + GridLogMemory.Active(0); + GridLogTracing.Active(0); GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); + GridLogDslash.Active(0); GridLogIntegrator.Active(1); GridLogColours.Active(0); + GridLogHMC.Active(1); for (int i = 0; i < logstreams.size(); i++) { - if (logstreams[i] == std::string("Error")) GridLogError.Active(1); + if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1); + if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); - if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); + if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); + if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); + if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); } } diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 68693647..2d663a3c 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -138,7 +138,8 @@ public: stream << std::setw(log.topWidth); } stream << log.topName << log.background()<< " : "; - stream << log.colour() << std::left; + // stream << log.colour() << std::left; + stream << std::left; if (log.chanWidth > 0) { stream << std::setw(log.chanWidth); @@ -153,9 +154,9 @@ public: stream << log.evidence() << now << log.background() << " : " ; } - stream << log.colour(); + // stream << log.colour(); + stream << std::right; stream.flags(f); - return stream; } else { return devnull; @@ -180,8 +181,12 @@ extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; extern GridLogger GridLogDebug ; extern GridLogger GridLogPerformance; +extern GridLogger GridLogDslash; extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; +extern GridLogger GridLogHMC; +extern GridLogger GridLogMemory; +extern GridLogger GridLogTracing; extern Colours GridLogColours; std::string demangle(const char* name) ; From b00a4142e5f919847285caeac13187fe3f6e10b2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:18:11 -0500 Subject: [PATCH 04/11] A=A fix --- Grid/lattice/Lattice_base.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 9c3d723f..34f13fa6 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -129,7 +129,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -152,7 +152,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -174,7 +174,7 @@ public: this->checkerboard=cb; auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -245,7 +245,7 @@ public: /////////////////////////////////////////// // user defined constructor /////////////////////////////////////////// - Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); @@ -288,7 +288,7 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); @@ -303,7 +303,7 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); From 43a45ec97bf0212844dc2bc8df9774f534bc29ba Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:18:43 -0500 Subject: [PATCH 05/11] SSC_START --- Grid/perfmon/PerfCount.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h index dd25b41e..62b2a740 100644 --- a/Grid/perfmon/PerfCount.h +++ b/Grid/perfmon/PerfCount.h @@ -30,6 +30,12 @@ Author: paboyle #ifndef GRID_PERFCOUNT_H #define GRID_PERFCOUNT_H + +#ifndef __SSC_START +#define __SSC_START +#define __SSC_STOP +#endif + #include #include #include @@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, inline uint64_t cyclecount(void){ return 0; } -#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx") -#define __SSC_STOP __SSC_MARK(0x110) -#define __SSC_START __SSC_MARK(0x111) - #else -#define __SSC_MARK(mark) -#define __SSC_STOP -#define __SSC_START - /* * cycle counters arch dependent */ From 99b3697b031b201a2dbe11c1942e672bfc846f05 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:19:33 -0500 Subject: [PATCH 06/11] More loggin --- Grid/allocator/MemoryManager.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index ad2d9ffc..c22a54f3 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -97,8 +97,8 @@ private: static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ; - static void PrintBytes(void); public: + static void PrintBytes(void); static void Audit(std::string s); static void Init(void); static void InitMessage(void); @@ -119,6 +119,8 @@ private: static uint64_t DeviceToHostBytes; static uint64_t HostToDeviceXfer; static uint64_t DeviceToHostXfer; + static uint64_t DeviceEvictions; + static uint64_t DeviceDestroy; private: #ifndef GRID_UVM @@ -176,6 +178,7 @@ private: public: static void Print(void); + static void PrintAll(void); static void PrintState( void* CpuPtr); static int isOpen (void* CpuPtr); static void ViewClose(void* CpuPtr,ViewMode mode); From 37ba32776f410af38b43ddceff009be1c0cfcc62 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:19:42 -0500 Subject: [PATCH 07/11] More logging --- Grid/allocator/MemoryManagerShared.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..ba95420e 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -12,8 +12,10 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} @@ -22,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr) std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl; }; void MemoryManager::Print(void){}; +void MemoryManager::PrintAll(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; NAMESPACE_END(Grid); From 1822ced302f08435035921438cb414c364a98898 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:24:08 -0500 Subject: [PATCH 08/11] Bug fix --- Grid/allocator/MemoryManagerCache.cc | 116 ++++++++++++++++++++------- 1 file changed, 86 insertions(+), 30 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index f03ee79f..3420c9cc 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -28,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; //////////////////////////////////// // Priority ordering for unlocked entries @@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceDestroy++; DeviceBytes -=AccCache.bytes; LRUremove(AccCache); + AccCache.AccPtr=(uint64_t) NULL; dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; @@ -126,28 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::Evict(AcceleratorViewEntry &AccCache) { /////////////////////////////////////////////////////////////////////////// - // Make CPU consistent, remove from Accelerator, remove entry - // Cannot be locked. If allocated must be in LRU pool. + // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry + // Cannot be acclocked. If allocated must be in LRU pool. + // + // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock. + // and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen + // but there is a weakness where CpuLock entries are attempted for erase + // Take these OUT LRU queue when CPU locked? + // Cannot take out the table as cpuLock data is important. /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); - if (AccCache.accLock!=0) return; - if (AccCache.cpuLock!=0) return; + assert(AccCache.accLock==0); // Cannot evict so logic bomb + assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.state==AccDirty) { Flush(AccCache); } - assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); - DeviceBytes -=AccCache.bytes; LRUremove(AccCache); + AccCache.AccPtr=(uint64_t)NULL; + AccCache.state=CpuDirty; // CPU primary now + DeviceBytes -=AccCache.bytes; dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } - uint64_t CpuPtr = AccCache.CpuPtr; - EntryErase(CpuPtr); + // uint64_t CpuPtr = AccCache.CpuPtr; + DeviceEvictions++; + // EntryErase(CpuPtr); } void MemoryManager::Flush(AcceleratorViewEntry &AccCache) { @@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + assert(bytes DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + uint64_t victim = LRU.back(); // From the LRU auto AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU dprintf("AccCache entry removed from LRU \n"); @@ -389,7 +405,7 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto & AccCache = AccCacheIterator->second; if (!AccCache.AccPtr) { - EvictVictims(bytes); + EvictVictims(bytes); } assert((mode==CpuRead)||(mode==CpuWrite)); @@ -444,20 +460,28 @@ void MemoryManager::NotifyDeletion(void *_ptr) void MemoryManager::Print(void) { PrintBytes(); - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -467,13 +491,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; @@ -498,7 +540,13 @@ void MemoryManager::Audit(std::string s) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - if ( AccCache.cpuLock || AccCache.accLock ) { + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); std::cout << GridLogError << s<< "\n\t 0x"< Date: Thu, 1 Dec 2022 00:25:04 -0500 Subject: [PATCH 09/11] Memory manager debug Felix case --- Grid/perfmon/PerfCount.cc | 5 +- Grid/perfmon/Timer.h | 22 +++--- Grid/perfmon/Tracing.h | 70 ++++++++++++++++++ systems/mac-arm/config-command-mpi | 2 +- tests/core/Test_memory_manager.cc | 110 +++++++++++++++++++++++++++++ 5 files changed, 196 insertions(+), 13 deletions(-) create mode 100644 Grid/perfmon/Tracing.h create mode 100644 tests/core/Test_memory_manager.cc diff --git a/Grid/perfmon/PerfCount.cc b/Grid/perfmon/PerfCount.cc index 2062bb59..114c36a0 100644 --- a/Grid/perfmon/PerfCount.cc +++ b/Grid/perfmon/PerfCount.cc @@ -27,10 +27,13 @@ Author: paboyle /* END LEGAL */ #include -#include +#include +#include NAMESPACE_BEGIN(Grid); +GridTimePoint theProgramStart = GridClock::now(); + #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define RawConfig(A,B) (A<<8|B) const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = { diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h index 2a44faee..ba5df85a 100644 --- a/Grid/perfmon/Timer.h +++ b/Grid/perfmon/Timer.h @@ -35,17 +35,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid) -// Dress the output; use std::chrono -// C++11 time facilities better? -inline double usecond(void) { - struct timeval tv; -#ifdef TIMERS_ON - gettimeofday(&tv,NULL); -#endif - return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; -} - -typedef std::chrono::system_clock GridClock; +//typedef std::chrono::system_clock GridClock; +typedef std::chrono::high_resolution_clock GridClock; typedef std::chrono::time_point GridTimePoint; typedef std::chrono::seconds GridSecs; @@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs; typedef std::chrono::microseconds GridUsecs; typedef std::chrono::microseconds GridTime; +extern GridTimePoint theProgramStart; +// Dress the output; use std::chrono +// C++11 time facilities better? +inline double usecond(void) { + auto usecs = std::chrono::duration_cast(GridClock::now()-theProgramStart); + return 1.0*usecs.count(); +} + + inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time) { stream << time.count()<<" s"; diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h new file mode 100644 index 00000000..5000cef4 --- /dev/null +++ b/Grid/perfmon/Tracing.h @@ -0,0 +1,70 @@ +#pragma once + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_TRACING_NVTX +#include +class GridTracer { +public: + GridTracer(const char* name) { + nvtxRangePushA(name); + } + ~GridTracer() { + nvtxRangePop(); + } +}; +inline void tracePush(const char *name) { nvtxRangePushA(name); } +inline void tracePop(const char *name) { nvtxRangePop(); } +inline int traceStart(const char *name) { } +inline void traceStop(int ID) { } +#endif + +#ifdef GRID_TRACING_ROCTX +#include +class GridTracer { + public: + GridTracer(const char* name) { + roctxRangePushA(name); + std::cout << "roctxRangePush "< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +void MemoryTest(GridCartesian * FGrid,int N); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + + int N=100; + for(int i=0;i A(N,zero);//FGrid); + + std::vector B(N,ComplexD(0.0)); // Update sequentially on host + + for(int v=0;voSites(),1,{ + A_v[ss] = A_v[ss] + zc; + }); + } else { + autoView(A_v,A[v],CpuWrite); + thread_for(ss,FGrid->oSites(),{ + A_v[ss] = A_v[ss] + zc; + }); + } + } + } else { + if ( e == 0 ) { + A[v] = A[v] + A[v] - A[v]; + } else { + if ( dev ) { + autoView(A_v,A[v],AcceleratorRead); + accelerator_for(ss,FGrid->oSites(),1,{ + assert(B[v]==A_v[ss]()()().getlane(0)); + }); + // std::cout << "["<oSites(),{ + assert(B[v]==A_v[ss]()()().getlane(0)); + }); + // std::cout << "["< Date: Thu, 1 Dec 2022 00:35:05 -0500 Subject: [PATCH 10/11] CPU open doesn't need to free space --- Grid/allocator/MemoryManagerCache.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 3420c9cc..bae184ec 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -404,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - if (!AccCache.AccPtr) { - EvictVictims(bytes); - } + // CPU doesn't need to free space + // if (!AccCache.AccPtr) { + // EvictVictims(bytes); + // } assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error From 4ca1bf7ccaaa471a51994564adb5e445bc62afd5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 21 Dec 2022 07:23:16 -0500 Subject: [PATCH 11/11] Added gauge invariance test --- tests/core/Test_fft_matt.cc | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc index d92fa94e..55234601 100644 --- a/tests/core/Test_fft_matt.cc +++ b/tests/core/Test_fft_matt.cc @@ -206,6 +206,65 @@ int main (int argc, char ** argv) DumpSliceNorm("Slice Norm Solution ",result,Nd-1); } + //////////////////////////////////////////////////// + //Gauge invariance test + //////////////////////////////////////////////////// + { + std::cout<<"****************************************"<::RandomGaugeTransform(pRNG,U_GT,g); // Unit gauge + + LatticeFermionD src(&GRID); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + LatticeFermionD diff(&GRID); + + // could loop over colors + src=Zero(); + Coordinate point(4,0); // 0,0,0,0 + SpinColourVectorD ferm; + ferm=Zero(); + ferm()(0)(0) = ComplexD(1.0); + pokeSite(ferm,src,point); + + RealD mass=0.1; + WilsonFermionD Dw(U_GT,GRID,RBGRID,mass); + + // Momentum space prop + std::cout << " Solving by FFT and Feynman rules" < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" <