From 281f8101fe8c679bfd52d149d53a669fdbd92c0d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 17 Dec 2022 20:35:33 -0500 Subject: [PATCH 01/20] Matt FFT test --- tests/core/Test_fft_matt.cc | 160 ++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 tests/core/Test_fft_matt.cc diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc new file mode 100644 index 00000000..d4455a7e --- /dev/null +++ b/tests/core/Test_fft_matt.cc @@ -0,0 +1,160 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding + GridParallelRNG pRNG(&GRID); + pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeFieldD Umu(&GRID); + + SU::ColdConfiguration(pRNG,Umu); // Unit gauge + + //////////////////////////////////////////////////// + // Wilson test + //////////////////////////////////////////////////// + { + LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + + RealD mass=0.01; + WilsonFermionD Dw(Umu,GRID,RBGRID,mass); + + Dw.M(src,tmp); + + std::cout << "Dw src = " < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" < Date: Sun, 18 Dec 2022 12:05:00 -0500 Subject: [PATCH 02/20] Updated FFT test for PETSc --- Grid/lattice/Lattice.h | 1 + Grid/lattice/Lattice_crc.h | 55 +++++++++++++++++++++ tests/core/Test_fft_matt.cc | 95 ++++++++++++++++++++++++++++--------- 3 files changed, 129 insertions(+), 22 deletions(-) create mode 100644 Grid/lattice/Lattice_crc.h diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 9f5f1da7..c4adf86a 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -46,3 +46,4 @@ Author: Peter Boyle #include #include #include +#include diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h new file mode 100644 index 00000000..142e2349 --- /dev/null +++ b/Grid/lattice/Lattice_crc.h @@ -0,0 +1,55 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/lattice/Lattice_crc.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1) +{ + auto ff = localNorm2(f); + if ( mu==-1 ) mu = f.Grid()->Nd()-1; + typedef typename vobj::tensor_reduced normtype; + typedef typename normtype::scalar_object scalar; + std::vector sff; + sliceSum(ff,sff,mu); + for(int t=0;t uint32_t crc(Lattice & buf) +{ + autoView( buf_v , buf, CpuRead); + return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); +} + +#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< #include using namespace Grid; - ; + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, + Gamma::Algebra::Gamma5 +}; int main (int argc, char ** argv) { @@ -49,22 +55,7 @@ int main (int argc, char ** argv) GridCartesian GRID(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian RBGRID(&GRID); - LatticeComplexD one(&GRID); - LatticeComplexD zz(&GRID); - LatticeComplexD C(&GRID); - LatticeComplexD Ctilde(&GRID); - LatticeComplexD Cref (&GRID); - LatticeComplexD Csav (&GRID); LatticeComplexD coor(&GRID); - - LatticeSpinMatrixD S(&GRID); - LatticeSpinMatrixD Stilde(&GRID); - - Coordinate p({1,3,2,3}); - - one = ComplexD(1.0,0.0); - zz = ComplexD(0.0,0.0); - ComplexD ci(0.0,1.0); std::vector seeds({1,2,3,4}); @@ -73,7 +64,6 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(seeds); LatticeGaugeFieldD Umu(&GRID); - SU::ColdConfiguration(pRNG,Umu); // Unit gauge //////////////////////////////////////////////////// @@ -81,17 +71,78 @@ int main (int argc, char ** argv) //////////////////////////////////////////////////// { LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD src_p(&GRID); LatticeFermionD tmp(&GRID); LatticeFermionD ref(&GRID); + LatticeFermionD result(&GRID); - RealD mass=0.01; + RealD mass=0.1; WilsonFermionD Dw(Umu,GRID,RBGRID,mass); - Dw.M(src,tmp); + Dw.M(src,ref); + std::cout << "Norm src "< 1/2 gmu (eip - emip) = i sinp gmu + Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p); + + } + + W = mass + sk2; + Kinetic = Kinetic + W * src_p; + + std::cout<<"Momentum space src "<< norm2(src_p)< %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -165,7 +172,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -191,6 +198,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -202,6 +210,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -241,11 +250,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, - (uint64_t)bytes); + (uint64_t)bytes, + (uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -280,6 +290,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -292,21 +303,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } @@ -314,6 +325,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // If view is opened on device remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -334,10 +346,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -473,6 +487,29 @@ int MemoryManager::isOpen (void* _CpuPtr) return 0; } } +void MemoryManager::Audit(std::string s) +{ + for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){ + auto &AccCache = it->second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + if ( AccCache.cpuLock || AccCache.accLock ) { + std::cout << GridLogError << s<< "\n\t 0x"< &logstreams) { - GridLogError.Active(0); + GridLogError.Active(1); GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on + GridLogMemory.Active(0); + GridLogTracing.Active(0); GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); + GridLogDslash.Active(0); GridLogIntegrator.Active(1); GridLogColours.Active(0); + GridLogHMC.Active(1); for (int i = 0; i < logstreams.size(); i++) { - if (logstreams[i] == std::string("Error")) GridLogError.Active(1); + if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1); + if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); - if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); + if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); + if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); + if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); } } diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 68693647..2d663a3c 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -138,7 +138,8 @@ public: stream << std::setw(log.topWidth); } stream << log.topName << log.background()<< " : "; - stream << log.colour() << std::left; + // stream << log.colour() << std::left; + stream << std::left; if (log.chanWidth > 0) { stream << std::setw(log.chanWidth); @@ -153,9 +154,9 @@ public: stream << log.evidence() << now << log.background() << " : " ; } - stream << log.colour(); + // stream << log.colour(); + stream << std::right; stream.flags(f); - return stream; } else { return devnull; @@ -180,8 +181,12 @@ extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; extern GridLogger GridLogDebug ; extern GridLogger GridLogPerformance; +extern GridLogger GridLogDslash; extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; +extern GridLogger GridLogHMC; +extern GridLogger GridLogMemory; +extern GridLogger GridLogTracing; extern Colours GridLogColours; std::string demangle(const char* name) ; From b00a4142e5f919847285caeac13187fe3f6e10b2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:18:11 -0500 Subject: [PATCH 04/20] A=A fix --- Grid/lattice/Lattice_base.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 9c3d723f..34f13fa6 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -129,7 +129,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -152,7 +152,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -174,7 +174,7 @@ public: this->checkerboard=cb; auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -245,7 +245,7 @@ public: /////////////////////////////////////////// // user defined constructor /////////////////////////////////////////// - Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); @@ -288,7 +288,7 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); @@ -303,7 +303,7 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(AcceleratorWriteDiscard); + auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); From 43a45ec97bf0212844dc2bc8df9774f534bc29ba Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:18:43 -0500 Subject: [PATCH 05/20] SSC_START --- Grid/perfmon/PerfCount.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h index dd25b41e..62b2a740 100644 --- a/Grid/perfmon/PerfCount.h +++ b/Grid/perfmon/PerfCount.h @@ -30,6 +30,12 @@ Author: paboyle #ifndef GRID_PERFCOUNT_H #define GRID_PERFCOUNT_H + +#ifndef __SSC_START +#define __SSC_START +#define __SSC_STOP +#endif + #include #include #include @@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, inline uint64_t cyclecount(void){ return 0; } -#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx") -#define __SSC_STOP __SSC_MARK(0x110) -#define __SSC_START __SSC_MARK(0x111) - #else -#define __SSC_MARK(mark) -#define __SSC_STOP -#define __SSC_START - /* * cycle counters arch dependent */ From 99b3697b031b201a2dbe11c1942e672bfc846f05 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:19:33 -0500 Subject: [PATCH 06/20] More loggin --- Grid/allocator/MemoryManager.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index ad2d9ffc..c22a54f3 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -97,8 +97,8 @@ private: static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ; - static void PrintBytes(void); public: + static void PrintBytes(void); static void Audit(std::string s); static void Init(void); static void InitMessage(void); @@ -119,6 +119,8 @@ private: static uint64_t DeviceToHostBytes; static uint64_t HostToDeviceXfer; static uint64_t DeviceToHostXfer; + static uint64_t DeviceEvictions; + static uint64_t DeviceDestroy; private: #ifndef GRID_UVM @@ -176,6 +178,7 @@ private: public: static void Print(void); + static void PrintAll(void); static void PrintState( void* CpuPtr); static int isOpen (void* CpuPtr); static void ViewClose(void* CpuPtr,ViewMode mode); From 37ba32776f410af38b43ddceff009be1c0cfcc62 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:19:42 -0500 Subject: [PATCH 07/20] More logging --- Grid/allocator/MemoryManagerShared.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..ba95420e 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -12,8 +12,10 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} @@ -22,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr) std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl; }; void MemoryManager::Print(void){}; +void MemoryManager::PrintAll(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; NAMESPACE_END(Grid); From 1822ced302f08435035921438cb414c364a98898 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Dec 2022 00:24:08 -0500 Subject: [PATCH 08/20] Bug fix --- Grid/allocator/MemoryManagerCache.cc | 116 ++++++++++++++++++++------- 1 file changed, 86 insertions(+), 30 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index f03ee79f..3420c9cc 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -28,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; //////////////////////////////////// // Priority ordering for unlocked entries @@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceDestroy++; DeviceBytes -=AccCache.bytes; LRUremove(AccCache); + AccCache.AccPtr=(uint64_t) NULL; dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; @@ -126,28 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::Evict(AcceleratorViewEntry &AccCache) { /////////////////////////////////////////////////////////////////////////// - // Make CPU consistent, remove from Accelerator, remove entry - // Cannot be locked. If allocated must be in LRU pool. + // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry + // Cannot be acclocked. If allocated must be in LRU pool. + // + // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock. + // and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen + // but there is a weakness where CpuLock entries are attempted for erase + // Take these OUT LRU queue when CPU locked? + // Cannot take out the table as cpuLock data is important. /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); - if (AccCache.accLock!=0) return; - if (AccCache.cpuLock!=0) return; + assert(AccCache.accLock==0); // Cannot evict so logic bomb + assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.state==AccDirty) { Flush(AccCache); } - assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); - DeviceBytes -=AccCache.bytes; LRUremove(AccCache); + AccCache.AccPtr=(uint64_t)NULL; + AccCache.state=CpuDirty; // CPU primary now + DeviceBytes -=AccCache.bytes; dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } - uint64_t CpuPtr = AccCache.CpuPtr; - EntryErase(CpuPtr); + // uint64_t CpuPtr = AccCache.CpuPtr; + DeviceEvictions++; + // EntryErase(CpuPtr); } void MemoryManager::Flush(AcceleratorViewEntry &AccCache) { @@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + assert(bytes DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + uint64_t victim = LRU.back(); // From the LRU auto AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU dprintf("AccCache entry removed from LRU \n"); @@ -389,7 +405,7 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto & AccCache = AccCacheIterator->second; if (!AccCache.AccPtr) { - EvictVictims(bytes); + EvictVictims(bytes); } assert((mode==CpuRead)||(mode==CpuWrite)); @@ -444,20 +460,28 @@ void MemoryManager::NotifyDeletion(void *_ptr) void MemoryManager::Print(void) { PrintBytes(); - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -467,13 +491,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; @@ -498,7 +540,13 @@ void MemoryManager::Audit(std::string s) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - if ( AccCache.cpuLock || AccCache.accLock ) { + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); std::cout << GridLogError << s<< "\n\t 0x"< Date: Thu, 1 Dec 2022 00:25:04 -0500 Subject: [PATCH 09/20] Memory manager debug Felix case --- Grid/perfmon/PerfCount.cc | 5 +- Grid/perfmon/Timer.h | 22 +++--- Grid/perfmon/Tracing.h | 70 ++++++++++++++++++ systems/mac-arm/config-command-mpi | 2 +- tests/core/Test_memory_manager.cc | 110 +++++++++++++++++++++++++++++ 5 files changed, 196 insertions(+), 13 deletions(-) create mode 100644 Grid/perfmon/Tracing.h create mode 100644 tests/core/Test_memory_manager.cc diff --git a/Grid/perfmon/PerfCount.cc b/Grid/perfmon/PerfCount.cc index 2062bb59..114c36a0 100644 --- a/Grid/perfmon/PerfCount.cc +++ b/Grid/perfmon/PerfCount.cc @@ -27,10 +27,13 @@ Author: paboyle /* END LEGAL */ #include -#include +#include +#include NAMESPACE_BEGIN(Grid); +GridTimePoint theProgramStart = GridClock::now(); + #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define RawConfig(A,B) (A<<8|B) const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = { diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h index 2a44faee..ba5df85a 100644 --- a/Grid/perfmon/Timer.h +++ b/Grid/perfmon/Timer.h @@ -35,17 +35,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid) -// Dress the output; use std::chrono -// C++11 time facilities better? -inline double usecond(void) { - struct timeval tv; -#ifdef TIMERS_ON - gettimeofday(&tv,NULL); -#endif - return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; -} - -typedef std::chrono::system_clock GridClock; +//typedef std::chrono::system_clock GridClock; +typedef std::chrono::high_resolution_clock GridClock; typedef std::chrono::time_point GridTimePoint; typedef std::chrono::seconds GridSecs; @@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs; typedef std::chrono::microseconds GridUsecs; typedef std::chrono::microseconds GridTime; +extern GridTimePoint theProgramStart; +// Dress the output; use std::chrono +// C++11 time facilities better? +inline double usecond(void) { + auto usecs = std::chrono::duration_cast(GridClock::now()-theProgramStart); + return 1.0*usecs.count(); +} + + inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time) { stream << time.count()<<" s"; diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h new file mode 100644 index 00000000..5000cef4 --- /dev/null +++ b/Grid/perfmon/Tracing.h @@ -0,0 +1,70 @@ +#pragma once + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_TRACING_NVTX +#include +class GridTracer { +public: + GridTracer(const char* name) { + nvtxRangePushA(name); + } + ~GridTracer() { + nvtxRangePop(); + } +}; +inline void tracePush(const char *name) { nvtxRangePushA(name); } +inline void tracePop(const char *name) { nvtxRangePop(); } +inline int traceStart(const char *name) { } +inline void traceStop(int ID) { } +#endif + +#ifdef GRID_TRACING_ROCTX +#include +class GridTracer { + public: + GridTracer(const char* name) { + roctxRangePushA(name); + std::cout << "roctxRangePush "< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +void MemoryTest(GridCartesian * FGrid,int N); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + + int N=100; + for(int i=0;i A(N,zero);//FGrid); + + std::vector B(N,ComplexD(0.0)); // Update sequentially on host + + for(int v=0;voSites(),1,{ + A_v[ss] = A_v[ss] + zc; + }); + } else { + autoView(A_v,A[v],CpuWrite); + thread_for(ss,FGrid->oSites(),{ + A_v[ss] = A_v[ss] + zc; + }); + } + } + } else { + if ( e == 0 ) { + A[v] = A[v] + A[v] - A[v]; + } else { + if ( dev ) { + autoView(A_v,A[v],AcceleratorRead); + accelerator_for(ss,FGrid->oSites(),1,{ + assert(B[v]==A_v[ss]()()().getlane(0)); + }); + // std::cout << "["<oSites(),{ + assert(B[v]==A_v[ss]()()().getlane(0)); + }); + // std::cout << "["< Date: Thu, 1 Dec 2022 00:35:05 -0500 Subject: [PATCH 10/20] CPU open doesn't need to free space --- Grid/allocator/MemoryManagerCache.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 3420c9cc..bae184ec 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -404,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - if (!AccCache.AccPtr) { - EvictVictims(bytes); - } + // CPU doesn't need to free space + // if (!AccCache.AccPtr) { + // EvictVictims(bytes); + // } assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error From 4ca1bf7ccaaa471a51994564adb5e445bc62afd5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 21 Dec 2022 07:23:16 -0500 Subject: [PATCH 11/20] Added gauge invariance test --- tests/core/Test_fft_matt.cc | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc index d92fa94e..55234601 100644 --- a/tests/core/Test_fft_matt.cc +++ b/tests/core/Test_fft_matt.cc @@ -206,6 +206,65 @@ int main (int argc, char ** argv) DumpSliceNorm("Slice Norm Solution ",result,Nd-1); } + //////////////////////////////////////////////////// + //Gauge invariance test + //////////////////////////////////////////////////// + { + std::cout<<"****************************************"<::RandomGaugeTransform(pRNG,U_GT,g); // Unit gauge + + LatticeFermionD src(&GRID); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + LatticeFermionD diff(&GRID); + + // could loop over colors + src=Zero(); + Coordinate point(4,0); // 0,0,0,0 + SpinColourVectorD ferm; + ferm=Zero(); + ferm()(0)(0) = ComplexD(1.0); + pokeSite(ferm,src,point); + + RealD mass=0.1; + WilsonFermionD Dw(U_GT,GRID,RBGRID,mass); + + // Momentum space prop + std::cout << " Solving by FFT and Feynman rules" < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" < Date: Thu, 12 Jan 2023 12:36:30 +0000 Subject: [PATCH 12/20] fix: diagnostic pragma warnings fixed for CUDA 12+ --- Grid/DisableWarnings.h | 2 +- Grid/Grid_Eigen_Dense.h | 2 +- Grid/pugixml/pugixml.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 64e4faf4..015e19d1 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -45,7 +45,7 @@ directory //disables nvcc specific warning in json.hpp #pragma clang diagnostic ignored "-Wdeprecated-register" -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ //disables nvcc specific warning in json.hpp #pragma nv_diag_suppress unsigned_compare_with_zero #pragma nv_diag_suppress cast_to_qualified_type diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 5aee81de..bdd39a65 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -14,7 +14,7 @@ /* NVCC save and restore compile environment*/ #ifdef __NVCC__ #pragma push -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #pragma nv_diag_suppress code_is_unreachable #else #pragma diag_suppress code_is_unreachable diff --git a/Grid/pugixml/pugixml.cc b/Grid/pugixml/pugixml.cc index 81973964..b2ba698b 100644 --- a/Grid/pugixml/pugixml.cc +++ b/Grid/pugixml/pugixml.cc @@ -16,7 +16,7 @@ #ifdef __NVCC__ #pragma push -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #else #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" From 1db58a8acce1cddb20be116b5916e4b359ae259f Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 21 Feb 2023 10:52:42 -0500 Subject: [PATCH 13/20] Precision change improvements Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation. In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes. Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces Renamed the original precisionChange as precisionChangeOrig Fixed incorrect pointer offset bug in copyLane Added a test and a benchmark for precisionChange Added a test for reliable update CG --- .../ConjugateGradientMultiShiftMixedPrec.h | 11 +- .../ConjugateGradientReliableUpdate.h | 32 ++- Grid/lattice/Lattice_transfer.h | 127 +++++++++++- Grid/tensors/Tensor_extract_merge.h | 6 +- benchmarks/Benchmark_prec_change.cc | 189 ++++++++++++++++++ tests/core/Test_prec_change.cc | 124 ++++++++++++ tests/solver/Test_dwf_relupcg_prec.cc | 143 +++++++++++++ 7 files changed, 616 insertions(+), 16 deletions(-) create mode 100644 benchmarks/Benchmark_prec_change.cc create mode 100644 tests/core/Test_prec_change.cc create mode 100644 tests/solver/Test_dwf_relupcg_prec.cc diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index de1cfe01..a89a1e4a 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -130,6 +130,9 @@ public: GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); GridBase *DoublePrecGrid = src_d.Grid(); + precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid); + precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid); + //////////////////////////////////////////////////////////////////////// // Convenience references to the info stored in "MultiShiftFunction" //////////////////////////////////////////////////////////////////////// @@ -200,10 +203,10 @@ public: r_d = p_d; //MdagM+m[0] - precisionChangeFast(p_f,p_d); + precisionChange(p_f, p_d, pc_wk_d_to_s); Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) - precisionChangeFast(tmp_d,mmp_f); + precisionChange(tmp_d, mmp_f, pc_wk_s_to_d); Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) tmp_d = tmp_d - mmp_d; std::cout << " Testing operators match "< &Linop_f; LinearOperatorBase &Linop_d; GridBase* SinglePrecGrid; - RealD Delta; //reliable update parameter + RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single LinearOperatorBase *Linop_fallback; @@ -65,7 +65,9 @@ public: ErrorOnNoConverge(err_on_no_conv), DoFinalCleanup(true), Linop_fallback(NULL) - {}; + { + assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); + }; void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ Linop_fallback = &_Linop_fallback; @@ -116,9 +118,12 @@ public: } //Single prec initialization + precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid()); + FieldF r_f(SinglePrecGrid); r_f.Checkerboard() = r.Checkerboard(); - precisionChange(r_f, r); + precisionChange(r_f, r, pc_wk_dp_to_sp); FieldF psi_f(r_f); psi_f = Zero(); @@ -134,7 +139,8 @@ public: GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; - + GridStopWatch PrecChangeTimer; + SolverTimer.Start(); int k = 0; int l = 0; @@ -173,7 +179,9 @@ public: // Stopping condition if (cp <= rsq) { //Although not written in the paper, I assume that I have to add on the final solution - precisionChange(mmp, psi_f); + PrecChangeTimer.Start(); + precisionChange(mmp, psi_f, pc_wk_sp_to_dp); + PrecChangeTimer.Stop(); psi = psi + mmp; @@ -194,7 +202,10 @@ public: std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < &in, Lattice &out) }); } +//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field) template void precisionChangeFast(Lattice &out, const Lattice &in) { @@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice &out, const Lattice &in) precisionChange(vout,vin,N); }); } -//Convert a Lattice from one precision to another +//Convert a Lattice from one precision to another (original, slow implementation) template -void precisionChange(Lattice &out, const Lattice &in) +void precisionChangeOrig(Lattice &out, const Lattice &in) { assert(out.Grid()->Nd() == in.Grid()->Nd()); for(int d=0;dNd();d++){ @@ -1145,6 +1146,128 @@ void precisionChange(Lattice &out, const Lattice &in) }); } +//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls +class precisionChangeWorkspace{ + std::pair* fmap_device; //device pointer + //maintain grids for checking + GridBase* _out_grid; + GridBase* _in_grid; +public: + precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){ + //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device + assert(out_grid->Nd() == in_grid->Nd()); + for(int d=0;dNd();d++){ + assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); + } + int Nsimd_out = out_grid->Nsimd(); + + std::vector out_icorrs(out_grid->Nsimd()); //reuse these + for(int lane=0; lane < out_grid->Nsimd(); lane++) + out_grid->iCoorFromIindex(out_icorrs[lane], lane); + + std::vector > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd + thread_for(out_oidx,out_grid->oSites(),{ + Coordinate out_ocorr; + out_grid->oCoorFromOindex(out_ocorr, out_oidx); + + Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate) + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr); + + //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr); + //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice + //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity + int in_oidx = 0, in_lane = 0; + for(int d=0;d_ndimension;d++){ + in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] ); + in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] ); + } + fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair( in_oidx, in_lane ); + } + }); + + //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines) + size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair); + fmap_device = (std::pair*)acceleratorAllocDevice(fmap_bytes); + acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); + } + + //Prevent moving or copying + precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete; + precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete; + + std::pair const* getMap() const{ return fmap_device; } + + void checkGrids(GridBase* out, GridBase* in) const{ + conformable(out, _out_grid); + conformable(in, _in_grid); + } + + ~precisionChangeWorkspace(){ + acceleratorFreeDevice(fmap_device); + } +}; + + +//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check) +//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery +template +auto _precisionChangeFastWrap(Lattice &out, const Lattice &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){ + if(out.Grid() == in.Grid()){ + precisionChangeFast(out,in); + return 1; + }else{ + return 0; + } +} +template +int _precisionChangeFastWrap(Lattice &out, const Lattice &in, long dummy){ //note long here is intentional; it means the above is preferred if available + return 0; +} + + +//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace +//which contains the mapping data. +template +void precisionChange(Lattice &out, const Lattice &in, const precisionChangeWorkspace &workspace){ + if(_precisionChangeFastWrap(out,in,0)) return; + + static_assert( std::is_same::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + + out.Checkerboard() = in.Checkerboard(); + constexpr int Nsimd_out = VobjOut::Nsimd(); + + workspace.checkGrids(out.Grid(),in.Grid()); + std::pair const* fmap_device = workspace.getMap(); + + //Do the copy/precision change + autoView( out_v , out, AcceleratorWrite); + autoView( in_v , in, AcceleratorRead); + + accelerator_for(out_oidx, out.Grid()->oSites(), 1,{ + std::pair const* fmap_osite = fmap_device + out_oidx*Nsimd_out; + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + int in_oidx = fmap_osite[out_lane].first; + int in_lane = fmap_osite[out_lane].second; + copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane); + } + }); +} + +//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast +//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device +template +void precisionChange(Lattice &out, const Lattice &in){ + if(_precisionChangeFastWrap(out,in,0)) return; + precisionChangeWorkspace workspace(out.Grid(), in.Grid()); + precisionChange(out, in, workspace); +} + + + + //////////////////////////////////////////////////////////////////////////////// // Communicate between grids //////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index 87572faf..f1407d1f 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -226,7 +226,7 @@ template accelerator_inline void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in) { - static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same typedef typename vobjOut::vector_type ovector_type; typedef typename vobjIn::vector_type ivector_type; @@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest ovector_type * __restrict__ op = (ovector_type *)&vecOut; ivector_type * __restrict__ ip = (ivector_type *)&vecIn; for(int w=0;w +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls = 12; + Coordinate latt4 = GridDefaultLatt(); + + GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD); + GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionD field_d(FGridD), tmp_d(FGridD); + random(RNG5,field_d); tmp_d = field_d; + + LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF); + precisionChange(field_d2, field_d); tmp_d2 = field_d2; + + LatticeFermionF field_f(FGridF), tmp_f(FGridF); + precisionChange(field_f, field_d); tmp_f = field_f; + + int N = 500; + + double time_ds = 0, time_sd = 0; + + std::cout<double original implementation (fields initially device-resident)" << std::endl; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + std::cout<double with pregenerated workspace(fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + std::cout<double with workspace generated on-the-fly (fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + std::cout<double2 (fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + std::cout<double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + Grid_finalize(); +} diff --git a/tests/core/Test_prec_change.cc b/tests/core/Test_prec_change.cc new file mode 100644 index 00000000..06b9ae5c --- /dev/null +++ b/tests/core/Test_prec_change.cc @@ -0,0 +1,124 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/core/Test_prec_change.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls = 12; + Coordinate latt4 = GridDefaultLatt(); + + GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD); + GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG5F(FGridF); RNG5F.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionD field_d(FGridD), tmp_d(FGridD); + random(RNG5,field_d); + RealD norm2_field_d = norm2(field_d); + + LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF); + random(RNG5F,field_d2); + RealD norm2_field_d2 = norm2(field_d2); + + LatticeFermionF field_f(FGridF); + + //Test original implementation + { + std::cout << GridLogMessage << "Testing original implementation" << std::endl; + field_f = Zero(); + precisionChangeOrig(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChangeOrig(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation with pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl; + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + field_f = Zero(); + precisionChange(field_f,field_d,wk_dp_to_sp); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f,wk_sp_to_dp); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation without pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl; + field_f = Zero(); + precisionChange(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test fast implementation + { + std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl; + field_f = Zero(); + precisionChangeFast(field_f,field_d2); + RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d2 = Zero(); + precisionChangeFast(tmp_d2, field_f); + Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + std::cout << "Done" << std::endl; + + Grid_finalize(); +} diff --git a/tests/solver/Test_dwf_relupcg_prec.cc b/tests/solver/Test_dwf_relupcg_prec.cc new file mode 100644 index 00000000..1d8c022a --- /dev/null +++ b/tests/solver/Test_dwf_relupcg_prec.cc @@ -0,0 +1,143 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_dwf_relupcg_prec.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + double relup_delta = 0.2; + for(int i=1;i> relup_delta; + std::cout << GridLogMessage << "Set reliable update Delta to " << relup_delta << std::endl; + } + } + + const int Ls=12; + + { + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); + GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); + GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD result(FGrid); result=Zero(); + LatticeGaugeFieldD Umu(UGrid); + LatticeGaugeFieldF Umu_f(UGrid_f); + + SU::HotConfiguration(RNG4,Umu); + + precisionChange(Umu_f,Umu); + + RealD mass=0.1; + RealD M5=1.8; + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5); + + LatticeFermionD src_o(FrbGrid); + LatticeFermionD result_o(FrbGrid); + LatticeFermionD result_o_2(FrbGrid); + pickCheckerboard(Odd,src_o,src); + result_o.Checkerboard() = Odd; + result_o = Zero(); + result_o_2.Checkerboard() = Odd; + result_o_2 = Zero(); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + ConjugateGradientReliableUpdate mCG(1e-8, 10000, relup_delta, FrbGrid_f, HermOpEO_f, HermOpEO); + double t1,t2,flops; + double MdagMsiteflops = 1452; // Mobius (real coeffs) + // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of) + double CGsiteflops = (8+4+8+4+4)*Nc*Ns ; + std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000); + for(int i=0;i<1;i++){ + result_o_2 = Zero(); + t1=usecond(); + CG(HermOpEO,src_o,result_o_2); + t2=usecond(); + iters = CG.IterationsToComplete; + flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; + flops+= CGsiteflops*FrbGrid->gSites()*iters; + + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< Date: Thu, 23 Feb 2023 09:45:29 -0500 Subject: [PATCH 14/20] Further prec-change improvements Mixed prec CG algorithm has been modified to precompute precision change workspaces As the original Test_dwf_mixedcg_prec has been coopted to do a performance stability and reproducibility test, requiring the single-prec CG to be run 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed vs double CG test --- .../iterative/ConjugateGradientMixedPrec.h | 9 +- tests/solver/Test_dwf_mixedcg_prec.cc | 122 ++++++++++++++++++ 2 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 tests/solver/Test_dwf_mixedcg_prec.cc diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index 31ac55e0..27fee791 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -108,7 +108,10 @@ NAMESPACE_BEGIN(Grid); GridStopWatch PrecChangeTimer; Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count - + + precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid); + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ //Compute double precision rsd and also new RHS vector. Linop_d.HermOp(sol_d, tmp_d); @@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid); while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? PrecChangeTimer.Start(); - precisionChange(src_f, src_d); + precisionChange(src_f, src_d, pc_wk_dp_to_sp); PrecChangeTimer.Stop(); sol_f = Zero(); @@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid); //Convert sol back to double and add to double prec solution PrecChangeTimer.Start(); - precisionChange(tmp_d, sol_f); + precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp); PrecChangeTimer.Stop(); axpy(sol_d, 1.0, tmp_d, sol_d); diff --git a/tests/solver/Test_dwf_mixedcg_prec.cc b/tests/solver/Test_dwf_mixedcg_prec.cc new file mode 100644 index 00000000..dc88018e --- /dev/null +++ b/tests/solver/Test_dwf_mixedcg_prec.cc @@ -0,0 +1,122 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_cg_prec.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +//using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=12; + + std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); + GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); + GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD result(FGrid); result=Zero(); + LatticeGaugeFieldD Umu(UGrid); + LatticeGaugeFieldF Umu_f(UGrid_f); + + SU::HotConfiguration(RNG4,Umu); + + precisionChange(Umu_f,Umu); + + RealD mass=0.1; + RealD M5=1.8; + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5); + + LatticeFermionD src_o(FrbGrid); + LatticeFermionD result_o(FrbGrid); + LatticeFermionD result_o_2(FrbGrid); + pickCheckerboard(Odd,src_o,src); + result_o.Checkerboard() = Odd; + result_o = Zero(); + result_o_2.Checkerboard() = Odd; + result_o_2 = Zero(); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + MixedPrecisionConjugateGradient mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); + double t1,t2,flops; + double MdagMsiteflops = 1452; // Mobius (real coeffs) + // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of) + double CGsiteflops = (8+4+8+4+4)*Nc*Ns ; + std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000); + result_o_2 = Zero(); + t1=usecond(); + CG(HermOpEO,src_o,result_o_2); + t2=usecond(); + iters = CG.IterationsToComplete; + flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; + flops+= CGsiteflops*FrbGrid->gSites()*iters; + + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< Date: Thu, 23 Feb 2023 13:09:45 -0500 Subject: [PATCH 15/20] Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string --- Grid/allocator/MemoryManagerShared.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..e291ef89 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -13,7 +13,7 @@ uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} From 39c0815d9e6a338cf294af6fa6b759401888a5cc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 08:57:29 -0400 Subject: [PATCH 16/20] WriteDiscard --- Grid/lattice/Lattice_base.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 34f13fa6..d6289de2 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -129,7 +129,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWrite); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -152,7 +152,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWrite); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -174,7 +174,7 @@ public: this->checkerboard=cb; auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(AcceleratorWrite); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); @@ -288,8 +288,8 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); @@ -303,8 +303,8 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(AcceleratorWrite); auto him= r.View(AcceleratorRead); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); From c6621806ca73b9384b3bcae6cc8ef47157262be7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 17:27:09 -0400 Subject: [PATCH 17/20] Compiling on laptop and running --- Grid/qcd/action/fermion/WilsonCompressor.h | 4 ++-- Grid/stencil/SimpleCompressor.h | 2 +- tests/Test_dwf_mixedcg_prec.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 6e46e0b5..b2c07d18 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -110,7 +110,7 @@ public: //////////////////////////////////////////////////////////////////////////////////////////// template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { GridBase *Grid = rhs.Grid(); @@ -209,7 +209,7 @@ public: } template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { // std::cout << " face gather exch DWF partial "< static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { assert( (table.size()&0x1)==0); diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index 1e6da515..cbc573d1 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -101,7 +101,7 @@ int main (int argc, char ** argv) std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<