From a7635fd5ba250e95483005c6988b30b27980d928 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Mon, 18 May 2020 17:52:26 -0400
Subject: [PATCH] summit mem

---
 Grid/GridCore.h                     |   1 +
 Grid/allocator/AlignedAllocator.h   |   9 +-
 Grid/allocator/GridMemoryManager.cc | 131 ++++++++++++++++++++++++++++
 Grid/allocator/GridMemoryManager.h  |  42 +++++++++
 Grid/lattice/Lattice_base.h         |  39 ++++-----
 Grid/lattice/Lattice_transfer.h     |   2 +-
 6 files changed, 197 insertions(+), 27 deletions(-)
 create mode 100644 Grid/allocator/GridMemoryManager.cc
 create mode 100644 Grid/allocator/GridMemoryManager.h

diff --git a/Grid/GridCore.h b/Grid/GridCore.h
index a48d2d49..495a81e1 100644
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@@ -47,6 +47,7 @@ Author: paboyle
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 77167299..600b7097 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -178,12 +178,13 @@ public:
     ////////////////////////////////////
     if ( ptr == (_Tp *) NULL ) {
       // printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
-      auto err = cudaMallocManaged((void **)&ptr,bytes);
-      if( err != cudaSuccess ) {
+      // auto err =
+      gridMallocManaged((void **)&ptr,bytes);
+      /*if( err != cudaSuccess ) {
        ptr = (_Tp *) NULL;
        std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<std::endl;
        assert(0);
-      }
+      }*/
diff --git a/Grid/allocator/GridMemoryManager.cc b/Grid/allocator/GridMemoryManager.cc
new file mode 100644
--- /dev/null
+++ b/Grid/allocator/GridMemoryManager.cc
@@ -0,0 +1,131 @@
+
+NAMESPACE_BEGIN(Grid);
+
+#define _GRID_MEM_PAGE_SIZE 4096
+void* _grid_mem_base = 0;
+size_t _grid_mem_pages;
+struct _grid_mem_range {
+  size_t page_start, page_end;
+};
+std::vector<_grid_mem_range> _grid_mem_avail;
+std::map<void*,_grid_mem_range> _grid_mem_alloc;
+
+void gridMemoryInit() {
+  size_t free,total;
+  cudaMemGetInfo(&free,&total);
+
+  char* ev = getenv("GRID_DEVICE_BYTES_FOR_CACHE");
+  if (ev) {
+    long bytes;
+    assert(sscanf(ev,"%ld",&bytes)==1);
+    free -= bytes;
+  }
+
+  _grid_mem_pages = free / _GRID_MEM_PAGE_SIZE;
+  size_t sz = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
+
+  assert(cudaSuccess == cudaMallocManaged(&_grid_mem_base,sz));
+
+  int target;
+  cudaGetDevice(&target);
+  cudaMemAdvise(_grid_mem_base,sz,cudaMemAdviseSetPreferredLocation,target);
+
+  assert(cudaSuccess == cudaMemset(_grid_mem_base,0,sz)); // touch on device
+  std::cout << GridLogMessage << "gridMemoryInit: " << sz << " bytes" << std::endl;
+
+  _grid_mem_avail.push_back( { 0, _grid_mem_pages } );
+}
+
+void gridMallocManaged(void** pp, size_t sz) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  size_t pages = (sz + _GRID_MEM_PAGE_SIZE - 1) / _GRID_MEM_PAGE_SIZE;
+  // find free block
+  size_t m;
+  for (m=0;m<_grid_mem_avail.size();m++) {
+    auto & b = _grid_mem_avail[m];
+    if (b.page_end - b.page_start >= pages)
+      break;
+  }
+  if (m == _grid_mem_avail.size()) {
+    std::cout << GridLogMessage << "Out of memory" << std::endl;
+    assert(0);
+  }
+  *pp = (char*)_grid_mem_base + _GRID_MEM_PAGE_SIZE*_grid_mem_avail[m].page_start;
+  _grid_mem_alloc[*pp] = { _grid_mem_avail[m].page_start, _grid_mem_avail[m].page_start + pages };
+  _grid_mem_avail[m].page_start += pages;
+}
+
+void gridFree(void* p) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  auto & alloc = _grid_mem_alloc[p];
+  if (alloc.page_start == alloc.page_end) {
+    free(p);
+    //cudaFreeHost(p);
+  } else {
+    // can we enlarge existing one?
+    for (size_t m=0;m<_grid_mem_avail.size();m++) {
+      auto & b = _grid_mem_avail[m];
+      if (b.page_start == alloc.page_end) {
+        b.page_start = alloc.page_start;
+        return;
+      }
+      if (b.page_end == alloc.page_start) {
+        b.page_end = alloc.page_end;
+        return;
+      }
+    }
+    // fragment memory
+    _grid_mem_avail.push_back( alloc );
+  }
+  _grid_mem_alloc.erase(p);
+}
+
+void gridAcceleratorPrefetch(void* p, size_t sz) {
+
+  auto & alloc = _grid_mem_alloc[p];
+  if (alloc.page_start == alloc.page_end) // pinned to host
+    return;
+
+  int target;
+  cudaGetDevice(&target);
+  cudaMemPrefetchAsync(p,sz,target);
+}
+
+void gridMemGetInfo(size_t* pfree, size_t* ptotal) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  *ptotal = _grid_mem_pages * _GRID_MEM_PAGE_SIZE;
+  *pfree = 0;
+  for (auto & a : _grid_mem_avail)
+    *pfree += (a.page_end - a.page_start) * _GRID_MEM_PAGE_SIZE;
+}
+
+void gridMoveToHost(void** pp) {
+
+  if (_grid_mem_avail.empty())
+    gridMemoryInit();
+
+  auto & alloc = _grid_mem_alloc[*pp];
+  if (alloc.page_start == alloc.page_end) // already on host
+    return;
+
+  size_t sz = (alloc.page_end - alloc.page_start) * _GRID_MEM_PAGE_SIZE;
+  void*pn;
+  //assert(cudaSuccess == cudaMallocHost(&pn,sz));
+  pn = malloc(sz);
+  memcpy(pn,*pp,sz);
+  gridFree(*pp);
+  *pp = pn;
+  _grid_mem_alloc[pn] = { 0,0 };
+}
+
+NAMESPACE_END(Grid);
diff --git a/Grid/allocator/GridMemoryManager.h b/Grid/allocator/GridMemoryManager.h
new file mode 100644
index 00000000..9e619301
--- /dev/null
+++ b/Grid/allocator/GridMemoryManager.h
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/GridMemoryManager.h
+
+    Copyright (C) 2020
+
+Author: Christoph Lehner
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#ifndef GRID_MEMORY_MANAGER_H
+#define GRID_MEMORY_MANAGER_H
+
+NAMESPACE_BEGIN(Grid);
+
+void gridMemoryInit();
+void gridMallocManaged(void** pp, size_t sz);
+void gridMoveToHost(void** pp);
+void gridAcceleratorPrefetch(void* p, size_t sz);
+void gridMemGetInfo(size_t* pfree, size_t* ptotal);
+void gridFree(void* p);
+
+NAMESPACE_END(Grid);
+
+#endif
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 284190ba..42e9e50a 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -97,33 +97,14 @@ public:
     else grid = _grid;
   };
 
-  accelerator_inline void Advise(int advise) {
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    if (advise & AdviseInfrequentUse) {
-      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
-    }
-    if (advise & AdviseReadMostly) {
-      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
-    }
-#endif
-#endif
-  };
-
   accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
-#ifdef GRID_NVCC
-#ifndef __CUDA_ARCH__ // only on host
-    int target;
-    cudaGetDevice(&target);
-    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
-#endif
-#endif
+    gridAcceleratorPrefetch(_odata,_odata_size*sizeof(vobj));
   };
 
   accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
-    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
+    //cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
 #endif
 #endif
   };
@@ -246,13 +227,27 @@ private:
       dealloc();
 
       this->_odata_size = size;
-      if ( size ) 
+      if ( size )
        this->_odata      = alloc.allocate(this->_odata_size);
       else
        this->_odata      = nullptr;
     }
   }
 public:
+
+  void Advise(int advise) {
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    if (advise & AdviseInfrequentUse) {
+      gridMoveToHost((void**)&this->_odata);
+    }
+    if (advise & AdviseReadMostly) {
+      //cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
+    }
+#endif
+#endif
+  };
+
   /////////////////////////////////////////////////////////////////////////////////
   // Return a view object that may be dereferenced in site loops.
   // The view is trivially copy constructible and may be copied to an accelerator device
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index c23ddcdc..e12ef787 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -96,7 +96,7 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & in) {
   out = in;
 }
 
-#ifdef __CUDA_ARCH__
+#ifdef GRID_NVCC
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
   ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
 }
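
Note (not part of the patch): a minimal usage sketch of the entry points introduced above, assuming a CUDA build of Grid with this patch applied. The byte counts, the GRID_DEVICE_BYTES_FOR_CACHE value, and the Grid_init/Grid_finalize scaffolding are illustrative only; the gridMallocManaged/gridAcceleratorPrefetch/gridMemGetInfo/gridMoveToHost/gridFree calls are the ones declared in GridMemoryManager.h.

// export GRID_DEVICE_BYTES_FOR_CACHE=1073741824   (illustrative: keep 1 GiB out of the pool)
#include <Grid/Grid.h>

using namespace Grid;

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  // The first allocation triggers gridMemoryInit(): the remaining free device
  // memory (minus GRID_DEVICE_BYTES_FOR_CACHE) is taken as one cudaMallocManaged
  // arena, pinned to the device by cudaMemAdvise and carved into 4096-byte pages.
  void* p = nullptr;
  gridMallocManaged(&p, 1024 * 1024);       // first-fit sub-allocation of 256 pages

  gridAcceleratorPrefetch(p, 1024 * 1024);  // cudaMemPrefetchAsync to the current device

  size_t pool_free, pool_total;
  gridMemGetInfo(&pool_free, &pool_total);  // pool accounting, not cudaMemGetInfo

  gridMoveToHost(&p);                       // copy into host malloc'd memory, release the arena pages
  gridFree(p);                              // host-side blocks are free()'d; pooled blocks return to the free list

  Grid_finalize();
  return 0;
}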