mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

MallocManaged in GPU

paboyle 2018-03-18 14:44:46 +00:00
parent 38eadee2c9
commit b1c02ec310
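
In short: when Grid is built for NVIDIA GPUs (GRID_NVCC), the aligned allocator now obtains its memory with cudaMallocManaged, so the data sits in CUDA unified memory and the same pointer is usable from host and device code; the matching deallocate path releases it with cudaFree. A minimal standalone sketch of that allocation lifecycle, for orientation only (the helper names below are illustrative, not Grid API):

// Illustrative sketch, not Grid code: unified-memory allocation as used in this commit.
#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

template<typename T>
T *managedAlloc(std::size_t n) {
  T *ptr = nullptr;
  // cudaMallocManaged returns memory reachable from both CPU and GPU.
  if (cudaMallocManaged((void **)&ptr, n * sizeof(T)) != cudaSuccess) {
    ptr = nullptr;
    assert(0);                      // fail hard, as the allocator in this diff does
  }
  return ptr;
}

template<typename T>
void managedFree(T *ptr) {
  if (ptr) cudaFree((void *)ptr);   // managed allocations are released with cudaFree
}

int main() {
  double *v = managedAlloc<double>(1024);
  for (int i = 0; i < 1024; i++) v[i] = i;   // touched on the host...
  // ...and the same pointer could be handed straight to a CUDA kernel.
  managedFree(v);
  return 0;
}

Compiled with nvcc, this is the allocate/free pattern the diff below wires into alignedAllocator.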


@@ -40,11 +40,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#undef POINTER_CACHE
#ifdef POINTER_CACHE
class PointerCache {
private:
@@ -163,22 +164,28 @@ public:
#else
pointer ptr = nullptr;
#endif
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
#ifdef GRID_NVCC
if ( ptr == (_Tp *) NULL ) {
if( cudaMallocManaged((void **)&ptr,bytes) != cudaSuccess ) {
ptr = (_Tp *) NULL;
assert(0);
}
}
#else
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
#endif
// std::cout << "alignedAllocator allocate " << std::hex << ptr <<std::dec <<std::endl;
assert( ptr != (_Tp *)NULL);
/////////////////////////////////////////
// First touch optimise in threaded loop
/////////////////////////////////////////
uint8_t *cp = (uint8_t *)ptr;
thread_loop( (size_type n=0;n<bytes;n+=4096) , {
cp[n]=0;
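
Aside on the first-touch loop above: allocate() writes one byte per 4 KiB page from inside thread_loop (Grid's parallel-for macro) so that pages are faulted in by the threads that will later work on them, which places each page on the touching thread's NUMA node. A plain-OpenMP sketch of the same pattern, independent of Grid and purely illustrative:

// Standalone first-touch sketch; the 4096-byte page stride mirrors the allocator above.
#include <cstddef>
#include <cstdint>

void first_touch(void *buf, std::size_t bytes) {
  uint8_t *cp = static_cast<uint8_t *>(buf);
  // One write per 4 KiB page; a static schedule keeps the thread-to-page
  // mapping consistent with later statically scheduled compute loops.
#pragma omp parallel for schedule(static)
  for (std::size_t n = 0; n < bytes; n += 4096)
    cp[n] = 0;
}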
@@ -196,14 +203,17 @@ public:
#else
pointer __freeme = __p;
#endif
// if ( __freeme ) {
// std::cout << "alignedAllocator free:" << std::hex << __p <<std::dec <<std::endl;
// }
#ifdef HAVE_MM_MALLOC_H
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
@@ -212,113 +222,6 @@ public:
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
// Deprecate shmem and comm allocator
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
#if 0
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
thread_loop( (size_type n=0;n<bytes;n+=4096),{
cp[n]=0;
});
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
#endif
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////