mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'feature/dirichlet' into feature/dirichlet-gparity
This commit is contained in:
commit
65abe4d0d3
@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
||||
return sumD_cpu(arg,osites);
|
||||
#endif
|
||||
}
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
return sumD_gpu_large(arg,osites);
|
||||
#else
|
||||
return sumD_cpu(arg,osites);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||
@ -159,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||
return ssum;
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
autoView( arg_v, arg, AcceleratorRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_gpu_large(&arg_v[0],osites);
|
||||
#else
|
||||
autoView(arg_v, arg, CpuRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||
#endif
|
||||
arg.Grid()->GlobalSum(ssum);
|
||||
return ssum;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Deterministic Reduction operations
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
|
||||
}
|
||||
|
||||
template <class Iterator>
|
||||
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
||||
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
||||
|
||||
int device;
|
||||
#ifdef GRID_CUDA
|
||||
@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
|
||||
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
|
||||
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
|
||||
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
|
||||
|
||||
/*
|
||||
std::cout << GridLogDebug << "GPU has:" << std::endl;
|
||||
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
|
||||
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
|
||||
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
|
||||
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
|
||||
|
||||
*/
|
||||
if (warpSize != WARP_SIZE) {
|
||||
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
|
||||
threads = warpSize;
|
||||
if ( threads*sizeofsobj > sharedMemPerBlock ) {
|
||||
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
return 0;
|
||||
}
|
||||
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
|
||||
// keep all the streaming multiprocessors busy
|
||||
blocks = nextPow2(multiProcessorCount);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
template <class sobj, class Iterator>
|
||||
@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
||||
// Possibly promote to double and sum
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_objectD sobj;
|
||||
typedef decltype(lat) Iterator;
|
||||
@ -207,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||
Integer size = osites*nsimd;
|
||||
|
||||
Integer numThreads, numBlocks;
|
||||
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
|
||||
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
|
||||
assert(ok);
|
||||
|
||||
Integer smemSize = numThreads * sizeof(sobj);
|
||||
|
||||
Vector<sobj> buffer(numBlocks);
|
||||
@ -218,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||
auto result = buffer_v[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
typedef typename vobj::scalar_typeD scalarD;
|
||||
typedef typename vobj::scalar_objectD sobj;
|
||||
sobj ret;
|
||||
scalarD *ret_p = (scalarD *)&ret;
|
||||
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
|
||||
Vector<vector> buffer(osites);
|
||||
vector *dat = (vector *)lat;
|
||||
vector *buf = &buffer[0];
|
||||
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
||||
for(int w=0;w<words;w++) {
|
||||
|
||||
accelerator_for(ss,osites,1,{
|
||||
buf[ss] = dat[ss*words+w];
|
||||
});
|
||||
|
||||
ret_p[w] = sumD_gpu_small(tbuf,osites);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
typedef typename vobj::scalar_typeD scalarD;
|
||||
typedef typename vobj::scalar_objectD sobj;
|
||||
sobj ret;
|
||||
|
||||
Integer nsimd= vobj::Nsimd();
|
||||
Integer size = osites*nsimd;
|
||||
Integer numThreads, numBlocks;
|
||||
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
|
||||
|
||||
if ( ok ) {
|
||||
ret = sumD_gpu_small(lat,osites);
|
||||
} else {
|
||||
ret = sumD_gpu_large(lat,osites);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Return as same precision as input performing reduction in double precision though
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -230,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
sobj result;
|
||||
result = sumD_gpu_large(lat,osites);
|
||||
return result;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -726,8 +726,8 @@ public:
|
||||
static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
|
||||
conformable(f, m);
|
||||
auto grid = f.Grid();
|
||||
const int Nsite = grid->oSites();
|
||||
const int Nsimd = grid->Nsimd();
|
||||
const uint32_t Nsite = grid->oSites();
|
||||
const uint32_t Nsimd = grid->Nsimd();
|
||||
autoView(f_v, f, AcceleratorWrite);
|
||||
autoView(m_v, m, AcceleratorRead);
|
||||
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
|
||||
|
@ -483,9 +483,10 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
|
||||
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
|
||||
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
||||
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopySynchronise(void) {};
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
|
@ -72,3 +72,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define thread_region DO_PRAGMA(omp parallel)
|
||||
#define thread_critical DO_PRAGMA(omp critical)
|
||||
|
||||
#ifdef GRID_OMP
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
{
|
||||
uint64_t *ufrom = (uint64_t *)from;
|
||||
uint64_t *uto = (uint64_t *)to;
|
||||
assert(bytes%8==0);
|
||||
uint64_t words=bytes/8;
|
||||
thread_for(w,words,{
|
||||
uto[w] = ufrom[w];
|
||||
});
|
||||
}
|
||||
#else
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
{
|
||||
bcopy(from,to,bytes);
|
||||
}
|
||||
#endif
|
||||
|
@ -534,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
void Grid_finalize(void)
|
||||
{
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_Finalize();
|
||||
Grid_unquiesce_nodes();
|
||||
#endif
|
||||
|
1
systems/mac-arm/config-command-mpi
Normal file
1
systems/mac-arm/config-command-mpi
Normal file
@ -0,0 +1 @@
|
||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
|
Loading…
Reference in New Issue
Block a user