mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Merge branch 'develop' into feature/distil
* develop: Change to reporting NVCC timer support; Fix no-compile under NVCC; --enable-summit flag; IBM Summit optimisation. Synchronise in node is still between 2 halves of AC922, so could be a little faster; Sliced propagator contraction was not producing any results because buf.size()=0; several typos in hadrons
This commit is contained in:
commit
7983ff2fdd
@ -162,11 +162,8 @@ static inline int divides(int a,int b)
|
|||||||
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
||||||
{
|
{
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Assert power of two shm_size.
|
// Powers of 2,3,5 only in prime decomposition for now
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
|
|
||||||
assert(log2size != -1);
|
|
||||||
|
|
||||||
int ndimension = WorldDims.size();
|
int ndimension = WorldDims.size();
|
||||||
ShmDims=Coordinate(ndimension,1);
|
ShmDims=Coordinate(ndimension,1);
|
||||||
|
|
||||||
@ -177,7 +174,8 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
|
|||||||
while(AutoShmSize != WorldShmSize) {
|
while(AutoShmSize != WorldShmSize) {
|
||||||
for(int p=0;p<primes.size();p++) {
|
for(int p=0;p<primes.size();p++) {
|
||||||
int prime=primes[p];
|
int prime=primes[p];
|
||||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
|
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||||
|
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||||
AutoShmSize*=prime;
|
AutoShmSize*=prime;
|
||||||
ShmDims[dim]*=prime;
|
ShmDims[dim]*=prime;
|
||||||
break;
|
break;
|
||||||
@ -308,7 +306,6 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
|
|||||||
}
|
}
|
||||||
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
|
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
|
||||||
{
|
{
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Identify subblock of ranks on node spreading across dims
|
// Identify subblock of ranks on node spreading across dims
|
||||||
// in a maximally symmetrical way
|
// in a maximally symmetrical way
|
||||||
@ -435,10 +432,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// e.g. DGX1, supermicro board,
|
// e.g. DGX1, supermicro board,
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
||||||
|
|
||||||
#ifdef GRID_IBM_SUMMIT
|
#ifdef GRID_IBM_SUMMIT
|
||||||
std::cout << header << "flag IBM_SUMMIT disabled CUDA set device: ensure jsrun is used correctly" <<std::endl;
|
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||||
|
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
|
||||||
#else
|
#else
|
||||||
cudaSetDevice(WorldShmRank);
|
std::cout << "setting device to WorldShmRank"<<std::endl;
|
||||||
|
cudaSetDevice(WorldShmRank);
|
||||||
#endif
|
#endif
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
@ -466,7 +466,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// If it is me, pass around the IPC access key
|
// If it is me, pass around the IPC access key
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
cudaIpcMemHandle_t handle;
|
cudaIpcMemHandle_t handle;
|
||||||
|
|
||||||
if ( r==WorldShmRank ) {
|
if ( r==WorldShmRank ) {
|
||||||
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
@ -735,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
||||||
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
||||||
|
|
||||||
|
#ifdef GRID_IBM_SUMMIT
|
||||||
|
// Hide the shared memory path between sockets
|
||||||
|
// only if ShmSize is even, so it splits into two equal halves (one per socket)
|
||||||
|
if ( (ShmSize & 0x1)==0 ) {
|
||||||
|
int SocketSize = ShmSize/2;
|
||||||
|
int mySocket = ShmRank/SocketSize;
|
||||||
|
for(int r=0;r<size;r++){
|
||||||
|
int hisRank=ShmRanks[r];
|
||||||
|
if ( hisRank!= MPI_UNDEFINED ) {
|
||||||
|
int hisSocket=hisRank/SocketSize;
|
||||||
|
if ( hisSocket != mySocket ) {
|
||||||
|
ShmRanks[r] = MPI_UNDEFINED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
SharedMemoryTest();
|
SharedMemoryTest();
|
||||||
}
|
}
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
|
@ -44,8 +44,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef __x86_64__
|
#ifdef __x86_64__
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
||||||
|
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
||||||
|
#else
|
||||||
#include <x86intrin.h>
|
#include <x86intrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -89,13 +94,8 @@ inline uint64_t cyclecount(void){
|
|||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
#elif defined __x86_64__
|
#elif defined __x86_64__
|
||||||
#ifdef GRID_NVCC
|
|
||||||
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
|
||||||
#endif
|
|
||||||
inline uint64_t cyclecount(void){
|
inline uint64_t cyclecount(void){
|
||||||
return __rdtsc();
|
return __rdtsc();
|
||||||
// unsigned int dummy;
|
|
||||||
// return __rdtscp(&dummy);
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -42,8 +42,8 @@ public:
|
|||||||
typedef typename FImpl::SitePropagator pobj;
|
typedef typename FImpl::SitePropagator pobj;
|
||||||
typedef typename ComplexField::vector_object vobj;
|
typedef typename ComplexField::vector_object vobj;
|
||||||
|
|
||||||
static constexpr int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
|
static const int epsilon[6][3] ;
|
||||||
static constexpr Complex epsilon_sgn[6]= {1,1,1,-1,-1,-1};
|
static const Complex epsilon_sgn[6];
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <class mobj, class robj>
|
template <class mobj, class robj>
|
||||||
@ -83,10 +83,15 @@ public:
|
|||||||
robj &result);
|
robj &result);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class FImpl>
|
template <class FImpl>
|
||||||
constexpr int BaryonUtils<FImpl>::epsilon[6][3];
|
const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
|
||||||
template <class FImpl>
|
template <class FImpl>
|
||||||
constexpr Complex BaryonUtils<FImpl>::epsilon_sgn[6];
|
const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
|
||||||
|
Complex(1),
|
||||||
|
Complex(1),
|
||||||
|
Complex(-1),
|
||||||
|
Complex(-1),
|
||||||
|
Complex(-1)};
|
||||||
|
|
||||||
template <class FImpl>
|
template <class FImpl>
|
||||||
template <class mobj, class robj>
|
template <class mobj, class robj>
|
||||||
|
@ -1233,7 +1233,7 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
void Report(void) {
|
void Report(void) {
|
||||||
#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
|
#define AVERAGE(A)
|
||||||
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
||||||
RealD NP = _grid->_Nprocessors;
|
RealD NP = _grid->_Nprocessors;
|
||||||
RealD NN = _grid->NodeCount();
|
RealD NN = _grid->NodeCount();
|
||||||
@ -1281,11 +1281,13 @@ public:
|
|||||||
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
|
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
|
||||||
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
PRINTIT(mpi3synctime);
|
PRINTIT(mpi3synctime);
|
||||||
PRINTIT(mpi3synctime_g);
|
PRINTIT(mpi3synctime_g);
|
||||||
PRINTIT(shmmergetime);
|
PRINTIT(shmmergetime);
|
||||||
PRINTIT(splicetime);
|
PRINTIT(splicetime);
|
||||||
PRINTIT(nosplicetime);
|
PRINTIT(nosplicetime);
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
#undef PRINTIT
|
#undef PRINTIT
|
||||||
#undef AVERAGE
|
#undef AVERAGE
|
||||||
|
@ -272,7 +272,7 @@ struct Correlator: Serializable
|
|||||||
{
|
{
|
||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
|
GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
|
||||||
Metadata, info,
|
Metadata, info,
|
||||||
std::vector<Complex>, corr);
|
std::vector<Scalar>, corr);
|
||||||
};
|
};
|
||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
@ -199,7 +199,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
|
|||||||
Gamma gSnk(gammaList[i].first);
|
Gamma gSnk(gammaList[i].first);
|
||||||
Gamma gSrc(gammaList[i].second);
|
Gamma gSrc(gammaList[i].second);
|
||||||
|
|
||||||
for (unsigned int t = 0; t < buf.size(); ++t)
|
for (unsigned int t = 0; t < nt; ++t)
|
||||||
{
|
{
|
||||||
result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
|
result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
|
||||||
}
|
}
|
||||||
|
@ -144,7 +144,7 @@ void TWeakEye3pt<FImpl>::execute(void)
|
|||||||
{
|
{
|
||||||
LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
|
LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
|
||||||
LOG(Message) << "gIn : " << par().gammaIn << std::endl;
|
LOG(Message) << "gIn : " << par().gammaIn << std::endl;
|
||||||
LOG(Message) << "gOut: " << par().gammaIn << std::endl;
|
LOG(Message) << "gOut: " << par().gammaOut << std::endl;
|
||||||
LOG(Message) << "tOut: " << par().tOut << std::endl;
|
LOG(Message) << "tOut: " << par().tOut << std::endl;
|
||||||
LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
|
LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
|
||||||
LOG(Message) << "qbr : " << par().qBarRight << std::endl;
|
LOG(Message) << "qbr : " << par().qBarRight << std::endl;
|
||||||
|
@ -144,7 +144,7 @@ void TWeakNonEye3pt<FImpl>::execute(void)
|
|||||||
{
|
{
|
||||||
LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
|
LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
|
||||||
LOG(Message) << "gIn : " << par().gammaIn << std::endl;
|
LOG(Message) << "gIn : " << par().gammaIn << std::endl;
|
||||||
LOG(Message) << "gOut: " << par().gammaIn << std::endl;
|
LOG(Message) << "gOut: " << par().gammaOut << std::endl;
|
||||||
LOG(Message) << "ql : " << par().qLeft << std::endl;
|
LOG(Message) << "ql : " << par().qLeft << std::endl;
|
||||||
LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
|
LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
|
||||||
LOG(Message) << "qr : " << par().qRight << std::endl;
|
LOG(Message) << "qr : " << par().qRight << std::endl;
|
||||||
|
14
configure.ac
14
configure.ac
@ -136,15 +136,15 @@ case ${ac_SFW_FP16} in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
############### SUMMIT JSRUN
|
############### SUMMIT JSRUN
|
||||||
AC_ARG_ENABLE([jsrun],
|
AC_ARG_ENABLE([summit],
|
||||||
[AC_HELP_STRING([--enable-jsrun=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
|
[AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
|
||||||
[ac_JSRUN=${enable_jsrun}], [ac_JSRUN=no])
|
[ac_JSRUN=${enable_summit}], [ac_SUMMIT=no])
|
||||||
case ${ac_JSRUN} in
|
case ${ac_SUMMIT} in
|
||||||
|
no);;
|
||||||
yes)
|
yes)
|
||||||
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
|
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
|
||||||
no);;
|
|
||||||
*)
|
*)
|
||||||
AC_MSG_ERROR(["JSRUN option not supported ${ac_JSRUN}"]);;
|
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
############### Intel libraries
|
############### Intel libraries
|
||||||
@ -255,7 +255,7 @@ AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
|
|||||||
|
|
||||||
AC_ARG_ENABLE([gen-simd-width],
|
AC_ARG_ENABLE([gen-simd-width],
|
||||||
[AS_HELP_STRING([--enable-gen-simd-width=size],
|
[AS_HELP_STRING([--enable-gen-simd-width=size],
|
||||||
[size (in bytes) of the generic SIMD vectors (default: 32)])],
|
[size (in bytes) of the generic SIMD vectors (default: 64)])],
|
||||||
[ac_gen_simd_width=$enable_gen_simd_width],
|
[ac_gen_simd_width=$enable_gen_simd_width],
|
||||||
[ac_gen_simd_width=64])
|
[ac_gen_simd_width=64])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user