mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'develop' into feature/distil

* develop:
  Change to reporting
  NVCC timer support
  Fix no-compile under NVCC
  --enable-summit flag
  IBM Summit optimisation. Synchronisation in node is still between the 2 halves of the AC922, so could be a little faster
  Sliced propagator contraction was not producing any results because buf.size()=0
  Several typos in Hadrons
Michael Marshall 2019-11-30 16:47:03 +00:00
commit 7983ff2fdd
9 changed files with 57 additions and 32 deletions

View File

@@ -162,11 +162,8 @@ static inline int divides(int a,int b)
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
   ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
+  // Powers of 2,3,5 only in prime decomposition for now
   ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
   int ndimension = WorldDims.size();
   ShmDims=Coordinate(ndimension,1);
@@ -177,7 +174,8 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
   while(AutoShmSize != WorldShmSize) {
     for(int p=0;p<primes.size();p++) {
       int prime=primes[p];
-      if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+        && divides(prime,WorldShmSize/AutoShmSize) ) {
         AutoShmSize*=prime;
         ShmDims[dim]*=prime;
         break;
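
The extra divides(prime,WorldShmSize/AutoShmSize) test keeps the running product AutoShmSize a divisor of the node size, so the loop can terminate for node sizes that are not powers of two (e.g. the 6 GPUs of a Summit node). A minimal standalone sketch of the splitting logic follows; the names SplitShmDims, world_dims and shm_size are illustrative, not Grid's.

// Standalone sketch (not Grid code): spread shm_size ranks over the world
// dimensions using only the primes 2, 3 and 5, mirroring the loop above.
// Assumes a valid split exists, otherwise this simplified loop never ends.
#include <iostream>
#include <vector>

std::vector<int> SplitShmDims(const std::vector<int> &world_dims, int shm_size)
{
  const int primes[] = {2, 3, 5};
  std::vector<int> shm_dims(world_dims.size(), 1);
  int auto_shm = 1;
  std::size_t dim = 0;
  while (auto_shm != shm_size) {
    for (int p : primes) {
      // Use a prime only if it divides both the unassigned part of this world
      // dimension and the remaining node size (the added condition above).
      if ((world_dims[dim] / shm_dims[dim]) % p == 0 &&
          (shm_size / auto_shm) % p == 0) {
        auto_shm      *= p;
        shm_dims[dim] *= p;
        break;
      }
    }
    dim = (dim + 1) % world_dims.size();   // cycle over the dimensions
  }
  return shm_dims;
}

int main()
{
  for (int d : SplitShmDims({4, 4, 6, 8}, 12)) std::cout << d << " ";  // prints: 2 2 3 1
  std::cout << std::endl;
  return 0;
}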
@@ -308,7 +306,6 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
   ////////////////////////////////////////////////////////////////
   // Identify subblock of ranks on node spreading across dims
   // in a maximally symmetrical way
@@ -435,10 +432,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // e.g. DGX1, supermicro board,
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
 #ifdef GRID_IBM_SUMMIT
-  std::cout << header << "flag IBM_SUMMIT disabled CUDA set device: ensure jsrun is used correctly" <<std::endl;
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
 #else
-  cudaSetDevice(WorldShmRank);
+  std::cout << "setting device to WorldShmRank"<<std::endl;
+  cudaSetDevice(WorldShmRank);
 #endif
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
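
The reasoning behind the Summit branch: under jsrun each rank is typically handed only its own GPU(s), so device indices no longer line up with the on-node rank and a blanket cudaSetDevice(WorldShmRank) can pick the wrong or a non-existent device. A hedged sketch of the two behaviours; SelectDevice and summit_like are made-up names, while cudaSetDevice is the real CUDA runtime call used above.

// Illustrative only: device binding with and without jsrun doing the GPU assignment.
#include <cstdio>
#include <cuda_runtime.h>

void SelectDevice(int world_shm_rank, bool summit_like)
{
  if (summit_like) {
    // jsrun has already restricted the visible devices for this rank, so the
    // default device is the assigned one and the code above leaves it alone.
    std::printf("leaving device selection to the resource manager\n");
  } else {
    // Assumes one visible GPU per on-node rank, as in the #else branch above.
    cudaSetDevice(world_shm_rank);
    std::printf("rank %d bound to device %d\n", world_shm_rank, world_shm_rank);
  }
}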
@@ -735,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
   MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
+#ifdef GRID_IBM_SUMMIT
+  // Hide the shared memory path between sockets
+  // if even number of nodes
+  if ( (ShmSize & 0x1)==0 ) {
+    int SocketSize = ShmSize/2;
+    int mySocket = ShmRank/SocketSize;
+    for(int r=0;r<size;r++){
+      int hisRank=ShmRanks[r];
+      if ( hisRank!= MPI_UNDEFINED ) {
+        int hisSocket=hisRank/SocketSize;
+        if ( hisSocket != mySocket ) {
+          ShmRanks[r] = MPI_UNDEFINED;
+        }
+      }
+    }
+  }
+#endif
   SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
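
Worked example of the masking above, reduced to the on-node ranks only (MaskOtherSocket and the placeholder constant are illustrative, not Grid's; in the real code the loop runs over the full communicator and MPI_UNDEFINED comes from MPI): with ShmSize=4 and viewed from rank 1, ranks 2 and 3 sit on the other AC922 socket, so they are dropped from the shared-memory path and traffic to them goes through MPI instead.

// Standalone sketch of the socket masking added above.
#include <iostream>
#include <vector>

const int kUndefined = -1;   // stand-in for MPI_UNDEFINED

std::vector<int> MaskOtherSocket(int shm_size, int my_rank)
{
  std::vector<int> shm_ranks(shm_size);
  for (int r = 0; r < shm_size; r++) shm_ranks[r] = r;
  if ((shm_size & 0x1) == 0) {              // only when the node splits evenly in two
    int socket_size = shm_size / 2;
    int my_socket   = my_rank / socket_size;
    for (int r = 0; r < shm_size; r++) {
      if (shm_ranks[r] / socket_size != my_socket) shm_ranks[r] = kUndefined;
    }
  }
  return shm_ranks;
}

int main()
{
  for (int v : MaskOtherSocket(4, 1)) std::cout << v << " ";   // prints: 0 1 -1 -1
  std::cout << std::endl;
  return 0;
}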

View File

@@ -44,8 +44,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
+#ifdef GRID_NVCC
+accelerator_inline uint64_t __rdtsc(void) { return 0; }
+accelerator_inline uint64_t __rdpmc(int ) { return 0; }
+#else
 #include <x86intrin.h>
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
@@ -89,13 +94,8 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
-#ifdef GRID_NVCC
-accelerator_inline uint64_t __rdtsc(void) { return 0; }
-#endif
 inline uint64_t cyclecount(void){
   return __rdtsc();
-  //  unsigned int dummy;
-  //  return __rdtscp(&dummy);
 }
 #else

View File

@@ -42,8 +42,8 @@ public:
   typedef typename FImpl::SitePropagator pobj;
   typedef typename ComplexField::vector_object vobj;
-  static constexpr int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-  static constexpr Complex epsilon_sgn[6]= {1,1,1,-1,-1,-1};
+  static const int epsilon[6][3] ;
+  static const Complex epsilon_sgn[6];
 private:
template <class mobj, class robj> template <class mobj, class robj>
@@ -84,9 +84,14 @@
 };
 template <class FImpl>
-constexpr int BaryonUtils<FImpl>::epsilon[6][3];
+const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
 template <class FImpl>
-constexpr Complex BaryonUtils<FImpl>::epsilon_sgn[6];
+const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
+                                                    Complex(1),
+                                                    Complex(1),
+                                                    Complex(-1),
+                                                    Complex(-1),
+                                                    Complex(-1)};
 template <class FImpl>
 template <class mobj, class robj>
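
The change swaps the in-class static constexpr initialisers for plain static const members defined (and initialised) out of class, presumably because Grid's Complex is not a literal type under nvcc, so the in-class constexpr initialiser does not compile there. A condensed sketch of the pattern, with std::complex and the class name Eps standing in for the Grid types:

// Sketch of the declaration/definition split used above (Eps is illustrative).
#include <complex>

template <class T>
class Eps {
public:
  static const int                  epsilon[6][3];
  static const std::complex<double> epsilon_sgn[6];
};

// The out-of-class definitions carry the values; nothing requires the
// initialisers to be constant expressions, and one copy is emitted per
// template instantiation that uses them.
template <class T>
const int Eps<T>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};

template <class T>
const std::complex<double> Eps<T>::epsilon_sgn[6] =
  {{1,0},{1,0},{1,0},{-1,0},{-1,0},{-1,0}};

int main() { return Eps<int>::epsilon[3][1]; }   // epsilon[3][1] == 2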

View File

@@ -1233,7 +1233,7 @@
   };
   void Report(void) {
-#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
+#define AVERAGE(A)
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
     RealD NP = _grid->_Nprocessors;
     RealD NN = _grid->NodeCount();
@@ -1281,11 +1281,13 @@
       std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
       std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
     }
+    /*
     PRINTIT(mpi3synctime);
     PRINTIT(mpi3synctime_g);
     PRINTIT(shmmergetime);
     PRINTIT(splicetime);
     PRINTIT(nosplicetime);
+    */
   }
 #undef PRINTIT
 #undef AVERAGE
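
With the AVERAGE body removed, each PRINTIT line now reports the local per-rank counter rather than a value globally summed and divided by the number of ranks; this appears to be the "Change to reporting" from the commit message. A tiny self-contained sketch of what a PRINTIT line expands to, with plain doubles standing in for Grid's RealD and log stream:

// Sketch only: the report macros with the AVERAGE body emptied, as in the diff.
#include <iostream>

#define AVERAGE(A)
#define PRINTIT(A) AVERAGE(A); std::cout << " Stencil " << #A << " " << A / calls << std::endl;

int main()
{
  double calls = 10, commtime = 250;   // hypothetical counters
  PRINTIT(commtime);                   // prints " Stencil commtime 25" (local value only)
  return 0;
}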

View File

@@ -272,7 +272,7 @@ struct Correlator: Serializable
 {
     GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
                                     Metadata, info,
-                                    std::vector<Complex>, corr);
+                                    std::vector<Scalar>, corr);
 };
 END_HADRONS_NAMESPACE

View File

@@ -199,7 +199,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
             Gamma gSnk(gammaList[i].first);
             Gamma gSrc(gammaList[i].second);
-            for (unsigned int t = 0; t < buf.size(); ++t)
+            for (unsigned int t = 0; t < nt; ++t)
             {
                 result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
             }
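
This is the "Sliced propagator contraction was not producing any results because buf.size()=0" fix from the commit message: when the inputs arrive as already-sliced propagators the locally gathered buffer is empty, so a loop bounded by buf.size() never fills the correlator; bounding it by the time extent nt does. A trimmed illustration with plain STL containers standing in for the Hadrons types:

// Illustration only: why an empty gather buffer silently produced no output.
#include <iostream>
#include <vector>

int main()
{
  const unsigned int nt = 8;            // global time extent
  std::vector<double> buf;              // empty when propagators are pre-sliced
  std::vector<double> corr(nt, 0.0);

  for (unsigned int t = 0; t < buf.size(); ++t) corr[t] = 1.0;  // old bound: body never runs
  for (unsigned int t = 0; t < nt; ++t)         corr[t] = 1.0;  // new bound: every slice filled

  std::cout << "filled " << corr.size() << " timeslices" << std::endl;
  return 0;
}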

View File

@@ -144,7 +144,7 @@ void TWeakEye3pt<FImpl>::execute(void)
 {
     LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
     LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
     LOG(Message) << "tOut: " << par().tOut << std::endl;
     LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
     LOG(Message) << "qbr : " << par().qBarRight << std::endl;

View File

@@ -144,7 +144,7 @@ void TWeakNonEye3pt<FImpl>::execute(void)
 {
     LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
     LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
     LOG(Message) << "ql  : " << par().qLeft << std::endl;
     LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
     LOG(Message) << "qr  : " << par().qRight << std::endl;

View File

@@ -136,15 +136,15 @@ case ${ac_SFW_FP16} in
 esac
 
 ############### SUMMIT JSRUN
-AC_ARG_ENABLE([jsrun],
-    [AC_HELP_STRING([--enable-jsrun=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
-    [ac_JSRUN=${enable_jsrun}], [ac_JSRUN=no])
-case ${ac_JSRUN} in
+AC_ARG_ENABLE([summit],
+    [AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
+    [ac_JSRUN=${enable_summit}], [ac_SUMMIT=no])
+case ${ac_SUMMIT} in
+    no);;
     yes)
       AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
-    no);;
     *)
-      AC_MSG_ERROR(["JSRUN option not supported ${ac_JSRUN}"]);;
+      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
 esac
 
 ############### Intel libraries
@@ -255,7 +255,7 @@ AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
 AC_ARG_ENABLE([gen-simd-width],
             [AS_HELP_STRING([--enable-gen-simd-width=size],
-            [size (in bytes) of the generic SIMD vectors (default: 32)])],
+            [size (in bytes) of the generic SIMD vectors (default: 64)])],
             [ac_gen_simd_width=$enable_gen_simd_width],
             [ac_gen_simd_width=64])