1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-24 09:44:47 +01:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

This commit is contained in:
Peter Boyle
2019-11-21 20:09:31 +00:00
9 changed files with 102 additions and 76 deletions

View File

@@ -62,9 +62,12 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
basis_v[k] = basis[k].View(); basis_v[k] = basis[k].View();
} }
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region thread_region
{ {
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private vobj* B = Bt.data() + Nm * thread_num();
thread_for_in_region(ss, grid->oSites(),{ thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.; for(int j=j0; j<j1; ++j) B[j]=0.;

View File

@@ -41,9 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/shm.h> #include <sys/shm.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <zlib.h> #include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@@ -99,6 +96,7 @@ public:
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// Provide shared memory facilities off comm world // Provide shared memory facilities off comm world
/////////////////////////////////////////////////// ///////////////////////////////////////////////////

View File

@@ -155,6 +155,35 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm); if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm); else OptimalCommunicatorSharedMemory(processors,optimal_comm);
} }
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
for(int p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
break;
}
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -221,17 +250,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
std::vector<int> processor_coor(ndimension); Coordinate processor_coor(ndimension);
std::vector<int> WorldDims = processors.toVector(); Coordinate WorldDims = processors;
std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension); Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension); Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension); Coordinate HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){ GetShmDims(WorldDims,ShmDims);
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
@@ -281,27 +306,16 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims // Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
Coordinate processor_coor(ndimension); Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1); Coordinate NodeDims (ndimension); Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension);
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
GetShmDims(WorldDims,ShmDims);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -418,7 +432,14 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// e.g. DGX1, supermicro board, // e.g. DGX1, supermicro board,
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank); cudaSetDevice(WorldShmRank);
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -714,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between sockets
// if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
for(int r=0;r<size;r++){
int hisRank=ShmRanks[r];
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
}
#endif
SharedMemoryTest(); SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////

View File

@@ -178,8 +178,8 @@ public:
private: private:
void dealloc(void) void dealloc(void)
{ {
alignedAllocator<vobj> alloc;
if( this->_odata_size ) { if( this->_odata_size ) {
alignedAllocator<vobj> alloc;
alloc.deallocate(this->_odata,this->_odata_size); alloc.deallocate(this->_odata,this->_odata_size);
this->_odata=nullptr; this->_odata=nullptr;
this->_odata_size=0; this->_odata_size=0;
@@ -187,16 +187,18 @@ private:
} }
void resize(uint64_t size) void resize(uint64_t size)
{ {
alignedAllocator<vobj> alloc;
if ( this->_odata_size != size ) { if ( this->_odata_size != size ) {
alignedAllocator<vobj> alloc;
dealloc(); dealloc();
}
this->_odata_size = size; this->_odata_size = size;
if ( size ) if ( size )
this->_odata = alloc.allocate(this->_odata_size); this->_odata = alloc.allocate(this->_odata_size);
else else
this->_odata = nullptr; this->_odata = nullptr;
} }
}
public: public:
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
@@ -346,7 +348,7 @@ public:
void reset(GridBase* grid) { void reset(GridBase* grid) {
if (this->_grid != grid) { if (this->_grid != grid) {
this->_grid = grid; this->_grid = grid;
this->_odata.resize(grid->oSites()); this->resize(grid->oSites());
this->checkerboard = 0; this->checkerboard = 0;
} }
} }

View File

@@ -292,7 +292,7 @@ void GridGpuInit(void)
gpu_props = new cudaDeviceProp[nDevices]; gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL; char * localRankStr = NULL;
int rank = 0, device = 0, world_rank=0; int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
@@ -301,23 +301,16 @@ void GridGpuInit(void)
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{ {
rank = atoi(localRankStr); rank = atoi(localRankStr);
device = rank %nDevices;
} }
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{ {
rank = atoi(localRankStr); rank = atoi(localRankStr);
device = rank %nDevices;
} }
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
cudaSetDevice(device);
if ( world_rank == 0 ) { if ( world_rank == 0 ) {
GridBanner(); GridBanner();
printf("GpuInit: ================================================\n");
printf("GpuInit: Setting up Cuda Device map before first MPI call\n",nDevices);
printf("GpuInit: ================================================\n");
printf("GpuInit: Cuda reports %d GPUs on MPI rank 0\n",nDevices);
} }
for (int i = 0; i < nDevices; i++) { for (int i = 0; i < nDevices; i++) {
@@ -325,7 +318,6 @@ void GridGpuInit(void)
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
// cudaGetDeviceProperties(&prop, i);
cudaGetDeviceProperties(&gpu_props[i], i); cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) { if ( world_rank == 0) {
cudaDeviceProp prop; cudaDeviceProp prop;
@@ -334,15 +326,13 @@ void GridGpuInit(void)
printf("GpuInit: Device Number : %d\n", i); printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n"); printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name); printf("GpuInit: Device identifier: %s\n", prop.name);
// printf("GpuInit: Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
GPU_PROP(managedMemory); GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize); GPU_PROP(warpSize);
#if 0 // GPU_PROP(unifiedAddressing);
GPU_PROP(unifiedAddressing); // GPU_PROP(l2CacheSize);
GPU_PROP(l2CacheSize); // GPU_PROP(singleToDoublePrecisionPerfRatio);
GPU_PROP(singleToDoublePrecisionPerfRatio);
#endif
} }
} }
if ( world_rank == 0 ) { if ( world_rank == 0 ) {

View File

@@ -272,7 +272,7 @@ struct Correlator: Serializable
{ {
GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>), GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
Metadata, info, Metadata, info,
std::vector<Complex>, corr); std::vector<Scalar>, corr);
}; };
END_HADRONS_NAMESPACE END_HADRONS_NAMESPACE

View File

@@ -144,7 +144,7 @@ void TWeakEye3pt<FImpl>::execute(void)
{ {
LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl; LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
LOG(Message) << "gIn : " << par().gammaIn << std::endl; LOG(Message) << "gIn : " << par().gammaIn << std::endl;
LOG(Message) << "gOut: " << par().gammaIn << std::endl; LOG(Message) << "gOut: " << par().gammaOut << std::endl;
LOG(Message) << "tOut: " << par().tOut << std::endl; LOG(Message) << "tOut: " << par().tOut << std::endl;
LOG(Message) << "qbl : " << par().qBarLeft << std::endl; LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
LOG(Message) << "qbr : " << par().qBarRight << std::endl; LOG(Message) << "qbr : " << par().qBarRight << std::endl;

View File

@@ -144,7 +144,7 @@ void TWeakNonEye3pt<FImpl>::execute(void)
{ {
LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl; LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
LOG(Message) << "gIn : " << par().gammaIn << std::endl; LOG(Message) << "gIn : " << par().gammaIn << std::endl;
LOG(Message) << "gOut: " << par().gammaIn << std::endl; LOG(Message) << "gOut: " << par().gammaOut << std::endl;
LOG(Message) << "ql : " << par().qLeft << std::endl; LOG(Message) << "ql : " << par().qLeft << std::endl;
LOG(Message) << "qbl : " << par().qBarLeft << std::endl; LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
LOG(Message) << "qr : " << par().qRight << std::endl; LOG(Message) << "qr : " << par().qRight << std::endl;

View File

@@ -67,7 +67,6 @@ AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h) AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@@ -136,6 +135,18 @@ case ${ac_SFW_FP16} in
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac esac
############### SUMMIT JSRUN
AC_ARG_ENABLE([summit],
[AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
[ac_JSRUN=${enable_summit}], [ac_SUMMIT=no])
case ${ac_SUMMIT} in
no);;
yes)
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
*)
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
esac
############### Intel libraries ############### Intel libraries
AC_ARG_ENABLE([mkl], AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -173,19 +184,6 @@ AC_ARG_WITH([hdf5],
[AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"] [AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"]) [AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"])
############### first-touch
AC_ARG_ENABLE([numa],
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
[ac_NUMA=${enable_NUMA}],[ac_NUMA=no])
case ${ac_NUMA} in
no)
;;
yes)
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
*)
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
esac
############### Checks for library functions ############### Checks for library functions
CXXFLAGS_CPY=$CXXFLAGS CXXFLAGS_CPY=$CXXFLAGS
@@ -241,10 +239,6 @@ AC_SEARCH_LIBS([crc32], [z],
[have_zlib=true] [LIBS="${LIBS} -lz"], [have_zlib=true] [LIBS="${LIBS} -lz"],
[AC_MSG_ERROR(zlib library was not found in your system.)]) [AC_MSG_ERROR(zlib library was not found in your system.)])
AC_SEARCH_LIBS([move_pages], [numa],
[AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
[have_libnuma=true] [LIBS="${LIBS} -lnuma"],
[AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp], AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])] [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
@@ -261,9 +255,9 @@ AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
AC_ARG_ENABLE([gen-simd-width], AC_ARG_ENABLE([gen-simd-width],
[AS_HELP_STRING([--enable-gen-simd-width=size], [AS_HELP_STRING([--enable-gen-simd-width=size],
[size (in bytes) of the generic SIMD vectors (default: 32)])], [size (in bytes) of the generic SIMD vectors (default: 64)])],
[ac_gen_simd_width=$enable_gen_simd_width], [ac_gen_simd_width=$enable_gen_simd_width],
[ac_gen_simd_width=32]) [ac_gen_simd_width=64])
AC_ARG_ENABLE([gen-scalar], AC_ARG_ENABLE([gen-scalar],
[AS_HELP_STRING([--enable-gen-scalar=yes|no], [AS_HELP_STRING([--enable-gen-scalar=yes|no],