
Shmem comms [NO MPI] target added. The dwf test runs and passes.

Not really shaken out to my satisfaction though, as I want more tests done, so don't declare it as working.
But committing my current state while I try a few experimental changes.
Peter Boyle 2016-02-14 14:24:38 -06:00
parent 294dbf1bf0
commit 41c2b09184
14 changed files with 157 additions and 31 deletions
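
For orientation before the per-file diffs: process start-up is routed through a new static CartesianCommunicator::Init, with one definition per comms backend, so Grid_init no longer calls MPI_Init directly and the SHMEM build needs no MPI at all. A condensed sketch of the pattern, not the actual file layout (the real code keeps each branch in its own communicator source file, selected by the configure-time GRID_COMMS_* define):

  // Condensed sketch: each backend supplies its own definition of Init;
  // configure defines exactly one GRID_COMMS_* macro (--enable-comms=...).
  void CartesianCommunicator::Init(int *argc, char ***argv) {
  #if defined(GRID_COMMS_MPI)
    MPI_Init(argc, argv);   // MPI backend
  #elif defined(GRID_COMMS_SHMEM)
    shmem_init();           // OpenSHMEM backend
  #else
    // GRID_COMMS_NONE: single process, nothing to initialise
  #endif
  }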

---
configure (vendored)

@@ -5898,7 +5898,7 @@ _ACEOF
 echo Configuring for SHMEM communications
 cat >>confdefs.h <<\_ACEOF
-#define GRID_COMMS_MPI 1
+#define GRID_COMMS_SHMEM 1
 _ACEOF
 ;;

---

@@ -180,7 +180,7 @@ case ${ac_COMMS} in
 ;;
 shmem)
 echo Configuring for SHMEM communications
-AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_SHMEM] )
+AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
 ;;
 *)
 AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);

---

@@ -75,18 +75,30 @@ public:
   size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-  pointer allocate(size_type __n, const void* = 0)
+  pointer allocate(size_type __n, const void* _p= 0)
   {
 #ifdef GRID_COMMS_SHMEM
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#define PARANOID_SYMMETRIC_HEAP
+#ifdef PARANOID_SYMMETRIC_HEAP
     static void * bcast;
     static long psync[_SHMEM_REDUCE_SYNC_SIZE];
-    shmem_barrier_all();
-    _Tp *ptr = (_Tp *) shmem_align(128,__n*sizeof(_Tp));
-    shmem_barrier_all();
-    bcast = (void *) _Tp;
+    bcast = (void *) ptr;
     shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
-    assert( bcast == (void *) _Tp);
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      BACKTRACEFILE();
+      exit(0);
+    }
+    assert( bcast == (void *) ptr);
+#endif
 #else
 #ifdef HAVE_MM_MALLOC_H
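
For context, the PARANOID_SYMMETRIC_HEAP block checks that shmem_align returned the same symmetric-heap address on every PE, which SHMEM's one-sided operations rely on. A standalone sketch of the same check, assuming a standard OpenSHMEM implementation (argument order per the OpenSHMEM spec, and separate source/target words to stay within spec rules):

  #include <shmem.h>
  #include <cstdio>
  #include <cstdlib>

  int main(void) {
    shmem_init();
    void *ptr = shmem_align(64, 1024);       // symmetric-heap allocation
    static void *mine, *roots;               // symmetric scratch words
    static long psync[_SHMEM_BCAST_SYNC_SIZE];
    for (int i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i++) psync[i] = _SHMEM_SYNC_VALUE;
    mine = ptr; roots = ptr;
    shmem_barrier_all();                     // psync and data ready everywhere
    // Fetch PE 0's pointer into 'roots' on every other PE.
    shmem_broadcast32(&roots, &mine, sizeof(void *) / 4, 0, 0, 0, shmem_n_pes(), psync);
    if (roots != ptr) {                      // allocation drifted between PEs
      std::printf("inconsistent alloc on pe %d\n", shmem_my_pe());
      std::exit(1);
    }
    shmem_finalize();
    return 0;
  }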

---

@@ -15,12 +15,15 @@
 /* EMPTY_SIMD only for DEBUGGING */
 #undef EMPTY_SIMD
-/* GRID_COMMS_SHMEM */
+/* GRID_COMMS_MPI */
 #undef GRID_COMMS_MPI
 /* GRID_COMMS_NONE */
 #undef GRID_COMMS_NONE
+/* GRID_COMMS_SHMEM */
+#undef GRID_COMMS_SHMEM
 /* GRID_DEFAULT_PRECISION is DOUBLE */
 #undef GRID_DEFAULT_PRECISION_DOUBLE

---

@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI
 #include <cshift/Cshift_mpi.h>
 #endif
+#ifdef GRID_COMMS_SHMEM
+#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif
 #endif

---

@@ -47,9 +47,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define __X86_64
-#ifdef HAVE_EXECINFO_H
-#include <execinfo.h>
-#endif
 namespace Grid {

@@ -174,9 +171,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 /////////////////////////////////////////////////////////
 void Grid_init(int *argc,char ***argv)
 {
-#ifdef GRID_COMMS_MPI
-  MPI_Init(argc,argv);
-#endif
+  CartesianCommunicator::Init(argc,argv);
   // Parse command line args.
   GridLogger::StopWatch.Start();

@@ -284,7 +280,6 @@ double usecond(void) {
   return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
-#define _NBACKTRACE (256)
 void * Grid_backtrace_buffer[_NBACKTRACE];
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)

---

@@ -78,13 +78,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void)
 {
-#ifdef GRID_COMMS_MPI
   int me;
+#ifdef GRID_COMMS_MPI
   MPI_Comm_rank(MPI_COMM_WORLD,&me);
+#endif
+#ifdef GRID_COMMS_SHMEM
+  me = shmem_my_pe();
+#endif
   if ( me ) {
     std::cout.setstate(std::ios::badbit);
   }
-#endif
 }
 void Grid_unquiesce_nodes(void)

---

@@ -29,9 +29,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
 namespace Grid {
 // Dress the output; use std::chrono for time stamping via the StopWatch class
+int Rank(void); // used for early stage debug before library init
 class Logger {

@@ -89,5 +95,35 @@ extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative ;
 extern GridLogger GridLogIntegrator ;
+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
+#ifdef HAVE_EXECINFO_H
+#define BACKTRACEFILE() {\
+  char string[20]; \
+  std::sprintf(string,"backtrace.%d",Rank()); \
+  std::FILE * fp = std::fopen(string,"w"); \
+  BACKTRACEFP(fp)\
+  std::fclose(fp); \
+}
+#define BACKTRACE() BACKTRACEFP(std::stdout)
+#define BACKTRACEFP(fp) { \
+  int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
+  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+  for (int i = 0; i < symbols; i++){\
+    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+  }\
+}
+#else
+#define BACKTRACE() BACKTRACEFP(std::stdout);
+#define BACKTRACEFP(fp) { \
+  for (int i = 0; i < 4; i++){\
+    std::fprintf (fp,"BT %d %lx\n",i, __builtin_return_address(i)); std::fflush(fp); \
+  }\
+}
+#endif
 }
 #endif
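
BACKTRACEFILE() gives each rank its own backtrace.<rank> dump, callable before the logger is fully initialised (hence the free-standing Rank()). A minimal sketch of the same idea as a plain function, assuming glibc's execinfo.h and a stand-in for Grid's Rank():

  #include <execinfo.h>
  #include <cstdio>
  #include <cstdlib>

  static int rank_stub(void) { return 0; }   // stand-in for Grid's Rank()

  void backtrace_to_file(void) {
    void *buffer[256];                       // mirrors _NBACKTRACE
    char name[32];
    std::snprintf(name, sizeof(name), "backtrace.%d", rank_stub());
    std::FILE *fp = std::fopen(name, "w");
    if (!fp) return;
    int n = backtrace(buffer, 256);          // capture raw return addresses
    char **symbols = backtrace_symbols(buffer, n); // symbolise (malloc'd)
    for (int i = 0; i < n; i++)
      std::fprintf(fp, "BackTrace Strings: %d %s\n", i, symbols[i]);
    std::free(symbols);
    std::fclose(fp);
  }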

---

@@ -114,15 +114,19 @@ namespace Grid {
 }
 void Communicate(void ) {
+  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
+  std::vector<CommsRequest_t> reqs(0);
   commtime-=usecond();
   for(int i=0;i<Packets.size();i++){
-    _grid->SendToRecvFrom(Packets[i].send_buf,
+    _grid->SendToRecvFromBegin(reqs,
+                               Packets[i].send_buf,
                                Packets[i].to_rank,
                                Packets[i].recv_buf,
                                Packets[i].from_rank,
                                Packets[i].bytes);
     Packets[i].done = 1;
   }
+  _grid->SendToRecvFromComplete(reqs);
   commtime+=usecond();
 }

@@ -648,7 +652,7 @@ PARALLEL_FOR_LOOP
 int recv_from_rank;
 int xmit_to_rank;
 _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 assert (xmit_to_rank != _grid->ThisRank());
 assert (recv_from_rank != _grid->ThisRank());
 // FIXME Implement asynchronous send & also avoid buffer copy
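
The Communicate() change above is the point of the new API: instead of completing each SendToRecvFrom in turn, every packet is issued through SendToRecvFromBegin against a shared request list and completed once at the end, so a backend can keep all transfers in flight together. Under SHMEM (per the communicator diff below) Begin fires a shmem_putmem immediately and CommsRequest_t is a plain int; under MPI the requests are presumably real handles to wait on, per the #ifdef in the communicator header. A caller-side sketch of the pattern, condensed from the diff:

  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  std::vector<CommsRequest_t> reqs;
  for (int i = 0; i < Packets.size(); i++) {
    _grid->SendToRecvFromBegin(reqs,
                               Packets[i].send_buf, Packets[i].to_rank,
                               Packets[i].recv_buf, Packets[i].from_rank,
                               Packets[i].bytes);  // issue, do not wait
  }
  _grid->SendToRecvFromComplete(reqs);             // one completion point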

---

@@ -56,6 +56,8 @@ class CartesianCommunicator {
   typedef int CommsRequest_t;
 #endif
+
+  static void Init(int *argc, char ***argv);
   // Constructor
   CartesianCommunicator(const std::vector<int> &pdimensions_in);

---

@@ -31,6 +31,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 // Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+  MPI_Init(argc,argv);
+}
+int Rank(void) {
+  int pe;
+  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+  return pe;
+}
+
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {

---

@@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "Grid.h"
 namespace Grid {
+void CartesianCommunicator::Init(int *argc, char *** arv)
+{
+}
+int Rank(void ){ return 0; };
+
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
   _processors = processors;

---

@@ -31,7 +31,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 // Should error check all MPI calls.
+#define SHMEM_VET(addr)
+#define SHMEM_VET_DEBUG(addr) { \
+  if ( ! shmem_addr_accessible(addr,_processor) ) {\
+    std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
+    BACKTRACEFILE(); \
+  }\
+}
+int Rank(void) {
+  return shmem_my_pe();
+}
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+  shmem_init();
+}
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
   _ndimension = processors.size();

@@ -41,8 +54,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   _processors = processors;
   _processor_coor.resize(_ndimension);
-  // shmem_init_thread(SHMEM_THREAD_FUNNELED);
-  start_pes(0);
   _processor = shmem_my_pe();
   Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);

@@ -50,10 +61,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   for(int i=0;i<_ndimension;i++){
     _Nprocessors*=_processors[i];
   }
+  if ( _processor == 0 ) {
+    printf("I'm running SHMEM communications %d \n",_processor);
+  }
   int Size = shmem_n_pes();
   assert(Size==_Nprocessors);
 }
@@ -85,6 +96,12 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
   static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
   static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+  // Inefficient, but don't want to dynamic alloc
+  if ( shmem_addr_accessible(f,_processor) ){
+    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
+    return;
+  }
   for(int i=0;i<N;i++){
     source = f[i];
     shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);

@@ -108,6 +125,11 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
   static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
   static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+  if ( shmem_addr_accessible(d,_processor) ){
+    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
+    return;
+  }
   for(int i=0;i<N;i++){
     source = d[i];
     shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
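
The insertions above add a fast path: SHMEM reductions require symmetric source/target buffers, so an arbitrary user buffer is staged one element at a time through static symmetric scratch, while a buffer that shmem_addr_accessible reports as remotely accessible can be reduced in place with a single collective. A sketch of the double-precision case, assuming an OpenSHMEM environment (per the spec, psync should be initialised to _SHMEM_SYNC_VALUE before first use, which is omitted here as in the diff):

  #include <shmem.h>

  static double source, dest;                             // symmetric scratch
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long   psync[_SHMEM_REDUCE_SYNC_SIZE];

  void global_sum_vector(double *d, int N) {
    if (shmem_addr_accessible(d, shmem_my_pe())) {
      // Symmetric buffer: one collective over all N elements, in place.
      shmem_double_sum_to_all(d, d, N, 0, 0, shmem_n_pes(), llwrk, psync);
      return;
    }
    for (int i = 0; i < N; i++) {                         // element-wise staging
      source = d[i];
      shmem_double_sum_to_all(&dest, &source, 1, 0, 0, shmem_n_pes(), llwrk, psync);
      d[i] = dest;
    }
  }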
@@ -117,12 +139,13 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   std::vector<int> coor = _processor_coor;
+  assert(std::abs(shift) <_processors[dim]);
-  coor[dim] = (coor[dim] + shift + _processors[dim])%_processors[dim];
+  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
   Lexicographic::IndexFromCoor(coor,source,_processors);
-  coor[dim] = (coor[dim] - shift + _processors[dim])%_processors[dim];
+  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
   Lexicographic::IndexFromCoor(coor,dest,_processors);
 }
@@ -144,6 +167,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
   int from,
   int bytes)
 {
+  SHMEM_VET(xmit);
+  SHMEM_VET(recv);
   std::vector<CommsRequest_t> reqs(0);
   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
   SendToRecvFromComplete(reqs);

@@ -171,6 +196,9 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   int from,
   int bytes)
 {
+  SHMEM_VET(xmit);
+  SHMEM_VET(recv);
+  // shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
   shmem_putmem(recv,xmit,bytes,dest);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -185,14 +213,37 @@ void CartesianCommunicator::Barrier(void)
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
   static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+  static uint32_t word;
+  uint32_t *array = (uint32_t *) data;
   assert( (bytes % 4)==0);
-  shmem_broadcast32(data,data,bytes/4,root,0,0,_Nprocessors,psync);
+  int words = bytes/4;
+  for(int w=0;w<words;w++){
+    word = array[w];
+    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
+    if ( shmem_my_pe() != root ) {
+      array[w] = word;
+    }
+    shmem_barrier_all();
+  }
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+  static uint32_t word;
+  uint32_t *array = (uint32_t *) data;
   assert( (bytes % 4)==0);
-  shmem_broadcast32(data,data,bytes/4,root,0,0,shmem_n_pes(),psync);
+  int words = bytes/4;
+  for(int w=0;w<words;w++){
+    word = array[w];
+    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
+    if ( shmem_my_pe() != root ) {
+      array[w]= word;
+    }
+    shmem_barrier_all();
+  }
 }
 }
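
The rewritten Broadcast/BroadcastWorld no longer hand shmem_broadcast32 the caller's buffer, which generally lives in private (non-symmetric) memory. Instead each 32-bit word is staged through a static symmetric word, broadcast, copied back on non-root PEs (the collective does not write the root's target), and fenced with a barrier before the scratch word is reused. A sketch of the staging loop, assuming OpenSHMEM and a psync pre-initialised to _SHMEM_SYNC_VALUE:

  #include <shmem.h>
  #include <cstdint>
  #include <cassert>

  static uint32_t word;                            // symmetric staging word
  static long psync[_SHMEM_BCAST_SYNC_SIZE];       // assume pre-initialised

  void broadcast_bytes(int root, void *data, int bytes) {
    assert(bytes % 4 == 0);
    uint32_t *array = (uint32_t *)data;            // private user buffer
    for (int w = 0; w < bytes / 4; w++) {
      word = array[w];                             // stage one word
      shmem_broadcast32(&word, &word, 1, root, 0, 0, shmem_n_pes(), psync);
      if (shmem_my_pe() != root) array[w] = word;  // root's copy already correct
      shmem_barrier_all();                         // word is reused next iteration
    }
  }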

---

@@ -191,8 +191,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
 int words = sizeof(vobj)/sizeof(vector_type);
-std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
-std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
 int bytes = buffer_size*sizeof(scalar_object);
 std::vector<scalar_object *> pointers(Nsimd); //
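
Finally, the switch of the Cshift buffers from std::vector to Grid's Vector matters under SHMEM: Vector is (assuming Grid's usual alias) a std::vector bound to the alignedAllocator patched above, so its storage comes from shmem_align and lives on the symmetric heap where shmem_putmem can reach it, whereas a plain std::vector would hand SHMEM private memory. Roughly:

  // Assumed shape of Grid's Vector alias: storage via alignedAllocator,
  // hence shmem_align / the symmetric heap under GRID_COMMS_SHMEM.
  template<class T> using Vector = std::vector<T, Grid::alignedAllocator<T>>;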