Mirror of https://github.com/paboyle/Grid.git

Shmem comms [NO MPI] target added. The dwf test runs and passes.

Not really shaken out to my satisfaction yet, though, as I want more tests done, so don't declare it as working.
But committing my current state while I try a few experiments.
Peter Boyle 2016-02-14 14:24:38 -06:00
parent 294dbf1bf0
commit 41c2b09184
14 changed files with 157 additions and 31 deletions

configure

@@ -5898,7 +5898,7 @@ _ACEOF
echo Configuring for SHMEM communications
cat >>confdefs.h <<\_ACEOF
-#define GRID_COMMS_MPI 1
+#define GRID_COMMS_SHMEM 1
_ACEOF
;;


@@ -180,7 +180,7 @@ case ${ac_COMMS} in
;;
shmem)
echo Configuring for SHMEM communications
-AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_SHMEM] )
+AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
;;
*)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);


@@ -75,18 +75,30 @@ public:
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-pointer allocate(size_type __n, const void* = 0)
+pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef GRID_COMMS_SHMEM
-_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
#define PARANOID_SYMMETRIC_HEAP
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
shmem_barrier_all();
+_Tp *ptr = (_Tp *) shmem_align(128,__n*sizeof(_Tp));
+shmem_barrier_all();
-bcast = (void *) _Tp;
+bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
-assert( bcast == (void *) _Tp);
+if ( bcast != ptr ) {
+std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),(unsigned long)bcast,(unsigned long)ptr); std::fflush(stdout);
+BACKTRACEFILE();
+exit(0);
+}
+assert( bcast == (void *) ptr);
#endif
#else
#ifdef HAVE_MM_MALLOC_H
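
The PARANOID_SYMMETRIC_HEAP check above guards SHMEM's key allocation invariant: shmem_align is a collective on the symmetric heap, so every PE must perform the same allocations in the same order for the returned addresses to agree across PEs. The check broadcasts PE 0's pointer and compares it to the local one. A minimal standalone sketch of the same check, assuming an OpenSHMEM 1.2 implementation (names and sizes here are illustrative, not Grid's):

#include <shmem.h>
#include <cstdio>

static long psync[SHMEM_BCAST_SYNC_SIZE];
static void *bcast; // broadcast buffer must itself be symmetric, hence static

int main(void) {
  shmem_init();
  // pSync must start at SHMEM_SYNC_VALUE before first use, per the spec.
  for (int i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) psync[i] = SHMEM_SYNC_VALUE;
  shmem_barrier_all();

  void *ptr = shmem_align(128, 1024); // collective symmetric-heap allocation
  shmem_barrier_all();

  // Ship PE 0's address to every PE; if all PEs allocate in the same order,
  // the symmetric heap guarantees the addresses agree.
  bcast = ptr;
  shmem_broadcast32(&bcast, &bcast, sizeof(void *) / 4, 0, 0, 0,
                    shmem_n_pes(), psync);
  if (bcast != ptr) {
    std::printf("inconsistent alloc on pe %d\n", shmem_my_pe());
    shmem_global_exit(1);
  }
  shmem_free(ptr);
  shmem_finalize();
  return 0;
}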


@@ -15,12 +15,15 @@
/* EMPTY_SIMD only for DEBUGGING */
#undef EMPTY_SIMD
-/* GRID_COMMS_SHMEM */
+/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
+/* GRID_COMMS_SHMEM */
+#undef GRID_COMMS_SHMEM
/* GRID_DEFAULT_PRECISION is DOUBLE */
#undef GRID_DEFAULT_PRECISION_DOUBLE


@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI
#include <cshift/Cshift_mpi.h>
#endif
+#ifdef GRID_COMMS_SHMEM
+#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif
#endif


@@ -47,9 +47,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define __X86_64
-#ifdef HAVE_EXECINFO_H
-#include <execinfo.h>
-#endif
namespace Grid {
@@ -174,9 +171,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
/////////////////////////////////////////////////////////
void Grid_init(int *argc,char ***argv)
{
-#ifdef GRID_COMMS_MPI
-MPI_Init(argc,argv);
-#endif
+CartesianCommunicator::Init(argc,argv);
// Parse command line args.
GridLogger::StopWatch.Start();
@@ -284,7 +280,6 @@ double usecond(void) {
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
-#define _NBACKTRACE (256)
void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)


@@ -78,13 +78,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void)
{
-#ifdef GRID_COMMS_MPI
int me;
+#ifdef GRID_COMMS_MPI
MPI_Comm_rank(MPI_COMM_WORLD,&me);
+#endif
+#ifdef GRID_COMMS_SHMEM
+me = shmem_my_pe();
+#endif
if ( me ) {
std::cout.setstate(std::ios::badbit);
}
-#endif
}
void Grid_unquiesce_nodes(void)
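
Grid_quiesce_nodes now works for either comms target: it looks up the rank (MPI_Comm_rank or shmem_my_pe) and silences std::cout on every rank but 0 by putting the stream into a failed state, so subsequent inserts become no-ops. A standalone sketch of the trick, independent of any comms library:

#include <iostream>

// Swallow all std::cout output on non-root ranks by setting badbit.
void quiesce(int rank) {
  if (rank != 0) std::cout.setstate(std::ios::badbit);
}
// Restore the stream to a good state so output flows again.
void unquiesce() {
  std::cout.clear();
}

int main() {
  int rank = 1;               // pretend we are a non-root rank
  quiesce(rank);
  std::cout << "invisible\n"; // dropped: stream is in a failed state
  unquiesce();
  std::cout << "visible\n";   // printed normally
  return 0;
}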


@@ -29,9 +29,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef GRID_LOG_H
#define GRID_LOG_H
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
namespace Grid {
// Dress the output; use std::chrono for time stamping via the StopWatch class
+int Rank(void); // used for early stage debug before library init
class Logger {
@@ -89,5 +95,35 @@ extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+#ifdef HAVE_EXECINFO_H
+#define BACKTRACEFILE() {\
+  char string[20]; \
+  std::sprintf(string,"backtrace.%d",Rank()); \
+  std::FILE * fp = std::fopen(string,"w"); \
+  BACKTRACEFP(fp) \
+  std::fclose(fp); \
+}
+#define BACKTRACE() BACKTRACEFP(stdout)
+#define BACKTRACEFP(fp) { \
+  int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);\
+  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+  for (int i = 0; i < symbols; i++){\
+    std::fprintf(fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+  }\
+}
+#else
+#define BACKTRACE() BACKTRACEFP(stdout)
+#define BACKTRACEFP(fp) { \
+  std::fprintf(fp,"BT %d %lx\n",0,(unsigned long)__builtin_return_address(0)); std::fflush(fp); \
+  std::fprintf(fp,"BT %d %lx\n",1,(unsigned long)__builtin_return_address(1)); std::fflush(fp); \
+  std::fprintf(fp,"BT %d %lx\n",2,(unsigned long)__builtin_return_address(2)); std::fflush(fp); \
+  std::fprintf(fp,"BT %d %lx\n",3,(unsigned long)__builtin_return_address(3)); std::fflush(fp); \
+}
+#endif
}
#endif
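
BACKTRACEFP wraps the glibc backtrace facility from <execinfo.h>: backtrace() fills a buffer with raw return addresses and backtrace_symbols() turns them into printable strings. A minimal standalone sketch of the same calls (link with -rdynamic to get useful symbol names):

#include <execinfo.h>
#include <cstdio>
#include <cstdlib>

void print_backtrace(std::FILE *fp) {
  void *buffer[256];
  int symbols = backtrace(buffer, 256);                 // raw return addresses
  char **strings = backtrace_symbols(buffer, symbols);  // printable frames
  for (int i = 0; i < symbols; i++)
    std::fprintf(fp, "BackTrace Strings: %d %s\n", i, strings[i]);
  free(strings); // backtrace_symbols mallocs the array
}

int main() {
  print_backtrace(stdout);
  return 0;
}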


@@ -114,15 +114,19 @@ namespace Grid {
}
void Communicate(void ) {
+typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
+std::vector<CommsRequest_t> reqs(0);
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
-_grid->SendToRecvFrom(Packets[i].send_buf,
+_grid->SendToRecvFromBegin(reqs,
+Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes);
Packets[i].done = 1;
}
+_grid->SendToRecvFromComplete(reqs);
commtime+=usecond();
}
@@ -648,7 +652,7 @@ PARALLEL_FOR_LOOP
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
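
The Communicate() change above splits communication into a begin/complete pair: every packet's transfer is posted up front into reqs and a single SendToRecvFromComplete waits for all of them, letting the transfers proceed concurrently instead of serialising one blocking exchange per packet. Under MPI the same pattern is nonblocking point-to-point plus a single wait; a sketch under that assumption (helper names here are illustrative, not Grid's):

#include <mpi.h>
#include <vector>

// Post a nonblocking send and receive, accumulating the requests.
void send_recv_begin(std::vector<MPI_Request> &reqs, void *xmit, int dest,
                     void *recv, int from, int bytes) {
  MPI_Request rq;
  MPI_Irecv(recv, bytes, MPI_BYTE, from, 0, MPI_COMM_WORLD, &rq);
  reqs.push_back(rq);
  MPI_Isend(xmit, bytes, MPI_BYTE, dest, 0, MPI_COMM_WORLD, &rq);
  reqs.push_back(rq);
}

// Complete every outstanding transfer in one call.
void send_recv_complete(std::vector<MPI_Request> &reqs) {
  std::vector<MPI_Status> status(reqs.size());
  MPI_Waitall((int)reqs.size(), reqs.data(), status.data());
  reqs.clear();
}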


@@ -56,6 +56,8 @@ class CartesianCommunicator {
typedef int CommsRequest_t;
#endif
+static void Init(int *argc, char ***argv);
// Constructor
CartesianCommunicator(const std::vector<int> &pdimensions_in);


@@ -31,6 +31,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+MPI_Init(argc,argv);
+}
+int Rank(void) {
+int pe;
+MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+return pe;
+}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{


@@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h"
namespace Grid {
+void CartesianCommunicator::Init(int *argc, char ***argv)
+{
+}
+int Rank(void){ return 0; }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
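
With Init added to each communicator implementation, the comms layer chosen at configure time (GRID_COMMS_MPI, GRID_COMMS_SHMEM, or GRID_COMMS_NONE, per the AC_DEFINEs above) owns its own startup, and Grid_init just calls CartesianCommunicator::Init. A sketch of the compile-time dispatch this enables (illustrative, not Grid's code):

// Exactly one GRID_COMMS_* macro is defined by configure; the matching
// init body is the only one compiled in.
#if defined(GRID_COMMS_MPI)
#include <mpi.h>
static void comms_init(int *argc, char ***argv) { MPI_Init(argc, argv); }
#elif defined(GRID_COMMS_SHMEM)
#include <shmem.h>
static void comms_init(int *, char ***) { shmem_init(); }
#else
static void comms_init(int *, char ***) {} // serial build: nothing to start
#endif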


@@ -31,7 +31,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Should error check all MPI calls.
+#define SHMEM_VET(addr)
+#define SHMEM_VET_DEBUG(addr) { \
+if ( ! shmem_addr_accessible(addr,_processor) ) {\
+std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,(unsigned long)addr,__FUNCTION__,#addr); \
+BACKTRACEFILE(); \
+}\
+}
+int Rank(void) {
+return shmem_my_pe();
+}
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+shmem_init();
+}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
@@ -41,8 +54,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors;
_processor_coor.resize(_ndimension);
-// shmem_init_thread(SHMEM_THREAD_FUNNELED);
-start_pes(0);
_processor = shmem_my_pe();
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
@@ -50,10 +61,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
if ( _processor == 0 ) {
printf("I'm running SHMEM communications %d \n",_processor);
}
int Size = shmem_n_pes();
assert(Size==_Nprocessors);
}
@@ -85,6 +96,12 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
// Inefficient, but don't want to dynamic alloc
+if ( shmem_addr_accessible(f,_processor) ){
+shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
+return;
+}
for(int i=0;i<N;i++){
source = f[i];
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
@@ -108,6 +125,11 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+if ( shmem_addr_accessible(d,_processor) ){
+shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
+return;
+}
for(int i=0;i<N;i++){
source = d[i];
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
@@ -117,12 +139,13 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;
+assert(std::abs(shift) <_processors[dim]);
-coor[dim] = (coor[dim] + shift + _processors[dim])%_processors[dim];
+coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
-coor[dim] = (coor[dim] - shift + _processors[dim])%_processors[dim];
+coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
}
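
The rewritten ShiftedRanks computes both neighbours from the unmodified _processor_coor, fixing the old version's bug where the second shift started from the already-shifted coordinate. A worked standalone example of the modular arithmetic (illustrative values):

// A periodic ring of 4 processors in one dimension, viewed from
// coordinate 2 with shift = +1.
#include <cassert>
int main() {
  int P = 4, me = 2, shift = 1;
  int up   = (me + shift + P) % P; // neighbour at +shift: coordinate 3
  int down = (me - shift + P) % P; // neighbour at -shift: coordinate 1
  // "+ P" keeps the dividend non-negative: C++ % truncates toward zero, so
  // (0 - 1) % 4 would be -1, while (0 - 1 + 4) % 4 gives the intended 3.
  assert(up == 3 && down == 1);
  return 0;
}
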
@@ -144,6 +167,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
+SHMEM_VET(xmit);
+SHMEM_VET(recv);
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
@@ -171,6 +196,9 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list
int from,
int bytes)
{
+SHMEM_VET(xmit);
+SHMEM_VET(recv);
+// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
shmem_putmem(recv,xmit,bytes,dest);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -185,14 +213,37 @@ void CartesianCommunicator::Barrier(void)
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+static uint32_t word;
+uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
-shmem_broadcast32(data,data,bytes/4,root,0,0,_Nprocessors,psync);
+int words = bytes/4;
+for(int w=0;w<words;w++){
+word = array[w];
+shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
+if ( shmem_my_pe() != root ) {
+array[w] = word;
+}
+shmem_barrier_all();
+}
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+static uint32_t word;
+uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
-shmem_broadcast32(data,data,bytes/4,root,0,0,shmem_n_pes(),psync);
+int words = bytes/4;
+for(int w=0;w<words;w++){
+word = array[w];
+shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
+if ( shmem_my_pe() != root ) {
+array[w] = word;
+}
+shmem_barrier_all();
+}
}
}
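
The rewritten Broadcast pushes one 32-bit word at a time through a static variable because shmem_broadcast32 requires remotely accessible addresses, and the caller's `data` may live on the private heap; a static lives in the symmetric data segment, so every PE can address it. A sketch of the same staging pattern, assuming OpenSHMEM 1.2 (the function name is illustrative, not Grid's):

#include <shmem.h>
#include <cassert>
#include <cstdint>

static long psync[SHMEM_BCAST_SYNC_SIZE];
static uint32_t word; // symmetric staging buffer

void broadcast_words(int root, void *data, int bytes) {
  assert((bytes % 4) == 0);
  static bool psync_ready = false;
  if (!psync_ready) { // pSync must start at SHMEM_SYNC_VALUE, per the spec
    for (int i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) psync[i] = SHMEM_SYNC_VALUE;
    shmem_barrier_all();
    psync_ready = true;
  }
  uint32_t *array = (uint32_t *)data;
  for (int w = 0; w < bytes / 4; w++) {
    word = array[w];
    shmem_broadcast32(&word, &word, 1, root, 0, 0, shmem_n_pes(), psync);
    if (shmem_my_pe() != root) array[w] = word; // root already holds the value
    shmem_barrier_all(); // all PEs finished with `word` before it is reused
  }
}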


@@ -191,8 +191,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
-std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
-std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //