1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

mpi, mpi3, shmem all compile.

mpi, mpi3 pass single node multi-rank
This commit is contained in:
azusayamaguchi 2016-10-24 23:45:31 +01:00
parent b6a65059a2
commit b94478fa51
6 changed files with 49 additions and 85 deletions

View File

@ -141,7 +141,7 @@ public:
if ( bcast != ptr ) { if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
BACKTRACEFILE(); // BACKTRACEFILE();
exit(0); exit(0);
} }
assert( bcast == (void *) ptr); assert( bcast == (void *) ptr);

View File

@ -32,8 +32,6 @@
#include <Grid/stencil/Lebesgue.h> // subdir aggregate #include <Grid/stencil/Lebesgue.h> // subdir aggregate
const int ShmDirectCopy = 1;
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient // Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so need // gather to a point stencil code. CSHIFT is not the best way, so need
@ -170,13 +168,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
reqs.resize(Packets.size()); reqs.resize(Packets.size());
commtime-=usecond(); commtime-=usecond();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if( ShmDirectCopy ) {
_grid->StencilSendToRecvFromBegin(reqs[i], _grid->StencilSendToRecvFromBegin(reqs[i],
Packets[i].send_buf, Packets[i].send_buf,
Packets[i].to_rank, Packets[i].to_rank,
Packets[i].recv_buf, Packets[i].recv_buf,
Packets[i].from_rank, Packets[i].from_rank,
Packets[i].bytes); Packets[i].bytes);
/*
}else{ }else{
_grid->SendToRecvFromBegin(reqs[i], _grid->SendToRecvFromBegin(reqs[i],
Packets[i].send_buf, Packets[i].send_buf,
@ -185,17 +183,19 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
Packets[i].from_rank, Packets[i].from_rank,
Packets[i].bytes); Packets[i].bytes);
} }
*/
} }
commtime+=usecond(); commtime+=usecond();
} }
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
commtime-=usecond(); commtime-=usecond();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if( ShmDirectCopy ) // if( ShmDirectCopy )
_grid->StencilSendToRecvFromComplete(reqs[i]); _grid->StencilSendToRecvFromComplete(reqs[i]);
else // else
_grid->SendToRecvFromComplete(reqs[i]); // _grid->SendToRecvFromComplete(reqs[i]);
} }
commtime+=usecond(); commtime+=usecond();
} }
@ -253,8 +253,6 @@ PARALLEL_FOR_LOOP
// Flat vector, change layout for cache friendly. // Flat vector, change layout for cache friendly.
Vector<StencilEntry> _entries; Vector<StencilEntry> _entries;
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
void PrecomputeByteOffsets(void){ void PrecomputeByteOffsets(void){
for(int i=0;i<_entries.size();i++){ for(int i=0;i<_entries.size();i++){
if( _entries[i]._is_local ) { if( _entries[i]._is_local ) {
@ -265,9 +263,7 @@ PARALLEL_FOR_LOOP
} }
}; };
inline uint64_t Touch(int ent) { inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
// _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
}
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
local = _entries[ent]._is_local; local = _entries[ent]._is_local;
@ -685,7 +681,9 @@ PARALLEL_FOR_LOOP
_grid->StencilBarrier(); _grid->StencilBarrier();
HaloGather(source,compress); HaloGather(source,compress);
this->CommunicateBegin(reqs); this->CommunicateBegin(reqs);
_grid->StencilBarrier();
this->CommunicateComplete(reqs); this->CommunicateComplete(reqs);
_grid->StencilBarrier();
CommsMerge(); // spins CommsMerge(); // spins
} }
@ -823,11 +821,13 @@ PARALLEL_FOR_LOOP
cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
if ( (ShmDirectCopy==0)||send_buf==NULL ) { if ( (send_buf==NULL) ) {
cobj *send_buf = u_send_buf_p; send_buf = u_send_buf_p;
} }
// std::cout << " send_bufs "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl;
t_data-=usecond(); t_data-=usecond();
assert(u_send_buf_p!=NULL);
assert(send_buf!=NULL);
Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
t_data+=usecond(); t_data+=usecond();
@ -931,7 +931,8 @@ PARALLEL_FOR_LOOP
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
if ((ShmDirectCopy==0)||(shm==NULL)) { // if ((ShmDirectCopy==0)||(shm==NULL)) {
if (shm==NULL) {
shm = rp; shm = rp;
} }

View File

@ -69,7 +69,7 @@ int CartesianCommunicator::ProcessorCount(void) { return
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void) { return WorldRank; }; int CartesianCommunicator::RankWorld(void){ return WorldRank; };
int CartesianCommunicator::Ranks (void) { return WorldSize; }; int CartesianCommunicator::Ranks (void) { return WorldSize; };
int CartesianCommunicator::Nodes (void) { return GroupSize; }; int CartesianCommunicator::Nodes (void) { return GroupSize; };
int CartesianCommunicator::Cores (void) { return ShmSize; }; int CartesianCommunicator::Cores (void) { return ShmSize; };
@ -108,22 +108,22 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
{ {
SendToRecvFromComplete(waitall); SendToRecvFromComplete(waitall);
} }
void StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) { void *CartesianCommunicator::ShmBuffer(int rank) {
if (rank != ShmRank ) return NULL; return NULL;
else return ShmCommBuf;
} }
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
if (rank != ShmRank ) return NULL; return NULL;
else return local_p;
} }
void CartesianCommunicator::ShmInitGeneric(void){ void CartesianCommunicator::ShmInitGeneric(void){
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0]; ShmCommBuf=(void *)&ShmBufStorageVector[0];
std::cout << "allocated persistent buffer"<<std::hex << ShmCommBuf << std::dec<<std::endl;
} }
#endif #endif

View File

@ -400,7 +400,9 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from, int from,
int bytes) int bytes)
{ {
#if 1 #if 0
this->StencilBarrier();
MPI_Request xrq; MPI_Request xrq;
MPI_Request rrq; MPI_Request rrq;
@ -440,9 +442,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) { for(int w=0;w<words;w++) {
op[w]=ip[w]; op[w]=ip[w];
if ( w == 0 ) {
// std::cout << " xmit "<< ShmRank <<" -> "<< gdest<<" " <<std::hex<<op[w]<<std::dec<<std::endl;
}
} }
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor)); bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
@ -453,9 +452,7 @@ PARALLEL_FOR_LOOP
list.push_back(xrq); list.push_back(xrq);
} }
MPI_Win_sync (ShmWindow); this->StencilBarrier();
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
if (small && (gfrom !=MPI_UNDEFINED) ) { if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr; T *ip = (T *)from_ptr;
@ -463,9 +460,6 @@ PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) { for(int w=0;w<words;w++) {
op[w]=ip[w]; op[w]=ip[w];
if ( w == 0 ) {
// std::cout << " recv "<< ShmRank <<" <- "<< gfrom<<" " <<std::hex<<op[w]<<std::dec<<std::endl;
}
} }
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag)); bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check)); bcopy(&from_ptr[bytes+4],&check,sizeof(check));
@ -477,9 +471,8 @@ PARALLEL_FOR_LOOP
list.push_back(rrq); list.push_back(rrq);
} }
MPI_Win_sync (ShmWindow); this->StencilBarrier();
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
#else #else
MPI_Request xrq; MPI_Request xrq;
MPI_Request rrq; MPI_Request rrq;
@ -528,9 +521,6 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
list.push_back(rrq); list.push_back(rrq);
} }
StencilBarrier();
} }

View File

@ -47,8 +47,8 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
WorldSize = 1; WorldSize = 1;
ShmRank=0; ShmRank=0;
ShmSize=1; ShmSize=1;
GroupRank=_WorldRank; GroupRank=WorldRank;
GroupSize=_WorldSize; GroupSize=WorldSize;
Slave =0; Slave =0;
ShmInitGeneric(); ShmInitGeneric();
} }

View File

@ -44,13 +44,16 @@ namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize; typedef struct HandShake_t {
int CartesianCommunicator::GroupRank; uint64_t seq_local;
int CartesianCommunicator::GroupSize; uint64_t seq_remote;
int CartesianCommunicator::WorldRank; } HandShake;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init(); shmem_init();
@ -62,37 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_local = 0; RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0; RConnections[pe].seq_remote= 0;
} }
WorldSize = shmem_n_pes();
WorldRank = shmem_my_pe();
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
shmem_barrier_all(); shmem_barrier_all();
ShmInitGeneric(); ShmInitGeneric();
} }
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&_WorldRank);
MPI_Comm_size(communicator_world,&_WorldSize);
_ShmRank=0;
_ShmSize=1;
_GroupRank=_WorldRank;
_GroupSize=_WorldSize;
_Slave =0;
}
}
typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_ndimension = processors.size(); _ndimension = processors.size();
@ -261,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
if ( _processor == sender ) { if ( _processor == sender ) {
printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
// Check he has posted a receive // Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local); while(SendSeq->seq_remote == SendSeq->seq_local);
printf("Sender receive %d posted\n",sender,receiver);
// Advance our send count // Advance our send count
seq = ++(SendSeq->seq_local); seq = ++(SendSeq->seq_local);
@ -275,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
shmem_putmem(recv,xmit,bytes,receiver); shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence(); shmem_fence();
printf("Sender sent payload %d\n",seq);
//Notify him we're done //Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence(); shmem_fence();
printf("Sender ringing door bell %d\n",seq);
} }
if ( _processor == receiver ) { if ( _processor == receiver ) {
printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
// Post a receive // Post a receive
seq = ++(RecvSeq->seq_local); seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
printf("Receiver Opening letter box %d\n",seq);
// Now wait until he has advanced our reception counter // Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local); while(RecvSeq->seq_remote != RecvSeq->seq_local);
printf("Receiver Got the mail %d\n",seq);
} }
} }