diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h index 44929ca8..fa001adc 100644 --- a/lib/AlignedAllocator.h +++ b/lib/AlignedAllocator.h @@ -141,7 +141,7 @@ public: if ( bcast != ptr ) { std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); - BACKTRACEFILE(); + // BACKTRACEFILE(); exit(0); } assert( bcast == (void *) ptr); diff --git a/lib/Stencil.h b/lib/Stencil.h index f22acb6f..221785b4 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -32,8 +32,6 @@ #include // subdir aggregate -const int ShmDirectCopy = 1; - ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need @@ -170,13 +168,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes); + /* }else{ _grid->SendToRecvFromBegin(reqs[i], Packets[i].send_buf, @@ -185,17 +183,19 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Packets[i].from_rank, Packets[i].bytes); } + */ } commtime+=usecond(); } void CommunicateComplete(std::vector > &reqs) { commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); - else - _grid->SendToRecvFromComplete(reqs[i]); + // else + // _grid->SendToRecvFromComplete(reqs[i]); } commtime+=usecond(); } @@ -253,8 +253,6 @@ PARALLEL_FOR_LOOP // Flat vector, change layout for cache friendly. Vector _entries; - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - void PrecomputeByteOffsets(void){ for(int i=0;i<_entries.size();i++){ if( _entries[i]._is_local ) { @@ -265,9 +263,7 @@ PARALLEL_FOR_LOOP } }; - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = _entries[ent]._is_local; @@ -685,7 +681,9 @@ PARALLEL_FOR_LOOP _grid->StencilBarrier(); HaloGather(source,compress); this->CommunicateBegin(reqs); + _grid->StencilBarrier(); this->CommunicateComplete(reqs); + _grid->StencilBarrier(); CommsMerge(); // spins } @@ -823,11 +821,13 @@ PARALLEL_FOR_LOOP cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); - if ( (ShmDirectCopy==0)||send_buf==NULL ) { - cobj *send_buf = u_send_buf_p; + if ( (send_buf==NULL) ) { + send_buf = u_send_buf_p; } - + // std::cout << " send_bufs "<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if ((ShmDirectCopy==0)||(shm==NULL)) { + // if ((ShmDirectCopy==0)||(shm==NULL)) { + if (shm==NULL) { shm = rp; } diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 1272b6a2..91e9cf9b 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -69,7 +69,7 @@ int CartesianCommunicator::ProcessorCount(void) { return //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid //////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::RankWorld(void) { return WorldRank; }; +int CartesianCommunicator::RankWorld(void){ return WorldRank; }; int CartesianCommunicator::Ranks (void) { return WorldSize; }; int CartesianCommunicator::Nodes (void) { return GroupSize; }; int CartesianCommunicator::Cores (void) { return ShmSize; }; @@ -108,22 +108,22 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector CartesianCommunicator::ShmBufStorageVector; void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } + void *CartesianCommunicator::ShmBuffer(int rank) { - if (rank != ShmRank ) return NULL; - else return ShmCommBuf; + return NULL; } void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { - if (rank != ShmRank ) return NULL; - else return local_p; + return NULL; } void CartesianCommunicator::ShmInitGeneric(void){ ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmCommBuf=(void *)&ShmBufStorageVector[0]; + std::cout << "allocated persistent buffer"< &lis int from, int bytes) { -#if 1 +#if 0 + this->StencilBarrier(); + MPI_Request xrq; MPI_Request rrq; @@ -440,9 +442,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis PARALLEL_FOR_LOOP for(int w=0;w "<< gdest<<" " <StencilBarrier(); + if (small && (gfrom !=MPI_UNDEFINED) ) { T *ip = (T *)from_ptr; T *op = (T *)recv; PARALLEL_FOR_LOOP for(int w=0;wStencilBarrier(); + #else MPI_Request xrq; MPI_Request rrq; @@ -528,9 +521,6 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector XConnections; +static Vector< HandShake > RConnections; + void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); @@ -62,37 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { RConnections[pe].seq_local = 0; RConnections[pe].seq_remote= 0; } + WorldSize = shmem_n_pes(); + WorldRank = shmem_my_pe(); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; shmem_barrier_all(); ShmInitGeneric(); } - -// Should error check all MPI calls. -void CartesianCommunicator::Init(int *argc, char ***argv) { - int flag; - MPI_Initialized(&flag); // needed to coexist with other libs apparently - if ( !flag ) { - MPI_Init(argc,argv); - MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); - MPI_Comm_rank(communicator_world,&_WorldRank); - MPI_Comm_size(communicator_world,&_WorldSize); - _ShmRank=0; - _ShmSize=1; - _GroupRank=_WorldRank; - _GroupSize=_WorldSize; - _Slave =0; - } -} - - -typedef struct HandShake_t { - uint64_t seq_local; - uint64_t seq_remote; -} HandShake; - -static Vector< HandShake > XConnections; -static Vector< HandShake > RConnections; - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); @@ -261,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, if ( _processor == sender ) { - printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver); // Check he has posted a receive while(SendSeq->seq_remote == SendSeq->seq_local); - printf("Sender receive %d posted\n",sender,receiver); - // Advance our send count seq = ++(SendSeq->seq_local); @@ -275,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, shmem_putmem(recv,xmit,bytes,receiver); shmem_fence(); - printf("Sender sent payload %d\n",seq); //Notify him we're done shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_fence(); - printf("Sender ringing door bell %d\n",seq); } if ( _processor == receiver ) { - printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver); // Post a receive seq = ++(RecvSeq->seq_local); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); - printf("Receiver Opening letter box %d\n",seq); - - // Now wait until he has advanced our reception counter while(RecvSeq->seq_remote != RecvSeq->seq_local); - printf("Receiver Got the mail %d\n",seq); } }