mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Stencil working in SHM MPI3
This commit is contained in:
parent
6ea2184e18
commit
485ad6fde0
126
lib/Stencil.h
126
lib/Stencil.h
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -25,12 +25,10 @@
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#ifndef GRID_STENCIL_H
|
#ifndef GRID_STENCIL_H
|
||||||
#define GRID_STENCIL_H
|
#define GRID_STENCIL_H
|
||||||
|
|
||||||
#include <thread>
|
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||||
|
|
||||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Must not lose sight that goal is to be able to construct really efficient
|
// Must not lose sight that goal is to be able to construct really efficient
|
||||||
@ -80,9 +78,10 @@ template<class vobj,class cobj,class compressor>
|
|||||||
void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
|
void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
|
||||||
{
|
{
|
||||||
int num=table.size();
|
int num=table.size();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int i=0;i<num;i++){
|
for(int i=0;i<num;i++){
|
||||||
vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
|
vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
|
||||||
|
// buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,6 +93,8 @@ struct StencilEntry {
|
|||||||
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//extern int dump;
|
||||||
|
|
||||||
template<class vobj,class cobj>
|
template<class vobj,class cobj>
|
||||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
public:
|
public:
|
||||||
@ -143,30 +144,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
Packets[i].recv_buf,
|
Packets[i].recv_buf,
|
||||||
Packets[i].from_rank,
|
Packets[i].from_rank,
|
||||||
Packets[i].bytes);
|
Packets[i].bytes);
|
||||||
/*
|
|
||||||
}else{
|
|
||||||
_grid->SendToRecvFromBegin(reqs[i],
|
|
||||||
Packets[i].send_buf,
|
|
||||||
Packets[i].to_rank,
|
|
||||||
Packets[i].recv_buf,
|
|
||||||
Packets[i].from_rank,
|
|
||||||
Packets[i].bytes);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
commtime+=usecond();
|
commtime+=usecond();
|
||||||
}
|
}
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
commtime-=usecond();
|
commtime-=usecond();
|
||||||
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
// if( ShmDirectCopy )
|
|
||||||
_grid->StencilSendToRecvFromComplete(reqs[i]);
|
_grid->StencilSendToRecvFromComplete(reqs[i]);
|
||||||
// else
|
|
||||||
// _grid->SendToRecvFromComplete(reqs[i]);
|
|
||||||
}
|
}
|
||||||
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
commtime+=usecond();
|
commtime+=usecond();
|
||||||
|
/*
|
||||||
|
if(dump){
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
cobj * ptr = (cobj *) Packets[i].recv_buf;
|
||||||
|
uint64_t num=Packets[i].bytes/sizeof(cobj);
|
||||||
|
std::cout << " CommunicateComplete " << i<< " / " << Packets.size()<< " num " << num <<std::endl;
|
||||||
|
std::stringstream ss;
|
||||||
|
ss<<"recvbuf";
|
||||||
|
for(int d=0;d<_grid->_ndimension;d++){
|
||||||
|
ss<<"."<<_grid->_processor_coor[d];
|
||||||
|
}
|
||||||
|
ss<<"_mu_"<<i;
|
||||||
|
std::string fname(ss.str());
|
||||||
|
std::ofstream fout(fname);
|
||||||
|
for(int k=0;k<num;k++) {
|
||||||
|
fout << i<<" "<<k<<" "<<ptr[k]<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dump =0;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
@ -193,14 +202,24 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
void CommsMerge(void ) {
|
void CommsMerge(void ) {
|
||||||
|
|
||||||
for(int i=0;i<Mergers.size();i++){
|
for(int i=0;i<Mergers.size();i++){
|
||||||
|
|
||||||
mergetime-=usecond();
|
mergetime-=usecond();
|
||||||
|
|
||||||
|
// std::cout << "Merge " <<i << std::endl;
|
||||||
|
// std::stringstream ss;
|
||||||
|
// ss<<"mergebuf";
|
||||||
|
// for(int d=0;d<_grid->_ndimension;d++){
|
||||||
|
// ss<<"."<<_grid->_processor_coor[d];
|
||||||
|
// }
|
||||||
|
// ss<<"_m_"<<i;
|
||||||
|
// std::string fname(ss.str());
|
||||||
|
// std::ofstream fout(fname);
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||||
|
// fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -275,8 +294,8 @@ PARALLEL_FOR_LOOP
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
// Timing info; ugly; possibly temporary
|
// Timing info; ugly; possibly temporary
|
||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
#define TIMING_HACK
|
#define TIMING_HACK
|
||||||
#ifdef TIMING_HACK
|
#ifdef TIMING_HACK
|
||||||
double jointime;
|
double jointime;
|
||||||
double gathertime;
|
double gathertime;
|
||||||
double commtime;
|
double commtime;
|
||||||
@ -363,7 +382,9 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
_checkerboard = checkerboard;
|
_checkerboard = checkerboard;
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
// the permute type
|
// the permute type
|
||||||
|
//////////////////////////
|
||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
@ -373,9 +394,11 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
// Underlying approach. For each local site build
|
// Underlying approach. For each local site build
|
||||||
// up a table containing the npoint "neighbours" and whether they
|
// up a table containing the npoint "neighbours" and whether they
|
||||||
// live in lattice or a comms buffer.
|
// live in lattice or a comms buffer.
|
||||||
|
//////////////////////////
|
||||||
if ( !comm_dim ) {
|
if ( !comm_dim ) {
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||||
@ -386,11 +409,11 @@ PARALLEL_FOR_LOOP
|
|||||||
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
}
|
}
|
||||||
} else { // All permute extract done in comms phase prior to Stencil application
|
} else {
|
||||||
|
// All permute extract done in comms phase prior to Stencil application
|
||||||
// So tables are the same whether comm_dim or splice_dim
|
// So tables are the same whether comm_dim or splice_dim
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||||
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
Comms(point,dimension,shift,0x3);
|
Comms(point,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
@ -482,9 +505,11 @@ PARALLEL_FOR_LOOP
|
|||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
|
// done in reduced dims, so SIMD factored
|
||||||
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||||
|
|
||||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||||
|
|
||||||
// send to one or more remote nodes.
|
// send to one or more remote nodes.
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
@ -707,6 +732,8 @@ PARALLEL_FOR_LOOP
|
|||||||
template<class compressor>
|
template<class compressor>
|
||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
|
||||||
// conformable(source._grid,_grid);
|
// conformable(source._grid,_grid);
|
||||||
assert(source._grid==_grid);
|
assert(source._grid==_grid);
|
||||||
halogtime-=usecond();
|
halogtime-=usecond();
|
||||||
@ -767,8 +794,7 @@ PARALLEL_FOR_LOOP
|
|||||||
if ( !face_table_computed ) {
|
if ( !face_table_computed ) {
|
||||||
t_table-=usecond();
|
t_table-=usecond();
|
||||||
face_table.resize(face_idx+1);
|
face_table.resize(face_idx+1);
|
||||||
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
|
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
|
||||||
face_table[face_idx]);
|
|
||||||
t_table+=usecond();
|
t_table+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -789,12 +815,11 @@ PARALLEL_FOR_LOOP
|
|||||||
cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
|
cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
|
||||||
if ( send_buf==NULL ) {
|
if ( send_buf==NULL ) {
|
||||||
send_buf = u_send_buf_p;
|
send_buf = u_send_buf_p;
|
||||||
}
|
}
|
||||||
// std::cout << " send_bufs "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl;
|
|
||||||
t_data-=usecond();
|
t_data-=usecond();
|
||||||
assert(u_send_buf_p!=NULL);
|
|
||||||
assert(send_buf!=NULL);
|
assert(send_buf!=NULL);
|
||||||
Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
|
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
|
||||||
t_data+=usecond();
|
t_data+=usecond();
|
||||||
|
|
||||||
AddPacket((void *)&send_buf[u_comm_offset],
|
AddPacket((void *)&send_buf[u_comm_offset],
|
||||||
@ -841,7 +866,9 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
std::vector<scalar_object *> rpointers(Nsimd);
|
std::vector<scalar_object *> rpointers(Nsimd);
|
||||||
std::vector<scalar_object *> spointers(Nsimd);
|
std::vector<scalar_object *> spointers(Nsimd);
|
||||||
|
|
||||||
|
// std::cout << "GatherSimd " << dimension << " shift "<<shift<<std::endl;
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Work out what to send where
|
// Work out what to send where
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
@ -853,7 +880,7 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
||||||
|
|
||||||
if ( any_offnode ) {
|
if ( any_offnode ) {
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
@ -868,15 +895,15 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
// FIXME
|
// FIXME - This logic is hard coded to simd_layout==2 and not allowing >2
|
||||||
// This logic is hard coded to simd_layout ==2 and not allowing >2
|
// for(int w=0;w<buffer_size;w++){
|
||||||
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
|
// std::cout << "GatherSimd<"<<Nsimd<<"> : lane " << i <<" elem "<<w<<" "<< u_simd_send_buf[i ][u_comm_offset+w]<<std::endl;
|
||||||
|
// }
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
int ic= (i&inner_bit)? 1:0;
|
int ic= (i&inner_bit)? 1:0;
|
||||||
|
|
||||||
int my_coor = rd*ic + x;
|
int my_coor = rd*ic + x;
|
||||||
int nbr_coor = my_coor+sshift;
|
int nbr_coor = my_coor+sshift;
|
||||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||||
int nbr_lcoor= (nbr_coor%ld);
|
int nbr_lcoor= (nbr_coor%ld);
|
||||||
int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer
|
int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer
|
||||||
@ -885,10 +912,10 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
if (nbr_ic) nbr_lane|=inner_bit;
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
assert (sx == nbr_ox);
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
||||||
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
|
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
|
||||||
|
|
||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
|
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
@ -896,16 +923,17 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
// shm == receive pointer if offnode
|
||||||
|
// shm == Translate[send pointer] if on node -- my view of his send pointer
|
||||||
scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
||||||
// if ((ShmDirectCopy==0)||(shm==NULL)) {
|
|
||||||
if (shm==NULL) {
|
if (shm==NULL) {
|
||||||
shm = rp;
|
shm = rp;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
|
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
|
||||||
// assuming above pointer flip
|
// assuming above pointer flip
|
||||||
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
|
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
|
||||||
|
|
||||||
rpointers[i] = shm;
|
rpointers[i] = shm;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user