mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

commit 3906cd2149
parent 5a1fb29db7
Author: paboyle
Date:   2017-02-20 17:51:31 -05:00

    Stencil fix on BNL KNL system

2 changed files with 157 additions and 27 deletions

View File

@@ -29,19 +29,18 @@
 namespace Grid {
 
-void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
-                                        int off,std::vector<std::pair<int,int> > & table)
+void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+                                 int off,std::vector<std::pair<int,int> > & table)
 {
   table.resize(0);
-  int rd = grid->_rdimensions[dimension];
 
   if ( !grid->CheckerBoarded(dimension) ) {
     cbmask = 0x3;
   }
+  int rd = grid->_rdimensions[dimension];
   int so= plane*grid->_ostride[dimension]; // base offset for start of plane
   int e1=grid->_slice_nblock[dimension];
   int e2=grid->_slice_block[dimension];
   int stride=grid->_slice_stride[dimension];
 
   if ( cbmask == 0x3 ) {
     table.resize(e1*e2);
@@ -66,4 +65,5 @@ void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,i
       }
     }
   }
 }
+
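
Note on the idiom above: both files in this commit move the stencil gather onto precomputed index tables. The table maps each slot of the packed face buffer to a (possibly strided) site offset within the plane, so the access pattern is computed once and replayed on every halo exchange. The following standalone sketch uses invented names and a plain array in place of Grid's lattice container, and omits the checkerboard (cbmask) branch:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Build a table mapping packed face-buffer slots to strided plane sites.
    // e1/e2/stride mirror _slice_nblock/_slice_block/_slice_stride in the diff.
    void table_compute(int e1, int e2, int stride,
                       std::vector<std::pair<int,int> > &table) {
      table.resize(e1*e2);
      for (int n = 0; n < e1; n++) {
        for (int b = 0; b < e2; b++) {
          table[n*e2+b] = std::make_pair(n*e2+b,       // slot in the packed buffer
                                         n*stride+b);  // site offset within the plane
        }
      }
    }

    // Consume the cached table: one indirect read per site, contiguous writes,
    // matching the commented-out line  buffer[off+table[i].first] = ...
    template<class T>
    void gather_with_table(const std::vector<std::pair<int,int> > &table,
                           const T *lattice, int so, int off, T *buffer) {
      for (std::size_t i = 0; i < table.size(); i++) {
        buffer[off + table[i].first] = lattice[so + table[i].second];
      }
    }

The payoff is that the inner gather becomes a flat, trivially parallel copy loop, which is exactly what the PARALLEL_FOR_LOOP consumers in the header exploit.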

View File

@@ -29,7 +29,7 @@
 #define GRID_STENCIL_H
 
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
-
+#define NEW_XYZT_GATHER
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
 // gather to a point stencil code. CSHIFT is not the best way, so need
@@ -68,7 +68,10 @@
 namespace Grid {
 
-void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
-                                        int off,std::vector<std::pair<int,int> > & table);
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+                                 int off,std::vector<std::pair<int,int> > & table);
 
 template<class vobj,class cobj,class compressor>
@@ -85,6 +88,95 @@ PARALLEL_FOR_LOOP
   }
 }
 
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+/*
+template<class cobj,class vobj,class compressor> double
+Gather_plane_exchange(const Lattice<vobj> &rhs,
+                      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
+{
+  int rd = rhs._grid->_rdimensions[dimension];
+  double t1,t2;
+
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+
+  int e1 =rhs._grid->_slice_nblock[dimension];
+  int e2 =rhs._grid->_slice_block [dimension];
+  int n1 =rhs._grid->_slice_stride[dimension];
+
+  // Need to switch to a table loop
+  std::vector<std::pair<int,int> > table;
+
+  if ( cbmask ==0x3){
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o      = n*n1;
+        int offset = b+n*e2;
+        table.push_back(std::pair<int,int> (offset,o+b));
+      }
+    }
+  } else {
+    // Case of SIMD split AND checker dim cannot currently be hit, except in
+    // Test_cshift_red_black code.
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o=n*n1;
+        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+        int offset = b+n*e2;
+        if ( ocb & cbmask ) {
+          table.push_back(std::pair<int,int> (offset,o+b));
+        }
+      }
+    }
+  }
+
+  assert( (table.size()&0x1)==0);
+  t1=usecond();
+PARALLEL_FOR_LOOP
+  for(int j=0;j<table.size()/2;j++){
+    //  buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
+    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
+    cobj temp3;
+    cobj temp4;
+    exchange(temp3,temp4,temp1,temp2,type);
+    vstream(pointers[0][j],temp3);
+    vstream(pointers[1][j],temp4);
+  }
+  t2=usecond();
+  return t2-t1;
+}
+*/
+
+template<class cobj,class vobj,class compressor>
+void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
+                                 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+
+template<class cobj,class vobj,class compressor>
+void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+                                 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
+                                 compressor &compress,int type)
+{
+  assert( (table.size()&0x1)==0);
+  int num=table.size()/2;
+  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+PARALLEL_FOR_LOOP
+  for(int j=0;j<num;j++){
+    //  buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
+    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
+    cobj temp3;
+    cobj temp4;
+    exchange(temp3,temp4,temp1,temp2,type);
+    vstream(pointers[0][j],temp3);
+    vstream(pointers[1][j],temp4);
+  }
+}
+
 struct StencilEntry {
   uint64_t _offset;
   uint64_t _byte_offset;
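
The new Gather_plane_exchange_table leans on Grid's exchange(t3,t4,t1,t2,type) primitive: two gathered vector objects are split so that one output carries the combined low SIMD halves and the other the combined high halves, each streamed to its own destination pointer. The stand-in below is illustrative only; the real primitive permutes lanes at a granularity selected by the permute type, and Vec/W are invented names:

    #include <cstddef>
    #include <utility>
    #include <vector>

    const int W = 4;   // stand-in for the SIMD width implied by 'type'

    struct Vec { double lane[W]; };

    // One plausible lane assignment: out_lo gathers the low halves of a and b,
    // out_hi the high halves. Meant only to show the shape of the data movement.
    void exchange_pair(Vec &out_lo, Vec &out_hi, const Vec &a, const Vec &b) {
      for (int l = 0; l < W/2; l++) {
        out_lo.lane[l]       = a.lane[l];
        out_lo.lane[l + W/2] = b.lane[l];
        out_hi.lane[l]       = a.lane[l + W/2];
        out_hi.lane[l + W/2] = b.lane[l + W/2];
      }
    }

    // The table loop pairs consecutive entries, exchanges them, and streams
    // the two halves to the two destination pointers, as in the diff.
    void gather_exchange(const std::vector<Vec> &sites,
                         const std::vector<std::pair<int,int> > &table,
                         int so, Vec *lo_buf, Vec *hi_buf) {
      for (std::size_t j = 0; j < table.size()/2; j++) {
        exchange_pair(lo_buf[j], hi_buf[j],
                      sites[so + table[2*j].second],
                      sites[so + table[2*j+1].second]);
      }
    }

Pairing consecutive table entries is also why the code asserts that the table length is even.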
@@ -129,7 +221,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     p.to_rank  = to;
     p.from_rank= from;
     p.bytes    = bytes;
-    comms_bytes+=2.0*bytes;
     Packets.push_back(p);
   }
 
@@ -138,23 +229,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromBegin(reqs[i],
+      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
                                         Packets[i].send_buf,
                                         Packets[i].to_rank,
                                         Packets[i].recv_buf,
                                         Packets[i].from_rank,
                                         Packets[i].bytes);
+      if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySendrecv ) {
+        _grid->StencilSendToRecvFromComplete(reqs[i]);
+      }
     }
     commtime+=usecond();
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     commtime-=usecond();
-    for(int i=0;i<Packets.size();i++){
+    if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicyIsend ) {
+      for(int i=0;i<Packets.size();i++){
         _grid->StencilSendToRecvFromComplete(reqs[i]);
+      }
     }
     _grid->StencilBarrier();// Synch shared memory on a single nodes
     commtime+=usecond();
+    /*
     int dump=1;
     if(dump){
       for(int i=0;i<Packets.size();i++){
@@ -175,7 +272,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
         }
       }
       dump =0;
-
+      */
   }
 
   ///////////////////////////////////////////
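
The effect of the two policy branches above: under CommunicatorPolicySendrecv each packet is completed immediately inside CommunicateBegin, so the transfer is effectively blocking; under CommunicatorPolicyIsend the requests stay in flight and are only drained in CommunicateComplete. In bare MPI terms the two patterns look roughly like the sketch below (an analogy, not Grid's StencilSendToRecvFrom implementation):

    #include <mpi.h>
    #include <vector>

    // Blocking policy: the exchange is complete when the call returns.
    void exchange_blocking(void *snd, void *rcv, int bytes, int to, int from) {
      MPI_Sendrecv(snd, bytes, MPI_CHAR, to,   0,
                   rcv, bytes, MPI_CHAR, from, 0,
                   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    // Non-blocking policy: begin posts the requests ...
    void exchange_begin(void *snd, void *rcv, int bytes, int to, int from,
                        std::vector<MPI_Request> &reqs) {
      reqs.resize(2);
      MPI_Irecv(rcv, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &reqs[0]);
      MPI_Isend(snd, bytes, MPI_CHAR, to,   0, MPI_COMM_WORLD, &reqs[1]);
    }

    // ... and a later complete step waits on them, overlapping compute.
    void exchange_complete(std::vector<MPI_Request> &reqs) {
      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
    }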
@@ -310,10 +407,10 @@ PARALLEL_FOR_LOOP
       // depending on comms target
       cobj* u_recv_buf_p;
       cobj* u_send_buf_p;
-      std::vector<scalar_object *> u_simd_send_buf;
-      std::vector<scalar_object *> u_simd_recv_buf;
       std::vector<cobj *> new_simd_send_buf;
       std::vector<cobj *> new_simd_recv_buf;
+      std::vector<scalar_object *> u_simd_send_buf;
+      std::vector<scalar_object *> u_simd_recv_buf;
 
       int u_comm_offset;
       int _unified_buffer_size;
@@ -358,6 +455,10 @@ PARALLEL_FOR_LOOP
   void Report(void) {
 #define PRINTIT(A) \
     std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
+
+    RealD NP = _grid->_Nprocessors;
+    RealD NN = _grid->NodeCount();
+
     if ( calls > 0. ) {
       std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
       PRINTIT(halogtime);
@@ -367,7 +468,8 @@ PARALLEL_FOR_LOOP
       if(comms_bytes>1.0){
         PRINTIT(comms_bytes);
         PRINTIT(commtime);
-        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
+        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
+        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
       }
       PRINTIT(jointime);
       PRINTIT(spintime);
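
Units in the two bandwidth lines: commtime accumulates microseconds, so comms_bytes/commtime is bytes per microsecond, i.e. MB/s, and the division by 1000 yields GB/s for one rank; scaling by NP/NN (ranks per node) converts that to a per-node figure. A quick check with made-up numbers:

    #include <cstdio>

    int main() {
      // Hypothetical figures, only to check the units used in Report():
      double comms_bytes = 2.0e9;   // bytes moved by this rank
      double commtime    = 1.0e6;   // accumulated comms time in microseconds
      double NP = 64, NN = 16;      // MPI ranks and nodes (4 ranks per node)

      double per_rank = comms_bytes/commtime/1000.;  // 2000 MB/s -> 2 GB/s
      double per_node = per_rank*NP/NN;              // 4 ranks share a node -> 8 GB/s
      printf("%.1f GB/s per rank, %.1f GB/s per node\n", per_rank, per_node);
      return 0;
    }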
@@ -465,12 +567,17 @@ PARALLEL_FOR_LOOP
       new_simd_recv_buf.resize(Nsimd);
 
       u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
       u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
-      for(int l=0;l<Nsimd;l++){
-        u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
-        u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+#ifdef NEW_XYZT_GATHER
+      for(int l=0;l<2;l++){
         new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
         new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
       }
+#else
+      for(int l=0;l<Nsimd;l++){
+        u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+        u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+      }
+#endif
 
       PrecomputeByteOffsets();
     }
@@ -740,7 +847,11 @@ PARALLEL_FOR_LOOP
           splicetime-=usecond();
           //      GatherSimd(source,dimension,shift,0x3,compress,face_idx);
           //      std::cout << "GatherSimdNew"<<std::endl;
+#ifdef NEW_XYZT_GATHER
           GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
+#else
+          GatherSimd(source,dimension,shift,0x3,compress,face_idx);
+#endif
           splicetime+=usecond();
         } else {
           nosplicetime-=usecond();
@@ -751,8 +862,13 @@ PARALLEL_FOR_LOOP
         if(splice_dim){
           splicetime-=usecond();
           //      std::cout << "GatherSimdNew2calls"<<std::endl;
+#ifdef NEW_XYZT_GATHER
           GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
           GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
+#else
+          GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
+          GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
+#endif
           splicetime+=usecond();
         } else {
           nosplicetime-=usecond();
@@ -829,12 +945,13 @@ PARALLEL_FOR_LOOP
       if ( !face_table_computed ) {
         t_table-=usecond();
         face_table.resize(face_idx+1);
-        Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
+        Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
+        //      std::cout << " face table size "<<face_idx <<" " << face_table[face_idx].size() <<" computed buffer size "<< words <<
+        //                " bytes = " << bytes <<std::endl;
         t_table+=usecond();
       }
 
-      int rank           = _grid->_processor;
+      int rank = _grid->_processor;
       int recv_from_rank;
       int xmit_to_rank;
       _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@@ -845,8 +962,6 @@ PARALLEL_FOR_LOOP
       /////////////////////////////////////////////////////////
       // try the direct copy if possible
       /////////////////////////////////////////////////////////
-
-
       cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
       if ( send_buf==NULL ) {
         send_buf = u_send_buf_p;
@@ -1003,7 +1118,7 @@ PARALLEL_FOR_LOOP
     assert(simd_layout==maxl);
     assert(shift>=0);
     assert(shift<fd);
 
-      int permute_type=_grid->PermuteType(dimension);
+    int permute_type=_grid->PermuteType(dimension);
 
     //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
@@ -1015,8 +1130,11 @@ PARALLEL_FOR_LOOP
     assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 
-    int bytes = (buffer_size*sizeof(cobj))/simd_layout;
-    assert(bytes*simd_layout == buffer_size*sizeof(cobj));
+    int reduced_buffer_size = buffer_size;
+    if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1;
+
+    int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout;
+    assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj));
 
     std::vector<cobj *> rpointers(maxl);
     std::vector<cobj *> spointers(maxl);
@@ -1034,15 +1152,28 @@ PARALLEL_FOR_LOOP
     int any_offnode = ( ((x+sshift)%fd) >= rd );
 
     if ( any_offnode ) {
 
       for(int i=0;i<maxl;i++){
         spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
       }
 
       int sx   = (x+sshift)%rd;
-      gathermtime+=Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+
+      //      if ( cbmask==0x3 ) {
+      //      std::vector<std::pair<int,int> > table;
+      t_table-=usecond();
+      if ( !face_table_computed ) {
+        face_table.resize(face_idx+1);
+        Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
+        //      std::cout << " face table size "<<face_idx <<" " << face_table[face_idx].size() <<" computed buffer size "<< reduced_buffer_size <<
+        //                " bytes = "<<bytes <<std::endl;
+      }
+      t_table+=usecond();
+
+      gathermtime-=usecond();
+      Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++;
+      gathermtime+=usecond();
 
       //spointers[0] -- low
       //spointers[1] -- high
@@ -1089,13 +1220,12 @@ PARALLEL_FOR_LOOP
         }
       }
 
-      AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
+      AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,Packets.size()-1,permute_type);
 
       u_comm_offset     +=buffer_size;
     }
   }
 
 };
 }