1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Cache face tables to improve performance.

Extract merge now looking poor.
This commit is contained in:
paboyle 2016-10-18 09:51:37 +01:00
parent 811ca45473
commit 7af9b87318

View File

@ -71,39 +71,31 @@
namespace Grid { namespace Grid {
template<class vobj,class cobj,class compressor> void template<class vobj,class cobj,class compressor> void
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off, Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
double &t_table ,double & t_data )
{ {
table.resize(0);
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
t_table = 0.0;
t_data = 0.0;
int stride=rhs._grid->_slice_stride[dimension]; int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
t_data-=usecond(); table.resize(e1*e2);
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int bo = n*e2; int bo = n*e2;
buffer[off+bo+b]=compress(rhs._odata[so+o+b]); table[bo+b]=std::pair<int,int>(bo+b,o+b);
} }
} }
t_data+=usecond();
} else { } else {
int bo=0; int bo=0;
t_table-=usecond(); table.resize(e1*e2/2);
std::vector<std::pair<int,int> > table(e1*e2);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
@ -113,17 +105,32 @@ PARALLEL_NESTED_LOOP2
} }
} }
} }
t_table+=usecond(); }
t_data-=usecond(); }
// NOTE(review): this region appears to be a side-by-side diff rendering. Rows
// that show a statement twice seem to carry the pre-change text followed by
// the post-change text; the comments here describe the right-hand copy.
//
// Gather_plane_simple_table: walks every entry i of `table` and stores
// compress(rhs._odata[so + table[i].second]) into buffer[off + table[i].first].
//   table    - index pairs; .first is added to `off` to index buffer,
//              .second is added to `so` to index rhs._odata
//   rhs      - lattice whose _odata array is read
//   buffer   - destination container, written at off + table[i].first
//   compress - callable applied to each element read from rhs._odata
//   off      - base offset added to every buffer index
//   so       - base offset added to every _odata index
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
compressor &compress, int off,int so)
{
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
// NOTE(review): the right-hand loop compares a signed `int i` against
// table.size(), which is unsigned for std::vector. The left-hand copy bounds
// the loop with `bo`, which is not a parameter of this function.
for(int i=0;i<bo;i++){ for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]); buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
} }
// NOTE(review): this row has a single copy only, and t_data is not declared in
// the signature above, so it presumably belongs to the pre-change column.
t_data+=usecond();
} }
// Gather_plane_simple_stencil: wrapper that builds a local index table with
// Gather_plane_simple_table_compute and then immediately applies it with
// Gather_plane_simple_table. The table is a local variable, so it is rebuilt
// on every call.
//   rhs, buffer, dimension, plane, cbmask, compress, off
//            - forwarded unchanged to Gather_plane_simple_table_compute
//   t_table, t_data
//            - NOTE(review): accepted by reference but never read or written
//              in this body, so callers get back whatever they passed in.
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
double &t_table ,double & t_data )
{
std::vector<std::pair<int,int> > table;
Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
// Offset of the requested plane within rhs._odata, passed on as `so`.
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
// NOTE(review): the closing row below shows two braces; it looks like a fused
// two-column row rather than two scopes closing in the post-change text.
} }
struct StencilEntry { struct StencilEntry {
uint64_t _offset; uint64_t _offset;
uint64_t _byte_offset; uint64_t _byte_offset;
@ -156,6 +163,9 @@ PARALLEL_FOR_LOOP
std::vector<Packet> Packets; std::vector<Packet> Packets;
int face_table_computed;
std::vector<std::vector<std::pair<int,int> > > face_table ;
#define SEND_IMMEDIATE #define SEND_IMMEDIATE
#define SERIAL_SENDS #define SERIAL_SENDS
@ -384,14 +394,17 @@ PARALLEL_FOR_LOOP
std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
if ( calls > 0. ) { if ( calls > 0. ) {
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl; std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
PRINTIT(jointime);
PRINTIT(gathertime);
PRINTIT(commtime);
PRINTIT(halogtime); PRINTIT(halogtime);
PRINTIT(mergetime); PRINTIT(gathertime);
PRINTIT(spintime);
PRINTIT(comms_bytes);
PRINTIT(gathermtime); PRINTIT(gathermtime);
PRINTIT(mergetime);
if(comms_bytes>1.0){
PRINTIT(comms_bytes);
PRINTIT(commtime);
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
}
PRINTIT(jointime);
PRINTIT(spintime);
PRINTIT(splicetime); PRINTIT(splicetime);
PRINTIT(nosplicetime); PRINTIT(nosplicetime);
PRINTIT(t_table); PRINTIT(t_table);
@ -407,6 +420,7 @@ PARALLEL_FOR_LOOP
const std::vector<int> &distances) const std::vector<int> &distances)
: _permute_type(npoints), _comm_buf_size(npoints) : _permute_type(npoints), _comm_buf_size(npoints)
{ {
face_table_computed=0;
_npoints = npoints; _npoints = npoints;
_grid = grid; _grid = grid;
_directions = directions; _directions = directions;
@ -734,7 +748,7 @@ PARALLEL_FOR_LOOP
} }
#endif #endif
template<class compressor> template<class compressor>
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point) void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
{ {
int dimension = _directions[point]; int dimension = _directions[point];
int displacement = _distances[point]; int displacement = _distances[point];
@ -762,23 +776,23 @@ PARALLEL_FOR_LOOP
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
if (splice_dim) { if (splice_dim) {
splicetime-=usecond(); splicetime-=usecond();
GatherSimd(source,dimension,shift,0x3,compress); GatherSimd(source,dimension,shift,0x3,compress,face_idx);
splicetime+=usecond(); splicetime+=usecond();
} else { } else {
nosplicetime-=usecond(); nosplicetime-=usecond();
Gather(source,dimension,shift,0x3,compress); Gather(source,dimension,shift,0x3,compress,face_idx);
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} else { } else {
if(splice_dim){ if(splice_dim){
splicetime-=usecond(); splicetime-=usecond();
GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
splicetime+=usecond(); splicetime+=usecond();
} else { } else {
nosplicetime-=usecond(); nosplicetime-=usecond();
Gather(source,dimension,shift,0x1,compress); Gather(source,dimension,shift,0x1,compress,face_idx);
Gather(source,dimension,shift,0x2,compress); Gather(source,dimension,shift,0x2,compress,face_idx);
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} }
@ -796,17 +810,19 @@ PARALLEL_FOR_LOOP
u_comm_offset=0; u_comm_offset=0;
// Gather all comms buffers // Gather all comms buffers
int face_idx=0;
for(int point = 0 ; point < _npoints; point++) { for(int point = 0 ; point < _npoints; point++) {
compress.Point(point); compress.Point(point);
HaloGatherDir(source,compress,point); HaloGatherDir(source,compress,point,face_idx);
} }
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
halogtime+=usecond(); halogtime+=usecond();
} }
template<class compressor> template<class compressor>
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress) void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
{ {
typedef typename cobj::vector_type vector_type; typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_type scalar_type;
@ -843,9 +859,21 @@ PARALLEL_FOR_LOOP
int bytes = words * sizeof(cobj); int bytes = words * sizeof(cobj);
gathertime-=usecond(); gathertime-=usecond();
Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data); int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
if ( !face_table_computed ) {
t_table-=usecond();
face_table.resize(face_idx+1);
Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]);
t_table+=usecond();
}
t_data-=usecond();
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so);
face_idx++;
t_data+=usecond();
gathertime+=usecond(); gathertime+=usecond();
// Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data);
int rank = _grid->_processor; int rank = _grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
@ -867,7 +895,7 @@ PARALLEL_FOR_LOOP
template<class compressor> template<class compressor>
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress) void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{ {
const int Nsimd = _grid->Nsimd(); const int Nsimd = _grid->Nsimd();