1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM

This commit is contained in:
Peter Boyle 2022-10-04 11:11:10 -07:00
parent 9296299b61
commit e1e5c75023

View File

@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type)
{
assert( (table.size()&0x1)==0);
@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, &rhs_v[0], j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
});
rhs_v.ViewClose();
}
@ -230,8 +234,8 @@ public:
};
struct Merge {
cobj * mpointer;
Vector<scalar_object *> rpointers;
Vector<cobj *> vpointers;
// std::vector<scalar_object *> rpointers;
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
};
@ -406,6 +410,7 @@ public:
comms_bytes+=bytes;
shm_bytes +=2*Packets[i].bytes-bytes;
}
_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@ -420,7 +425,7 @@ public:
////////////////////////////////////////////////////////////////////////
void Communicate(void)
{
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
if ( 0 ){
thread_region {
// must be called in parallel region
int mythread = thread_num();
@ -569,7 +574,7 @@ public:
d.buffer_size = buffer_size;
dv.push_back(d);
}
void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.type = type;
m.mpointer = merge_p;
@ -582,6 +587,7 @@ public:
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond();
accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime+=usecond();
shmmergetime-=usecond();
@ -1114,8 +1120,8 @@ public:
int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
Vector<cobj *> rpointers(maxl);
Vector<cobj *> spointers(maxl);
std::vector<cobj *> rpointers(maxl);
std::vector<cobj *> spointers(maxl);
///////////////////////////////////////////
// Work out what to send where