1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Stencil gather improvements - SVM was running slowly and was being used for a pointer array that did not need to be in SVM

This commit is contained in:
Peter Boyle 2022-10-04 11:11:10 -07:00
parent 9296299b61
commit e1e5c75023

View File

@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor> template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs, void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline)); commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor> template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
Vector<cobj *> pointers,int dimension,int plane,int cbmask, const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type) compressor &compress,int type)
{ {
assert( (table.size()&0x1)==0); assert( (table.size()&0x1)==0);
@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead); auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0]; auto p0=&pointers[0][0];
auto p1=&pointers[1][0]; auto p1=&pointers[1][0];
auto tp=&table[0]; auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), { accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, &rhs_v[0], j, compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second, so+tp[2*j ].second,
so+tp[2*j+1].second, so+tp[2*j+1].second,
type); type);
}); });
rhs_v.ViewClose(); rhs_v.ViewClose();
} }
@ -230,8 +234,8 @@ public:
}; };
struct Merge { struct Merge {
cobj * mpointer; cobj * mpointer;
Vector<scalar_object *> rpointers; // std::vector<scalar_object *> rpointers;
Vector<cobj *> vpointers; std::vector<cobj *> vpointers;
Integer buffer_size; Integer buffer_size;
Integer type; Integer type;
}; };
@ -406,6 +410,7 @@ public:
comms_bytes+=bytes; comms_bytes+=bytes;
shm_bytes +=2*Packets[i].bytes-bytes; shm_bytes +=2*Packets[i].bytes-bytes;
} }
_grid->StencilBarrier();// Synch shared memory on a single nodes
} }
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@ -420,7 +425,7 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void Communicate(void) void Communicate(void)
{ {
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ if ( 0 ){
thread_region { thread_region {
// must be called in parallel region // must be called in parallel region
int mythread = thread_num(); int mythread = thread_num();
@ -569,7 +574,7 @@ public:
d.buffer_size = buffer_size; d.buffer_size = buffer_size;
dv.push_back(d); dv.push_back(d);
} }
void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m; Merge m;
m.type = type; m.type = type;
m.mpointer = merge_p; m.mpointer = merge_p;
@ -582,6 +587,7 @@ public:
} }
template<class decompressor> void CommsMergeSHM(decompressor decompress) { template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond(); mpi3synctime-=usecond();
accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime+=usecond(); mpi3synctime+=usecond();
shmmergetime-=usecond(); shmmergetime-=usecond();
@ -1114,8 +1120,8 @@ public:
int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
Vector<cobj *> rpointers(maxl); std::vector<cobj *> rpointers(maxl);
Vector<cobj *> spointers(maxl); std::vector<cobj *> spointers(maxl);
/////////////////////////////////////////// ///////////////////////////////////////////
// Work out what to send where // Work out what to send where