mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
GPU friendly Stencil needs a view
This commit is contained in:
parent
8a1d303ab9
commit
d4ce7d9905
@ -28,6 +28,8 @@
|
|||||||
#ifndef GRID_STENCIL_H
|
#ifndef GRID_STENCIL_H
|
||||||
#define GRID_STENCIL_H
|
#define GRID_STENCIL_H
|
||||||
|
|
||||||
|
#define STENCIL_MAX (16)
|
||||||
|
|
||||||
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
||||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||||
|
|
||||||
@ -99,18 +101,74 @@ struct StencilEntry {
|
|||||||
uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
||||||
uint16_t _pad;
|
uint16_t _pad;
|
||||||
};
|
};
|
||||||
|
template<class vobj,class cobj>
|
||||||
|
class CartesianStencilView {
|
||||||
|
public:
|
||||||
|
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
|
||||||
|
|
||||||
|
// Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Basic Grid and stencil info
|
||||||
|
////////////////////////////////////////
|
||||||
|
int _checkerboard;
|
||||||
|
int _npoints; // Move to template param?
|
||||||
|
StencilVector _directions;
|
||||||
|
StencilVector _distances;
|
||||||
|
StencilVector _comm_buf_size;
|
||||||
|
StencilVector _permute_type;
|
||||||
|
StencilVector same_node;
|
||||||
|
Coordinate _simd_layout;
|
||||||
|
Coordinate twists;
|
||||||
|
StencilEntry* _entries_p;
|
||||||
|
cobj* u_recv_buf_p;
|
||||||
|
cobj* u_send_buf_p;
|
||||||
|
|
||||||
|
accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; }
|
||||||
|
|
||||||
|
accelerator_inline int GetNodeLocal(int osite,int point) {
|
||||||
|
return this->_entries_p[point+this->_npoints*osite]._is_local;
|
||||||
|
}
|
||||||
|
accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
|
||||||
|
ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
||||||
|
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
||||||
|
local = this->_entries_p[ent]._is_local;
|
||||||
|
perm = this->_entries_p[ent]._permute;
|
||||||
|
if (perm) ptype = this->_permute_type[point];
|
||||||
|
if (local) {
|
||||||
|
return base + this->_entries_p[ent]._byte_offset;
|
||||||
|
} else {
|
||||||
|
return cbase + this->_entries_p[ent]._byte_offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) {
|
||||||
|
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
||||||
|
int local = this->_entries_p[ent]._is_local;
|
||||||
|
if (local) return base + this->_entries_p[ent]._byte_offset;
|
||||||
|
else return cbase + this->_entries_p[ent]._byte_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane)
|
||||||
|
{
|
||||||
|
Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
// The Stencil Class itself
|
// The Stencil Class itself
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
template<class vobj,class cobj>
|
template<class vobj,class cobj>
|
||||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
class CartesianStencil : public CartesianStencilView<vobj,cobj> { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef typename cobj::vector_type vector_type;
|
typedef typename cobj::vector_type vector_type;
|
||||||
typedef typename cobj::scalar_type scalar_type;
|
typedef typename cobj::scalar_type scalar_type;
|
||||||
typedef typename cobj::scalar_object scalar_object;
|
typedef typename cobj::scalar_object scalar_object;
|
||||||
|
typedef CartesianStencilView<vobj,cobj> View_type;
|
||||||
|
typedef typename View_type::StencilVector StencilVector;
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Helper structs
|
// Helper structs
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
@ -134,33 +192,23 @@ public:
|
|||||||
Integer buffer_size;
|
Integer buffer_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
////////////////////////////////////////
|
|
||||||
// Basic Grid and stencil info
|
protected:
|
||||||
////////////////////////////////////////
|
GridBase * _grid;
|
||||||
|
|
||||||
|
public:
|
||||||
|
GridBase *Grid(void) const { return _grid; }
|
||||||
|
|
||||||
|
View_type View(void) const {
|
||||||
|
View_type accessor(*( (View_type *) this));
|
||||||
|
return accessor;
|
||||||
|
}
|
||||||
|
|
||||||
int face_table_computed;
|
int face_table_computed;
|
||||||
std::vector<std::vector<std::pair<int,int> > > face_table ;
|
std::vector<std::vector<std::pair<int,int> > > face_table ;
|
||||||
|
|
||||||
|
|
||||||
int _checkerboard;
|
|
||||||
int _npoints; // Move to template param?
|
|
||||||
protected:
|
|
||||||
GridBase * _grid;
|
|
||||||
public:
|
|
||||||
GridBase *Grid(void) const { return _grid; }
|
|
||||||
// npoints of these; make it a template param and std::array
|
|
||||||
std::vector<int> _directions;
|
|
||||||
std::vector<int> _distances;
|
|
||||||
std::vector<int> _comm_buf_size;
|
|
||||||
std::vector<int> _permute_type;
|
|
||||||
Coordinate _simd_layout;
|
|
||||||
|
|
||||||
accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane)
|
|
||||||
{
|
|
||||||
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
|
|
||||||
}
|
|
||||||
|
|
||||||
Vector<StencilEntry> _entries; // Resident in managed memory
|
Vector<StencilEntry> _entries; // Resident in managed memory
|
||||||
StencilEntry* _entries_p;
|
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
std::vector<Merge> Mergers;
|
std::vector<Merge> Mergers;
|
||||||
std::vector<Merge> MergersSHM;
|
std::vector<Merge> MergersSHM;
|
||||||
@ -173,14 +221,11 @@ public:
|
|||||||
// Vectors that live on the symmetric heap in case of SHMEM
|
// Vectors that live on the symmetric heap in case of SHMEM
|
||||||
// These are used; either SHM objects or refs to the above symmetric heap vectors
|
// These are used; either SHM objects or refs to the above symmetric heap vectors
|
||||||
// depending on comms target
|
// depending on comms target
|
||||||
cobj* u_recv_buf_p;
|
|
||||||
cobj* u_send_buf_p;
|
|
||||||
std::vector<cobj *> u_simd_send_buf;
|
std::vector<cobj *> u_simd_send_buf;
|
||||||
std::vector<cobj *> u_simd_recv_buf;
|
std::vector<cobj *> u_simd_recv_buf;
|
||||||
|
|
||||||
int u_comm_offset;
|
int u_comm_offset;
|
||||||
int _unified_buffer_size;
|
int _unified_buffer_size;
|
||||||
cobj *CommBuf(void) { return u_recv_buf_p; }
|
|
||||||
|
|
||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
// Timing info; ugly; possibly temporary
|
// Timing info; ugly; possibly temporary
|
||||||
@ -208,8 +253,8 @@ public:
|
|||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
inline int SameNode(int point) {
|
inline int SameNode(int point) {
|
||||||
|
|
||||||
int dimension = _directions[point];
|
int dimension = this->_directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = this->_distances[point];
|
||||||
assert( (displacement==1) || (displacement==-1));
|
assert( (displacement==1) || (displacement==-1));
|
||||||
|
|
||||||
int pd = _grid->_processors[dimension];
|
int pd = _grid->_processors[dimension];
|
||||||
@ -230,37 +275,12 @@ public:
|
|||||||
|
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
|
||||||
|
|
||||||
if ( shm==NULL ) return 0;
|
if ( shm==NULL ) return 0;
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
accelerator_inline int GetNodeLocal(int osite,int point) {
|
|
||||||
return _entries_p[point+_npoints*osite]._is_local;
|
|
||||||
}
|
|
||||||
accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
|
|
||||||
ptype = _permute_type[point]; return & _entries_p[point+_npoints*osite];
|
|
||||||
}
|
|
||||||
|
|
||||||
accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
|
||||||
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
|
||||||
local = _entries_p[ent]._is_local;
|
|
||||||
perm = _entries_p[ent]._permute;
|
|
||||||
if (perm) ptype = _permute_type[point];
|
|
||||||
if (local) {
|
|
||||||
return base + _entries_p[ent]._byte_offset;
|
|
||||||
} else {
|
|
||||||
return cbase + _entries_p[ent]._byte_offset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) {
|
|
||||||
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
|
||||||
int local = _entries_p[ent]._is_local;
|
|
||||||
if (local) return base + _entries_p[ent]._byte_offset;
|
|
||||||
else return cbase + _entries_p[ent]._byte_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// Comms packet queue for asynch thread
|
// Comms packet queue for asynch thread
|
||||||
@ -377,8 +397,8 @@ public:
|
|||||||
|
|
||||||
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||||
{
|
{
|
||||||
int dimension = _directions[point];
|
int dimension = this->_directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = this->_distances[point];
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -386,29 +406,29 @@ public:
|
|||||||
// Map to always positive shift modulo global full dimension.
|
// Map to always positive shift modulo global full dimension.
|
||||||
int shift = (displacement+fd)%fd;
|
int shift = (displacement+fd)%fd;
|
||||||
|
|
||||||
assert (source.Checkerboard()== _checkerboard);
|
assert (source.Checkerboard()== this->_checkerboard);
|
||||||
|
|
||||||
// the permute type
|
// the permute type
|
||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
|
|
||||||
int same_node = 1;
|
int is_same_node = 1;
|
||||||
// Gather phase
|
// Gather phase
|
||||||
int sshift [2];
|
int sshift [2];
|
||||||
if ( comm_dim ) {
|
if ( comm_dim ) {
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even);
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
if (splice_dim) {
|
if (splice_dim) {
|
||||||
splicetime-=usecond();
|
splicetime-=usecond();
|
||||||
auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
||||||
same_node = same_node && tmp;
|
is_same_node = is_same_node && tmp;
|
||||||
splicetime+=usecond();
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
nosplicetime-=usecond();
|
nosplicetime-=usecond();
|
||||||
auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx);
|
auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx);
|
||||||
same_node = same_node && tmp;
|
is_same_node = is_same_node && tmp;
|
||||||
nosplicetime+=usecond();
|
nosplicetime+=usecond();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -418,18 +438,18 @@ public:
|
|||||||
// both with block stride loop iteration
|
// both with block stride loop iteration
|
||||||
auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx);
|
auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx);
|
||||||
auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx);
|
auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx);
|
||||||
same_node = same_node && tmp1 && tmp2;
|
is_same_node = is_same_node && tmp1 && tmp2;
|
||||||
splicetime+=usecond();
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
nosplicetime-=usecond();
|
nosplicetime-=usecond();
|
||||||
auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx);
|
auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx);
|
||||||
auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx);
|
auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx);
|
||||||
same_node = same_node && tmp1 && tmp2;
|
is_same_node = is_same_node && tmp1 && tmp2;
|
||||||
nosplicetime+=usecond();
|
nosplicetime+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return same_node;
|
return is_same_node;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class compressor>
|
template<class compressor>
|
||||||
@ -447,7 +467,7 @@ public:
|
|||||||
|
|
||||||
// Gather all comms buffers
|
// Gather all comms buffers
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
for(int point = 0 ; point < _npoints; point++) {
|
for(int point = 0 ; point < this->_npoints; point++) {
|
||||||
compress.Point(point);
|
compress.Point(point);
|
||||||
HaloGatherDir(source,compress,point,face_idx);
|
HaloGatherDir(source,compress,point,face_idx);
|
||||||
}
|
}
|
||||||
@ -546,25 +566,30 @@ public:
|
|||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: _permute_type(npoints),
|
: comm_bytes_thr(npoints),
|
||||||
_comm_buf_size(npoints),
|
|
||||||
comm_bytes_thr(npoints),
|
|
||||||
comm_enter_thr(npoints),
|
comm_enter_thr(npoints),
|
||||||
comm_leave_thr(npoints),
|
comm_leave_thr(npoints),
|
||||||
comm_time_thr(npoints)
|
comm_time_thr(npoints)
|
||||||
{
|
{
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_npoints = npoints;
|
|
||||||
_grid = grid;
|
_grid = grid;
|
||||||
_directions = directions;
|
|
||||||
_distances = distances;
|
/////////////////////////////////////
|
||||||
|
// Initialise the base
|
||||||
|
/////////////////////////////////////
|
||||||
|
this->_npoints = npoints;
|
||||||
|
this->_comm_buf_size.resize(npoints),
|
||||||
|
this->_permute_type.resize(npoints),
|
||||||
|
this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels
|
||||||
|
this->_directions = StencilVector(directions);
|
||||||
|
this->_distances = StencilVector(distances);
|
||||||
|
|
||||||
_unified_buffer_size=0;
|
_unified_buffer_size=0;
|
||||||
_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels
|
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
_entries.resize(_npoints* osites);
|
_entries.resize(this->_npoints* osites);
|
||||||
_entries_p = &_entries[0];
|
this->_entries_p = &_entries[0];
|
||||||
for(int ii=0;ii<npoints;ii++){
|
for(int ii=0;ii<npoints;ii++){
|
||||||
|
|
||||||
int i = ii; // reverse direction to get SIMD comms done first
|
int i = ii; // reverse direction to get SIMD comms done first
|
||||||
@ -576,9 +601,9 @@ public:
|
|||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
_permute_type[point]=_grid->PermuteType(dimension);
|
this->_permute_type[point]=_grid->PermuteType(dimension);
|
||||||
|
|
||||||
_checkerboard = checkerboard;
|
this->_checkerboard = checkerboard;
|
||||||
|
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
// the permute type
|
// the permute type
|
||||||
@ -598,8 +623,8 @@ public:
|
|||||||
// live in lattice or a comms buffer.
|
// live in lattice or a comms buffer.
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
if ( !comm_dim ) {
|
if ( !comm_dim ) {
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even);
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
|
||||||
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
Local(point,dimension,shift,0x3);
|
Local(point,dimension,shift,0x3);
|
||||||
@ -610,8 +635,8 @@ public:
|
|||||||
} else {
|
} else {
|
||||||
// All permute extract done in comms phase prior to Stencil application
|
// All permute extract done in comms phase prior to Stencil application
|
||||||
// So tables are the same whether comm_dim or splice_dim
|
// So tables are the same whether comm_dim or splice_dim
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even);
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
Comms(point,dimension,shift,0x3);
|
Comms(point,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
@ -630,8 +655,8 @@ public:
|
|||||||
|
|
||||||
u_simd_send_buf.resize(Nsimd);
|
u_simd_send_buf.resize(Nsimd);
|
||||||
u_simd_recv_buf.resize(Nsimd);
|
u_simd_recv_buf.resize(Nsimd);
|
||||||
u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
this->u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
this->u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
|
|
||||||
for(int l=0;l<2;l++){
|
for(int l=0;l<2;l++){
|
||||||
u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
@ -662,7 +687,7 @@ public:
|
|||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
|
||||||
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
int sshift = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb);
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
int wraparound=0;
|
int wraparound=0;
|
||||||
@ -706,12 +731,12 @@ public:
|
|||||||
// done in reduced dims, so SIMD factored
|
// done in reduced dims, so SIMD factored
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||||
|
|
||||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
this->_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||||
|
|
||||||
// send to one or more remote nodes.
|
// send to one or more remote nodes.
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
int sshift= _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb);
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
@ -783,7 +808,7 @@ public:
|
|||||||
// Simple block stride gather of SIMD objects
|
// Simple block stride gather of SIMD objects
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
int idx=point+(lo+o+b)*_npoints;
|
int idx=point+(lo+o+b)*this->_npoints;
|
||||||
_entries[idx]._offset =ro+o+b;
|
_entries[idx]._offset =ro+o+b;
|
||||||
_entries[idx]._permute=permute;
|
_entries[idx]._permute=permute;
|
||||||
_entries[idx]._is_local=1;
|
_entries[idx]._is_local=1;
|
||||||
@ -804,7 +829,7 @@ public:
|
|||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
int idx = point+(lo+o+b)*_npoints;
|
int idx = point+(lo+o+b)*this->_npoints;
|
||||||
_entries[idx]._offset =ro+o+b;
|
_entries[idx]._offset =ro+o+b;
|
||||||
_entries[idx]._is_local=1;
|
_entries[idx]._is_local=1;
|
||||||
_entries[idx]._permute=permute;
|
_entries[idx]._permute=permute;
|
||||||
@ -831,7 +856,7 @@ public:
|
|||||||
// Simple block stride gather of SIMD objects
|
// Simple block stride gather of SIMD objects
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
int idx=point+(so+o+b)*_npoints;
|
int idx=point+(so+o+b)*this->_npoints;
|
||||||
_entries[idx]._offset =offset+(bo++);
|
_entries[idx]._offset =offset+(bo++);
|
||||||
_entries[idx]._is_local=0;
|
_entries[idx]._is_local=0;
|
||||||
_entries[idx]._permute=0;
|
_entries[idx]._permute=0;
|
||||||
@ -851,7 +876,7 @@ public:
|
|||||||
|
|
||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
int idx = point+(so+o+b)*_npoints;
|
int idx = point+(so+o+b)*this->_npoints;
|
||||||
_entries[idx]._offset =offset+(bo++);
|
_entries[idx]._offset =offset+(bo++);
|
||||||
_entries[idx]._is_local=0;
|
_entries[idx]._is_local=0;
|
||||||
_entries[idx]._permute =0;
|
_entries[idx]._permute =0;
|
||||||
@ -922,16 +947,16 @@ public:
|
|||||||
if ( compress.DecompressionStep() ) {
|
if ( compress.DecompressionStep() ) {
|
||||||
recv_buf=u_simd_recv_buf[0];
|
recv_buf=u_simd_recv_buf[0];
|
||||||
} else {
|
} else {
|
||||||
recv_buf=u_recv_buf_p;
|
recv_buf=this->u_recv_buf_p;
|
||||||
}
|
}
|
||||||
|
|
||||||
send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
|
send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
|
||||||
if ( send_buf==NULL ) {
|
if ( send_buf==NULL ) {
|
||||||
send_buf = u_send_buf_p;
|
send_buf = this->u_send_buf_p;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find out if we get the direct copy.
|
// Find out if we get the direct copy.
|
||||||
void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_send_buf_p);
|
void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p);
|
||||||
if (success==NULL) {
|
if (success==NULL) {
|
||||||
// we found a packet that comes from MPI and contributes to this leg of stencil
|
// we found a packet that comes from MPI and contributes to this leg of stencil
|
||||||
shm_receive_only = 0;
|
shm_receive_only = 0;
|
||||||
@ -945,11 +970,11 @@ public:
|
|||||||
if ( compress.DecompressionStep() ) {
|
if ( compress.DecompressionStep() ) {
|
||||||
|
|
||||||
if ( shm_receive_only ) { // Early decompress before MPI is finished is possible
|
if ( shm_receive_only ) { // Early decompress before MPI is finished is possible
|
||||||
AddDecompress(&u_recv_buf_p[u_comm_offset],
|
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
|
||||||
&recv_buf[u_comm_offset],
|
&recv_buf[u_comm_offset],
|
||||||
words,DecompressionsSHM);
|
words,DecompressionsSHM);
|
||||||
} else { // Decompress after MPI is finished
|
} else { // Decompress after MPI is finished
|
||||||
AddDecompress(&u_recv_buf_p[u_comm_offset],
|
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
|
||||||
&recv_buf[u_comm_offset],
|
&recv_buf[u_comm_offset],
|
||||||
words,Decompressions);
|
words,Decompressions);
|
||||||
}
|
}
|
||||||
@ -962,7 +987,7 @@ public:
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
AddPacket((void *)&send_buf[u_comm_offset],
|
AddPacket((void *)&send_buf[u_comm_offset],
|
||||||
(void *)&u_recv_buf_p[u_comm_offset],
|
(void *)&this->u_recv_buf_p[u_comm_offset],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
@ -1072,8 +1097,8 @@ public:
|
|||||||
if (shm==NULL) {
|
if (shm==NULL) {
|
||||||
shm = rp;
|
shm = rp;
|
||||||
// we found a packet that comes from MPI and contributes to this shift.
|
// we found a packet that comes from MPI and contributes to this shift.
|
||||||
// same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
|
// is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
|
||||||
// Kernel will add the exterior_terms except if same_node.
|
// Kernel will add the exterior_terms except if is_same_node.
|
||||||
shm_receive_only = 0;
|
shm_receive_only = 0;
|
||||||
// leg of stencil
|
// leg of stencil
|
||||||
}
|
}
|
||||||
@ -1092,9 +1117,9 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ( shm_receive_only ) {
|
if ( shm_receive_only ) {
|
||||||
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM);
|
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM);
|
||||||
} else {
|
} else {
|
||||||
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
|
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||||
}
|
}
|
||||||
|
|
||||||
u_comm_offset +=buffer_size;
|
u_comm_offset +=buffer_size;
|
||||||
@ -1109,7 +1134,7 @@ public:
|
|||||||
mpi3synctime=0.;
|
mpi3synctime=0.;
|
||||||
mpi3synctime_g=0.;
|
mpi3synctime_g=0.;
|
||||||
shmmergetime=0.;
|
shmmergetime=0.;
|
||||||
for(int i=0;i<_npoints;i++){
|
for(int i=0;i<this->_npoints;i++){
|
||||||
comm_time_thr[i]=0;
|
comm_time_thr[i]=0;
|
||||||
comm_bytes_thr[i]=0;
|
comm_bytes_thr[i]=0;
|
||||||
comm_enter_thr[i]=0;
|
comm_enter_thr[i]=0;
|
||||||
|
Loading…
Reference in New Issue
Block a user