Mirror of https://github.com/paboyle/Grid.git

Stencil now runs with coalesced accesses
Commit 6c4da3bbc7 (parent a584b16c4a)
@@ -55,6 +55,45 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 
+#ifdef __CUDA_ARCH__
+//////////////////////////////////////////
+// Extract and insert slices on the GPU
+//////////////////////////////////////////
+template<class vobj> accelerator_inline
+typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+  int lane = threadIdx.x % Nsimd;
+  return extractLane(lane,vec);
+}
+template<class vobj> accelerator_inline
+void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+  int lane = threadIdx.x % Nsimd;
+  insertLane(lane,vec,extracted);
+}
+#else
+//////////////////////////////////////////
+// Trivial mapping of vectors on host
+//////////////////////////////////////////
+template<class vobj> accelerator_inline
+vobj coalescedRead(const vobj & __restrict__ vec)
+{
+  return vec;
+}
+template<class vobj> accelerator_inline
+void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted)
+{
+  vec = extracted;
+}
+#endif
+
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                  int off,Vector<std::pair<int,int> > & table);
 
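How the added helpers behave: a vobj packs Nsimd scalar lanes into one vectorised object, and on the device each thread takes lane = threadIdx.x % Nsimd, so neighbouring threads in a warp touch neighbouring words and the access coalesces; on the host the mapping is the identity and whole vectors pass straight through. Below is a minimal host-side sketch of that read-lane / modify / write-lane pattern. The names vToy, sToy, extract_lane and insert_lane are illustrative stand-ins rather than Grid's actual vobj machinery, and a plain loop index stands in for threadIdx.x % Nsimd.

// Minimal sketch (not Grid code): one "vector object" holding Nsimd lanes,
// read and written one scalar lane at a time, as coalescedRead/coalescedWrite do.
#include <cassert>
#include <cstdio>

static constexpr int Nsimd = 4;

struct sToy { double re, im; };                // scalar object: one lane
struct vToy { double re[Nsimd], im[Nsimd]; };  // vector object: Nsimd lanes

// Pull scalar lane 'lane' out of the vectorised object (coalescedRead analogue).
inline sToy extract_lane(int lane, const vToy &v) {
  return sToy{ v.re[lane], v.im[lane] };
}
// Put a scalar back into lane 'lane' of the vectorised object (coalescedWrite analogue).
inline void insert_lane(int lane, vToy &v, const sToy &s) {
  v.re[lane] = s.re;
  v.im[lane] = s.im;
}

int main() {
  vToy v{};
  for (int l = 0; l < Nsimd; l++) { v.re[l] = l; v.im[l] = -l; }

  // On the GPU the lane index would be threadIdx.x % Nsimd; here we just loop.
  for (int lane = 0; lane < Nsimd; lane++) {
    sToy s = extract_lane(lane, v);  // per-lane read
    s.re *= 2.0;                     // work on the scalar
    insert_lane(lane, v, s);         // per-lane write
  }
  assert(v.re[3] == 6.0);
  std::printf("lane 3 after update: (%g, %g)\n", v.re[3], v.im[3]);
  return 0;
}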
@@ -66,11 +105,30 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
 {
   int num=table.size();
   std::pair<int,int> *table_v = & table[0];
 
   auto rhs_v = rhs.View();
+  // auto tmp_ucc = coalescedRead(rhs_v[so+table_v[0].second]);
+  // coalescedWrite(rhs_v[so+table_v[0].second],tmp_ucc);
+#if 1
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+  accelerator_loopNB( ii,num*Nsimd, {
+
+    typedef decltype(coalescedRead(buffer[0])) compressed_t;
+    typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;
+
+    int i = ii/Nsimd;
+    compressed_t   tmp_c;
+    uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
+    uint64_t o = table_v[i].first;
+    compress.Compress(&tmp_c,0,tmp_uc);
+    coalescedWrite(buffer[off+o],tmp_c);
+  });
+#else
   accelerator_loopN( i,num, {
     compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
   });
+#endif
   // Further optimisation: i) streaming store the result
   // ii) software prefetch the first element of the next table entry
 }
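The rewritten gather iterates over num*Nsimd work items instead of num table entries: item ii handles table entry ii/Nsimd, and the lane it reads and writes is chosen inside coalescedRead/coalescedWrite from threadIdx.x % Nsimd, so the Nsimd threads that share one entry touch adjacent words of the same vector object. Here is a small sketch of the index decomposition, assuming a one-to-one mapping of threads to work items; the explicit lane arithmetic and the printout are illustrative, and Grid's accelerator_loopNB supplies the real thread mapping.

// Sketch (assumption: one thread per work item, so lane == ii % Nsimd).
#include <cstdio>

int main() {
  const int num   = 3;   // gather table entries (sites on the face)
  const int Nsimd = 4;   // scalar lanes per vector object

  for (int ii = 0; ii < num * Nsimd; ii++) {
    int entry = ii / Nsimd;  // which table entry this work item handles
    int lane  = ii % Nsimd;  // which SIMD lane of that entry it touches
    std::printf("work item %2d -> table entry %d, lane %d\n", ii, entry, lane);
  }
  return 0;
}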
@@ -224,7 +282,7 @@ public:
   }
 
   int face_table_computed;
-  Vector<Vector<std::pair<int,int> > > face_table ;
+  std::vector<Vector<std::pair<int,int> > > face_table ;
   Vector<int> surface_list;
 
   Vector<StencilEntry> _entries; // Resident in managed memory
@@ -259,10 +317,12 @@ public:
   double mergetime;
   double decompresstime;
   double comms_bytes;
+  double shm_bytes;
   double splicetime;
   double nosplicetime;
   double calls;
   std::vector<double> comm_bytes_thr;
+  std::vector<double> shm_bytes_thr;
   std::vector<double> comm_time_thr;
   std::vector<double> comm_enter_thr;
   std::vector<double> comm_leave_thr;
@@ -326,6 +386,8 @@ public:
                                              Packets[i].from_rank,
                                              Packets[i].bytes,i);
           comm_bytes_thr[mythread] += bytes;
+          shm_bytes_thr[mythread]  += 2*Packets[i].bytes-bytes; // Send + Recv.
+
         }
         comm_leave_thr[mythread]= usecond();
         comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
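The byte accounting introduced in this and the following hunks reads as follows: the value returned by StencilSendToRecvFrom / StencilSendToRecvFromBegin is taken to be the traffic that actually crossed MPI, while 2*Packets[i].bytes is the full send-plus-receive volume of the packet, so 2*Packets[i].bytes - bytes is the part that was satisfied through intra-node shared memory. A worked example under that reading follows; the packet size and variable names are invented, and the interpretation of the return value is inferred from the arithmetic rather than stated in the diff.

// Worked example of the shared-memory byte accounting (illustrative, not Grid code).
#include <cstdio>

int main() {
  const long long packet_bytes = 1 << 20;              // one 1 MiB halo packet (invented size)

  // Case 1: neighbour is off-node -> send and receive both go through MPI.
  long long mpi_bytes = 2 * packet_bytes;              // assumed return value of StencilSendToRecvFrom
  long long shm_bytes = 2 * packet_bytes - mpi_bytes;  // 0 bytes stayed inside the node
  std::printf("off-node: mpi=%lld shm=%lld\n", mpi_bytes, shm_bytes);

  // Case 2: neighbour is on-node -> MPI moves nothing; all traffic is shared memory.
  mpi_bytes = 0;
  shm_bytes = 2 * packet_bytes - mpi_bytes;            // 2 MiB stayed inside the node
  std::printf("on-node:  mpi=%lld shm=%lld\n", mpi_bytes, shm_bytes);
  return 0;
}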
@@ -343,11 +405,13 @@ public:
       double t0 = comm_enter_thr[t];
       double t1 = comm_leave_thr[t];
       comms_bytes+=comm_bytes_thr[t];
+      shm_bytes  +=shm_bytes_thr[t];
+
       comm_enter_thr[t] = 0.0;
       comm_leave_thr[t] = 0.0;
       comm_time_thr[t] = 0.0;
       comm_bytes_thr[t]=0;
+      shm_bytes_thr[t]=0;
+
       if ( first == 0.0 ) first = t0; // first is t0
       if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
@@ -362,12 +426,14 @@ public:
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
-      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
+      uint64_t bytes=_grid->StencilSendToRecvFromBegin(reqs[i],
                                                      Packets[i].send_buf,
                                                      Packets[i].to_rank,
                                                      Packets[i].recv_buf,
                                                      Packets[i].from_rank,
                                                      Packets[i].bytes,i);
+      comms_bytes+=bytes;
+      shm_bytes  +=2*Packets[i].bytes-bytes;
     }
   }
 
@@ -391,12 +457,14 @@ public:
     if (mythread < nthreads) {
       for (int i = mythread; i < Packets.size(); i += nthreads) {
         double start = usecond();
-        comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+        uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                      Packets[i].to_rank,
                                                      Packets[i].recv_buf,
                                                      Packets[i].from_rank,
                                                      Packets[i].bytes,i);
-        comm_time_thr[mythread] += usecond() - start;
+        comm_bytes_thr[mythread] += bytes;
+        shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
+        comm_time_thr[mythread]  += usecond() - start;
       }
     }
   }
@@ -610,7 +678,8 @@ public:
                    const std::vector<int> &directions,
                    const std::vector<int> &distances,
                    Parameters p)
-    : comm_bytes_thr(npoints),
+    : shm_bytes_thr(npoints),
+      comm_bytes_thr(npoints),
       comm_enter_thr(npoints),
       comm_leave_thr(npoints),
       comm_time_thr(npoints)
@@ -1189,6 +1258,7 @@ public:
       comm_bytes_thr[i]=0;
       comm_enter_thr[i]=0;
       comm_leave_thr[i]=0;
+      shm_bytes_thr[i]=0;
     }
     halogtime = 0.;
     mergetime = 0.;
@@ -1197,6 +1267,7 @@ public:
     splicetime = 0.;
     nosplicetime = 0.;
     comms_bytes = 0.;
+    shm_bytes = 0.;
     calls = 0.;
   };
 
@@ -1213,6 +1284,7 @@ public:
       if ( comm_time_thr[i]>0.0 ) {
         threaded = 1;
         comms_bytes += comm_bytes_thr[i];
+        shm_bytes   += shm_bytes_thr[i];
         if (t < comm_time_thr[i]) t = comm_time_thr[i];
       }
     }
@@ -1232,6 +1304,19 @@ public:
       std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
       std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
     }
+    if(shm_bytes>1.0){
+      PRINTIT(shm_bytes); // X bytes + R bytes
+      // Double this to include spin projection overhead with 2:1 ratio in wilson
+      auto gatheralltime = gathertime+gathermtime;
+      auto allbytes = comms_bytes+shm_bytes;
+      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
+      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
+
+      auto membytes = (shm_bytes + comms_bytes/2) // read/write
+                    + (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
+      std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
+      std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
+    }
     PRINTIT(mpi3synctime);
     PRINTIT(mpi3synctime_g);
     PRINTIT(shmmergetime);
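On the units in these reports: the timers (usecond) accumulate microseconds, so bytes divided by a time in microseconds gives MB/s and the trailing /1000. converts it to GB/s, while the factor NP/NN (ranks over nodes) rescales a per-rank figure to a per-node one. A quick check of that arithmetic with invented numbers follows; it is a sketch only, the 10 GB and 5 s figures and the NP/NN values are made up, and the reading of NP and NN as rank and node counts is inferred from the "per rank" / "per node" labels.

// Sketch of the bandwidth arithmetic used in the report (numbers are invented).
#include <cstdio>

int main() {
  double comms_bytes = 10.0e9;  // bytes moved over MPI (invented)
  double commtime    = 5.0e6;   // accumulated comms time in microseconds (5 s, invented)
  int    NP = 8;                // total MPI ranks (assumed meaning)
  int    NN = 2;                // participating nodes (assumed meaning)

  // bytes per microsecond is MB/s; the extra /1000. turns it into GB/s.
  double per_rank = comms_bytes / commtime / 1000.;   // 2 GB/s
  double per_node = per_rank * NP / NN;               // 8 GB/s with 4 ranks per node
  std::printf("Stencil %g GB/s per rank, %g GB/s per node\n", per_rank, per_node);
  return 0;
}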