mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 11:15:55 +01:00
Stencil now runs with coalesced accesses
This commit is contained in:
parent a584b16c4a
commit 6c4da3bbc7
@@ -55,6 +55,45 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression
///////////////////////////////////////////////////////////////////

#ifdef __CUDA_ARCH__
//////////////////////////////////////////
// Extract and insert slices on the GPU
//////////////////////////////////////////
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  int lane = threadIdx.x % Nsimd;
  return extractLane(lane,vec);
}
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  int lane = threadIdx.x % Nsimd;
  insertLane(lane,vec,extracted);
}
#else
//////////////////////////////////////////
// Trivial mapping of vectors on host
//////////////////////////////////////////
template<class vobj> accelerator_inline
vobj coalescedRead(const vobj & __restrict__ vec)
{
  return vec;
}
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted)
{
  vec = extracted;
}
#endif
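
A rough standalone sketch of the lane mapping above, not part of this commit (the vec4 struct, kernel name and Nsimd=4 are illustrative assumptions): Nsimd consecutive threads each own one scalar lane of a vector object, so adjacent threads touch adjacent words and the global loads/stores coalesce. Launched with a block size that is a multiple of Nsimd, ii%Nsimd equals threadIdx.x%Nsimd, which is the lane index coalescedRead/coalescedWrite use.

struct vec4 { float lane[4]; };                    // stand-in for a SIMD vobj, Nsimd=4

__global__ void scale_coalesced(vec4 *field, float a, int nsite)
{
  constexpr int Nsimd = 4;
  int ii = blockIdx.x*blockDim.x + threadIdx.x;    // one thread per (site,lane) pair
  if (ii >= nsite*Nsimd) return;
  int site = ii / Nsimd;                           // which vector object
  int lane = ii % Nsimd;                           // which scalar lane inside it
  float tmp = field[site].lane[lane];              // analogue of coalescedRead
  field[site].lane[lane] = a*tmp;                  // analogue of coalescedWrite
}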

void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                 int off,Vector<std::pair<int,int> > & table);

@@ -66,11 +105,30 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
{
  int num=table.size();
  std::pair<int,int> *table_v = & table[0];

  auto rhs_v = rhs.View();
  //  auto tmp_ucc = coalescedRead(rhs_v[so+table_v[0].second]);
  //  coalescedWrite(rhs_v[so+table_v[0].second],tmp_ucc);
#if 1
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  accelerator_loopNB( ii,num*Nsimd, {

    typedef decltype(coalescedRead(buffer[0])) compressed_t;
    typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;

    int i = ii/Nsimd;
    compressed_t   tmp_c;
    uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
    uint64_t o = table_v[i].first;
    compress.Compress(&tmp_c,0,tmp_uc);
    coalescedWrite(buffer[off+o],tmp_c);
  });
#else
  accelerator_loopN( i,num, {
    compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
  });
#endif
  // Further optimisation: i) streaming store the result
  //                      ii) software prefetch the first element of the next table entry
}
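
A small host-side worked example, again an illustration rather than Grid code, of how the flattened index in accelerator_loopNB splits: each of the num table entries is handed Nsimd consecutive work items, one per SIMD lane, so the entry is i = ii/Nsimd and the lane is picked up from threadIdx.x inside coalescedRead/coalescedWrite (effectively ii%Nsimd when the block size is a multiple of Nsimd). The sizes num=3 and Nsimd=4 are made up for the printout.

#include <cstdio>

int main()
{
  const int num = 3, Nsimd = 4;
  for (int ii = 0; ii < num*Nsimd; ii++) {
    int i    = ii / Nsimd;                 // table entry / site pair, as in the loop above
    int lane = ii % Nsimd;                 // SIMD lane handled by this work item
    printf("work item %2d -> entry %d lane %d\n", ii, i, lane);
  }
  return 0;
}
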
@@ -224,7 +282,7 @@ public:
  }

  int face_table_computed;
  Vector<Vector<std::pair<int,int> > > face_table ;
  std::vector<Vector<std::pair<int,int> > > face_table ;
  Vector<int> surface_list;

  Vector<StencilEntry> _entries; // Resident in managed memory
@@ -259,10 +317,12 @@ public:
  double mergetime;
  double decompresstime;
  double comms_bytes;
  double shm_bytes;
  double splicetime;
  double nosplicetime;
  double calls;
  std::vector<double> comm_bytes_thr;
  std::vector<double> shm_bytes_thr;
  std::vector<double> comm_time_thr;
  std::vector<double> comm_enter_thr;
  std::vector<double> comm_leave_thr;
@@ -326,6 +386,8 @@ public:
                                                   Packets[i].from_rank,
                                                   Packets[i].bytes,i);
        comm_bytes_thr[mythread] += bytes;
        shm_bytes_thr[mythread]  += 2*Packets[i].bytes-bytes; // Send + Recv.

      }
      comm_leave_thr[mythread]= usecond();
      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
@@ -343,11 +405,13 @@ public:
      double t0 = comm_enter_thr[t];
      double t1 = comm_leave_thr[t];
      comms_bytes+=comm_bytes_thr[t];
      shm_bytes  +=shm_bytes_thr[t];

      comm_enter_thr[t] = 0.0;
      comm_leave_thr[t] = 0.0;
      comm_time_thr[t]  = 0.0;
      comm_bytes_thr[t]=0;
      shm_bytes_thr[t]=0;

      if ( first == 0.0 ) first = t0; // first is t0
      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
@@ -362,12 +426,14 @@ public:
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
      uint64_t bytes=_grid->StencilSendToRecvFromBegin(reqs[i],
                                                       Packets[i].send_buf,
                                                       Packets[i].to_rank,
                                                       Packets[i].recv_buf,
                                                       Packets[i].from_rank,
                                                       Packets[i].bytes,i);
      comms_bytes+=bytes;
      shm_bytes  +=2*Packets[i].bytes-bytes;
    }
  }
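
One way to read the byte accounting above, as a standalone sketch rather than Grid code: assuming the value returned by StencilSendToRecvFromBegin counts only the bytes that actually crossed the network (send plus receive), everything else in a packet of Packets[i].bytes moved through shared memory, hence the 2*Packets[i].bytes-bytes term. That assumption about the return value is mine, not stated in this commit.

#include <cstdint>
#include <cassert>

static uint64_t shm_traffic(uint64_t packet_bytes, uint64_t mpi_bytes)
{
  // Whatever did not go over MPI moved through shared memory,
  // counted once for the send and once for the receive: 2*P - B.
  return 2*packet_bytes - mpi_bytes;
}

int main()
{
  assert(shm_traffic(1024, 0)      == 2048); // fully intra-node: both halves via shm
  assert(shm_traffic(1024, 2*1024) == 0);    // fully off-node: nothing via shm
  return 0;
}
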
@@ -391,12 +457,14 @@ public:
    if (mythread < nthreads) {
      for (int i = mythread; i < Packets.size(); i += nthreads) {
        double start = usecond();
        comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                                 Packets[i].to_rank,
                                                                 Packets[i].recv_buf,
                                                                 Packets[i].from_rank,
                                                                 Packets[i].bytes,i);
        comm_time_thr[mythread] += usecond() - start;
        uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                     Packets[i].to_rank,
                                                     Packets[i].recv_buf,
                                                     Packets[i].from_rank,
                                                     Packets[i].bytes,i);
        comm_bytes_thr[mythread] += bytes;
        shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
        comm_time_thr[mythread] += usecond() - start;
      }
    }
  }
@@ -610,7 +678,8 @@ public:
          const std::vector<int> &directions,
          const std::vector<int> &distances,
          Parameters p)
    : comm_bytes_thr(npoints),
    : shm_bytes_thr(npoints),
      comm_bytes_thr(npoints),
      comm_enter_thr(npoints),
      comm_leave_thr(npoints),
      comm_time_thr(npoints)
@@ -1189,6 +1258,7 @@ public:
      comm_bytes_thr[i]=0;
      comm_enter_thr[i]=0;
      comm_leave_thr[i]=0;
      shm_bytes_thr[i]=0;
    }
    halogtime = 0.;
    mergetime = 0.;
@@ -1197,6 +1267,7 @@ public:
    splicetime = 0.;
    nosplicetime = 0.;
    comms_bytes = 0.;
    shm_bytes = 0.;
    calls = 0.;
  };

@@ -1213,6 +1284,7 @@ public:
      if ( comm_time_thr[i]>0.0 ) {
        threaded = 1;
        comms_bytes += comm_bytes_thr[i];
        shm_bytes   += shm_bytes_thr[i];
        if (t < comm_time_thr[i]) t = comm_time_thr[i];
      }
    }
@@ -1232,6 +1304,19 @@ public:
      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
    }
    if(shm_bytes>1.0){
      PRINTIT(shm_bytes); // X bytes + R bytes
      // Double this to include spin projection overhead with 2:1 ratio in wilson
      auto gatheralltime = gathertime+gathermtime;
      auto allbytes = comms_bytes+shm_bytes;
      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;

      auto membytes = (shm_bytes + comms_bytes/2) // read/write
                    + (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
      std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
    }
    PRINTIT(mpi3synctime);
    PRINTIT(mpi3synctime_g);
    PRINTIT(shmmergetime);