mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Faster halo exchange
This commit is contained in:
parent
e064f17346
commit
5fac47a26d
@ -47,16 +47,15 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<class vobj> inline void ScatterSlice(const cshiftVector<typename vobj::scalar_object> &buf,
|
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||||
Lattice<vobj> &lat,
|
Lattice<vobj> &lat,
|
||||||
int x,
|
int x,
|
||||||
int dim,
|
int dim,
|
||||||
int offset=0)
|
int offset=0)
|
||||||
{
|
{
|
||||||
|
const int Nsimd=vobj::Nsimd();
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
autoView(lat_v, lat, AcceleratorRead);
|
|
||||||
|
|
||||||
GridBase *grid = lat.Grid();
|
GridBase *grid = lat.Grid();
|
||||||
Coordinate simd = grid->_simd_layout;
|
Coordinate simd = grid->_simd_layout;
|
||||||
int Nd = grid->Nd();
|
int Nd = grid->Nd();
|
||||||
@ -73,7 +72,8 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<typename vobj::
|
|||||||
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
|
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
|
||||||
|
|
||||||
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
|
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
|
||||||
|
int rNsimda= Nsimd/simd[dim]; // should be equal
|
||||||
|
assert(rNsimda==rNsimd);
|
||||||
int face_ovol=block*nblock;
|
int face_ovol=block*nblock;
|
||||||
|
|
||||||
// assert(buf.size()==face_ovol*rNsimd);
|
// assert(buf.size()==face_ovol*rNsimd);
|
||||||
@ -82,22 +82,26 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<typename vobj::
|
|||||||
//Let's make it work on GPU and then make a special accelerator_for that
|
//Let's make it work on GPU and then make a special accelerator_for that
|
||||||
//doesn't hide the SIMD direction and keeps explicit in the threadIdx
|
//doesn't hide the SIMD direction and keeps explicit in the threadIdx
|
||||||
//for cross platform
|
//for cross platform
|
||||||
|
// FIXME -- can put internal indices into thread loop
|
||||||
auto buf_p = & buf[0];
|
auto buf_p = & buf[0];
|
||||||
accelerator_for(ss, face_ovol,rNsimd,{
|
autoView(lat_v, lat, AcceleratorRead);
|
||||||
|
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
||||||
|
|
||||||
// scalar layout won't coalesce
|
// scalar layout won't coalesce
|
||||||
int olane=acceleratorSIMTlane(rNsimd);
|
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
|
||||||
sobj obj = buf_p[ss+olane*face_ovol+offset];
|
int olane=blane%rNsimd; // reduced lattice lane
|
||||||
|
int obit =blane/rNsimd;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// osite
|
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
|
||||||
////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
int b = ss%block;
|
int ssp = ss*simd[dim]+obit;
|
||||||
int n = ss/block;
|
int b = ssp%block;
|
||||||
|
int n = ssp/block;
|
||||||
int osite= b+n*stride + ox*block;
|
int osite= b+n*stride + ox*block;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// isite
|
// isite -- map lane within buffer to lane within lattice
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
int lane;
|
int lane;
|
||||||
@ -108,16 +112,18 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<typename vobj::
|
|||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Transfer into lattice - will coalesce
|
// Transfer into lattice - will coalesce
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
sobj obj = extractLane(blane,buf_p[ss+offset]);
|
||||||
insertLane(lane,lat_v[osite],obj);
|
insertLane(lane,lat_v[osite],obj);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void GatherSlice(cshiftVector<typename vobj::scalar_object> &buf,
|
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
||||||
const Lattice<vobj> &lat,
|
const Lattice<vobj> &lat,
|
||||||
int x,
|
int x,
|
||||||
int dim,
|
int dim,
|
||||||
int offset=0)
|
int offset=0)
|
||||||
{
|
{
|
||||||
|
const int Nsimd=vobj::Nsimd();
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
autoView(lat_v, lat, AcceleratorRead);
|
autoView(lat_v, lat, AcceleratorRead);
|
||||||
@ -149,20 +155,29 @@ template<class vobj> inline void GatherSlice(cshiftVector<typename vobj::scalar_
|
|||||||
//for cross platform
|
//for cross platform
|
||||||
//For CPU perhaps just run a loop over Nsimd
|
//For CPU perhaps just run a loop over Nsimd
|
||||||
auto buf_p = & buf[0];
|
auto buf_p = & buf[0];
|
||||||
accelerator_for(ss, face_ovol,rNsimd,{
|
std::cout << " simd["<<dim<<"] "<< simd[dim] <<std::endl;
|
||||||
|
std::cout << " simd "<< simd <<std::endl;
|
||||||
|
std::cout << " Nsimd "<< Nsimd <<std::endl;
|
||||||
|
std::cout << " rNsimd "<< rNsimd <<std::endl;
|
||||||
|
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
||||||
|
|
||||||
|
// scalar layout won't coalesce
|
||||||
|
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
|
||||||
|
int olane=blane%rNsimd; // reduced lattice lane
|
||||||
|
int obit =blane/rNsimd;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// osite
|
// osite
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
int b = ss%block;
|
int ssp = ss*simd[dim]+obit;
|
||||||
int n = ss/block;
|
int b = ssp%block;
|
||||||
|
int n = ssp/block;
|
||||||
int osite= b+n*stride + ox*block;
|
int osite= b+n*stride + ox*block;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// isite
|
// isite -- map lane within buffer to lane within lattice
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
int olane=acceleratorSIMTlane(rNsimd);
|
|
||||||
int lane;
|
int lane;
|
||||||
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
|
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
|
||||||
icoor[dim]=ix;
|
icoor[dim]=ix;
|
||||||
@ -171,10 +186,23 @@ template<class vobj> inline void GatherSlice(cshiftVector<typename vobj::scalar_
|
|||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Take out of lattice
|
// Take out of lattice
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
|
||||||
sobj obj = extractLane(lane,lat_v[osite]);
|
sobj obj = extractLane(lane,lat_v[osite]);
|
||||||
buf_p[ss+olane*face_ovol+offset] = obj;
|
insertLane(blane,buf_p[ss+offset],obj);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
/*
|
||||||
|
int words =block*nblock/simd[dim];
|
||||||
|
std::vector<vobj> tbuf(words);
|
||||||
|
acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj));
|
||||||
|
typedef typename vobj::scalar_type scalar;
|
||||||
|
scalar *sbuf = (scalar *)&tbuf[0];
|
||||||
|
scalar tmp=0.0;
|
||||||
|
for(int w=0;w<words*sizeof(vobj)/sizeof(scalar);w++){
|
||||||
|
tmp=tmp+conjugate(sbuf[w])*sbuf[w];
|
||||||
|
}
|
||||||
|
std::cout << " Gathered buffer norm "<<tmp<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -260,13 +288,13 @@ public:
|
|||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> ExchangeTest(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
||||||
{
|
{
|
||||||
GridBase *old_grid = in.Grid();
|
GridBase *old_grid = in.Grid();
|
||||||
int dims = old_grid->Nd();
|
int dims = old_grid->Nd();
|
||||||
Lattice<vobj> tmp = in;
|
Lattice<vobj> tmp = in;
|
||||||
for(int d=0;d<dims;d++){
|
for(int d=0;d<dims;d++){
|
||||||
tmp = ExpandTest(d,tmp,cshift); // rvalue && assignment
|
tmp = ExpandPeriodic(d,tmp,cshift); // rvalue && assignment
|
||||||
}
|
}
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
@ -337,9 +365,6 @@ public:
|
|||||||
InsertSliceLocal(shifted,padded,x,x,dim);
|
InsertSliceLocal(shifted,padded,x,x,dim);
|
||||||
}
|
}
|
||||||
tins += usecond() - t;
|
tins += usecond() - t;
|
||||||
// std::cout << GridLogMessage << "dimension " <<dim<<std::endl;
|
|
||||||
// DumpSliceNorm(std::string("Old_exchange from"),in,dim);
|
|
||||||
// DumpSliceNorm(std::string("Old_exchange to "),padded,dim);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
|
||||||
@ -348,7 +373,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> ExpandTest(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
||||||
{
|
{
|
||||||
Coordinate processors=unpadded_grid->_processors;
|
Coordinate processors=unpadded_grid->_processors;
|
||||||
GridBase *old_grid = in.Grid();
|
GridBase *old_grid = in.Grid();
|
||||||
@ -412,7 +437,7 @@ public:
|
|||||||
Coordinate simd= from.Grid()->_simd_layout;
|
Coordinate simd= from.Grid()->_simd_layout;
|
||||||
int ld = lds[dimension];
|
int ld = lds[dimension];
|
||||||
int nld = to.Grid()->_ldimensions[dimension];
|
int nld = to.Grid()->_ldimensions[dimension];
|
||||||
|
const int Nsimd = vobj::Nsimd();
|
||||||
|
|
||||||
assert(depth<=lds[dimension]); // A must be on neighbouring node
|
assert(depth<=lds[dimension]); // A must be on neighbouring node
|
||||||
assert(depth>0); // A caller bug if zero
|
assert(depth>0); // A caller bug if zero
|
||||||
@ -424,16 +449,17 @@ public:
|
|||||||
for(int d=0;d<lds.size();d++){
|
for(int d=0;d<lds.size();d++){
|
||||||
if ( d!= dimension) buffer_size=buffer_size*lds[d];
|
if ( d!= dimension) buffer_size=buffer_size*lds[d];
|
||||||
}
|
}
|
||||||
int rNsimd = vobj::Nsimd() / simd[dimension];
|
buffer_size = buffer_size / Nsimd;
|
||||||
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] *rNsimd);
|
int rNsimd = Nsimd / simd[dimension];
|
||||||
|
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
||||||
|
|
||||||
static cshiftVector<sobj> send_buf;
|
static cshiftVector<vobj> send_buf;
|
||||||
static cshiftVector<sobj> recv_buf;
|
static cshiftVector<vobj> recv_buf;
|
||||||
send_buf.resize(buffer_size*2*depth);
|
send_buf.resize(buffer_size*2*depth);
|
||||||
recv_buf.resize(buffer_size*2*depth);
|
recv_buf.resize(buffer_size*2*depth);
|
||||||
|
|
||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
int bytes = words * sizeof(sobj);
|
int bytes = words * sizeof(vobj);
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// Gather all surface terms up to depth "d"
|
// Gather all surface terms up to depth "d"
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
@ -491,14 +517,14 @@ public:
|
|||||||
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
||||||
}
|
}
|
||||||
t_scatter= usecond() - t;
|
t_scatter= usecond() - t;
|
||||||
|
// DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
||||||
// DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user