Mirror of https://github.com/paboyle/Grid.git (synced 2025-04-04 11:15:55 +01:00)

Commit c9fadf97a5: "Simplify the compressor interface again."
Parent commit: c650bb3f3d
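In outline, the commit makes compressors plain unary functors and removes the compressor from the stencil's class template parameters, passing it per call instead; the Wilson code then adds one tiny, fully inlinable compressor per direction. A minimal sketch of the new interface shape, condensed from the hunks below (not a complete header):

    // A compressor is now a unary functor over a site object.
    template<class vobj>
    class SimpleCompressor {
    public:
      void Point(int) {};                  // point-selection hook, a no-op here
      vobj operator() (const vobj &arg) {  // was: (arg,dimension,plane,osite,grid)
        return arg;                        // identity compression
      }
    };

    // The stencil drops its class-level compressor parameter...
    template<class vobj,class cobj>        // was: <vobj,cobj,compressor>
    class CartesianStencil;
    // ...and each halo-exchange method takes the compressor per call:
    //   template<class compressor>
    //   void HaloExchange(const Lattice<vobj> &source,compressor &compress);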
@@ -108,6 +108,14 @@ public:
 #endif
 
 #endif
+      _Tp tmp;
+#undef  FIRST_TOUCH_OPTIMISE
+#ifdef  FIRST_TOUCH_OPTIMISE
+#pragma omp parallel for
+      for(int i=0;i<__n;i++){
+        ptr[i]=tmp;
+      }
+#endif
       return ptr;
     }
 
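The added block is compiled out by the `#undef` directly above the `#ifdef`; when enabled it initialises the freshly allocated buffer in parallel so that, under the first-touch NUMA policy common on Linux, each page is placed on the memory node of the thread that will later work on it. A standalone sketch of the idea, assuming an OpenMP build (not part of the commit):

    #include <cstdlib>

    // On first-touch NUMA systems the first write to a page decides its
    // physical placement, so touch the buffer with the same thread layout
    // that the compute loops will use.
    double *numa_aware_alloc(long n) {
      double *ptr = static_cast<double *>(std::malloc(n * sizeof(double)));
    #pragma omp parallel for
      for (long i = 0; i < n; i++) {
        ptr[i] = 0.0;   // placement-defining first write
      }
      return ptr;
    }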
@@ -77,7 +77,7 @@ namespace Grid {
   int _around_the_world;
 };
 
-template<class vobj,class cobj, class compressor>
+template<class vobj,class cobj>
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 public:
 
@@ -580,6 +580,7 @@ PARALLEL_FOR_LOOP
   }
 
 
+  template<class compressor>
   std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
     Mergers.resize(0);
     Packets.resize(0);
@@ -587,6 +588,7 @@ PARALLEL_FOR_LOOP
     return std::thread([&] { this->Communicate(); });
   }
 
+  template<class compressor>
   void HaloExchange(const Lattice<vobj> &source,compressor &compress)
   {
     auto thr = HaloExchangeBegin(source,compress);
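Worth noting here: `HaloExchangeBegin` returns a `std::thread` that runs `Communicate()` in the background, while `HaloExchange` simply begins and then joins. A caller that wants to overlap communication with interior computation can hold the thread, following the pattern the WilsonFermion5D hunks below use (sketch only):

    auto handle = st.HaloExchangeBegin(in, compressor); // comms proceed on a helper thread
    // ... interior, halo-independent compute here ...
    st.HaloExchangeComplete(handle);                    // merge comms buffers and join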
@@ -601,20 +603,9 @@ PARALLEL_FOR_LOOP
     jointime+=usecond();
   }
 
-  void HaloGather(const Lattice<vobj> &source,compressor &compress)
+  template<class compressor>
+  void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
   {
-    // conformable(source._grid,_grid);
-    assert(source._grid==_grid);
-    halogtime-=usecond();
-
-    assert (comm_buf.size() == _unified_buffer_size );
-    u_comm_offset=0;
-
-    // Gather all comms buffers
-    for(int point = 0 ; point < _npoints; point++) {
-
-      compress.Point(point);
-
     int dimension    = _directions[point];
     int displacement = _distances[point];
 
@@ -662,12 +653,29 @@ PARALLEL_FOR_LOOP
       }
     }
   }
+  }
+
+  template<class compressor>
+  void HaloGather(const Lattice<vobj> &source,compressor &compress)
+  {
+    // conformable(source._grid,_grid);
+    assert(source._grid==_grid);
+    halogtime-=usecond();
+
+    assert (comm_buf.size() == _unified_buffer_size );
+    u_comm_offset=0;
+
+    // Gather all comms buffers
+    for(int point = 0 ; point < _npoints; point++) {
+      compress.Point(point);
+      HaloGatherDir(source,compress,point);
     }
 
     assert(u_comm_offset==_unified_buffer_size);
     halogtime+=usecond();
   }
 
+  template<class compressor>
   void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
   {
     typedef typename cobj::vector_type vector_type;
@@ -728,6 +736,7 @@ PARALLEL_FOR_LOOP
   }
 
 
+  template<class compressor>
   void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
   {
     const int Nsimd = _grid->Nsimd();
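The net effect of the stencil hunks above: the body of the old per-point gather loop is factored out as `HaloGatherDir(source,compress,point)`, and `HaloGather` becomes a thin driver over it. The payoff is that a caller may now gather each direction with a different, statically typed compressor; `WilsonStencil::HaloGatherOpt` later in this commit does exactly that. A hypothetical two-direction sketch (illustrative names, bookkeeping such as `u_comm_offset` omitted):

    XpProjector xp;   // hypothetical direction-specific compressors
    XmProjector xm;
    stencil.HaloGatherDir(src, xp, 0);  // each call deduces its own compressor type
    stencil.HaloGatherDir(src, xm, 4);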
@@ -200,7 +200,7 @@ namespace Grid {
     ////////////////////
     Geometry geom;
     GridBase * _grid;
-    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
+    CartesianStencil<siteVector,siteVector> Stencil;
 
     std::vector<CoarseMatrix> A;
 
@@ -35,7 +35,7 @@ class SimpleCompressor {
 public:
   void Point(int) {};
 
-  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
+  vobj operator() (const vobj &arg) {
     return arg;
   }
 };
@@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2
     for(int b=0;b<e2;b++){
       int o  = n*rhs._grid->_slice_stride[dimension];
       int bo = n*rhs._grid->_slice_block[dimension];
-      buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+      buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
     }
   }
 } else {
@@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2
       int o   = n*rhs._grid->_slice_stride[dimension];
       int ocb = 1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
       if ( ocb &cbmask ) {
-        buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+        buffer[off+bo++]=compress(rhs._odata[so+o+b]);
       }
     }
   }
@@ -97,16 +97,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block[dimension];
+  int n1=rhs._grid->_slice_stride[dimension];
+  int n2=rhs._grid->_slice_block[dimension];
   if ( cbmask ==0x3){
 PARALLEL_NESTED_LOOP2
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 
-        int o      = n*rhs._grid->_slice_stride[dimension];
-        int offset = b+n*rhs._grid->_slice_block[dimension];
+        int o      = n*n1;
+        int offset = b+n*n2;
 
-        cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+        cobj temp =compress(rhs._odata[so+o+b]);
         extract<cobj>(temp,pointers,offset);
 
       }
@@ -121,7 +122,7 @@ PARALLEL_NESTED_LOOP2
        int offset = b+n*rhs._grid->_slice_block[dimension];
 
        if ( ocb & cbmask ) {
-         cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+         cobj temp =compress(rhs._odata[so+o+b]);
          extract<cobj>(temp,pointers,offset);
        }
      }
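A small loop-invariant hoist rides along in `Gather_plane_extract`: the `_slice_stride` and `_slice_block` lookups move out of the doubly nested loop into the locals `n1` and `n2`, so the hot loop indexes through two locals instead of chasing `rhs._grid->` members every iteration. The pattern in isolation (illustrative):

    const int n1 = grid->_slice_stride[dimension];  // loaded once, outside the loops
    const int n2 = grid->_slice_block[dimension];
    for (int n = 0; n < e1; n++) {
      for (int b = 0; b < e2; b++) {
        int o      = n * n1;       // was: n * grid->_slice_stride[dimension]
        int offset = b + n * n2;   // was: b + n * grid->_slice_block[dimension]
        // ... compress and extract ...
      }
    }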
@@ -130,7 +130,7 @@ namespace Grid {
 
     typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
     typedef WilsonImplParams ImplParams;
-    typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+    typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 
     ImplParams Params;
 
@@ -205,7 +205,7 @@ PARALLEL_FOR_LOOP
     typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 
     typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
-    typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+    typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 
     typedef GparityWilsonImplParams ImplParams;
 
@@ -48,12 +48,7 @@ namespace QCD {
       mu=p;
     };
 
-    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
-      return spinproject(in);
-    }
-
-    SiteHalfSpinor spinproject(const SiteSpinor &in)
-    {
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
       SiteHalfSpinor ret;
       int mudag=mu;
       if (!dag) {
@@ -92,6 +87,173 @@ namespace QCD {
     }
   };
 
+  /////////////////////////
+  // optimised versions
+  /////////////////////////
+
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonXpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjXp(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonYpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjYp(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonZpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjZp(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonTpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjTp(ret,in);
+      return ret;
+    }
+  };
+
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonXmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjXm(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonYmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjYm(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonZmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjZm(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonTmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjTm(ret,in);
+      return ret;
+    }
+  };
+
+  // Fast comms buffer manipulation which should inline right through (avoid direction
+  // dependent logic that prevents inlining
+  template<class vobj,class cobj>
+  class WilsonStencil : public CartesianStencil<vobj,cobj> {
+  public:
+
+    WilsonStencil(GridBase *grid,
+                  int npoints,
+                  int checkerboard,
+                  const std::vector<int> &directions,
+                  const std::vector<int> &distances) : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
+    { };
+
+    template < class compressor>
+    std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
+      this->Mergers.resize(0);
+      this->Packets.resize(0);
+      this->HaloGatherOpt(source,compress);
+      return std::thread([&] { this->Communicate(); });
+    }
+
+    template < class compressor>
+    void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
+    {
+      auto thr = this->HaloExchangeOptBegin(source,compress);
+      this->HaloExchangeOptComplete(thr);
+    }
+
+    void HaloExchangeOptComplete(std::thread &thr)
+    {
+      this->CommsMerge(); // spins
+      this->jointime-=usecond();
+      thr.join();
+      this->jointime+=usecond();
+    }
+
+    template < class compressor>
+    void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
+    {
+      // conformable(source._grid,_grid);
+      assert(source._grid==this->_grid);
+      this->halogtime-=usecond();
+
+      assert (this->comm_buf.size() == this->_unified_buffer_size );
+      this->u_comm_offset=0;
+
+      int dag = compress.dag;
+      static std::vector<int> dirs(8);
+      for(int mu=0;mu<4;mu++){
+        if ( dag ) {
+          dirs[mu]  =mu;
+          dirs[mu+4]=mu+4;
+        } else {
+          dirs[mu]  =mu+4;
+          dirs[mu+4]=mu;
+        }
+      }
+
+      WilsonXpCompressor<cobj,vobj> XpCompress;
+      this->HaloGatherDir(source,XpCompress,dirs[0]);
+
+      WilsonYpCompressor<cobj,vobj> YpCompress;
+      this->HaloGatherDir(source,YpCompress,dirs[1]);
+
+      WilsonZpCompressor<cobj,vobj> ZpCompress;
+      this->HaloGatherDir(source,ZpCompress,dirs[2]);
+
+      WilsonTpCompressor<cobj,vobj> TpCompress;
+      this->HaloGatherDir(source,TpCompress,dirs[3]);
+
+      WilsonXmCompressor<cobj,vobj> XmCompress;
+      this->HaloGatherDir(source,XmCompress,dirs[4]);
+
+      WilsonYmCompressor<cobj,vobj> YmCompress;
+      this->HaloGatherDir(source,YmCompress,dirs[5]);
+
+      WilsonZmCompressor<cobj,vobj> ZmCompress;
+      this->HaloGatherDir(source,ZmCompress,dirs[6]);
+
+      WilsonTmCompressor<cobj,vobj> TmCompress;
+      this->HaloGatherDir(source,TmCompress,dirs[7]);
+
+      assert(this->u_comm_offset==this->_unified_buffer_size);
+      this->halogtime+=usecond();
+    }
+
+  };
+
+
 }} // namespace close
 #endif
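The eight `Wilson{X,Y,Z,T}{p,m}Compressor` classes trade one virtual, direction-switching call per site for eight trivially inlinable functors: each `operator()` is a single fixed spin projection, so the call in the gather loop can inline right through, as the comment on `WilsonStencil` says. `HaloGatherOpt` then walks the eight directions explicitly, instantiating the matching compressor for each; the `dag` flag only permutes which direction index each projection feeds. The dispatch being eliminated, in miniature (illustrative):

    // before: runtime direction logic behind a virtual call, per site
    //   virtual SiteHalfSpinor operator() (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid);
    // after: direction fixed at compile time, no branch, inlines to one projection
    WilsonXpCompressor<SiteHalfSpinor,SiteSpinor> xp;
    SiteHalfSpinor half = xp(in);   // compiles down to spProjXp(half, in)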
@@ -304,8 +304,8 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
   int nwork = U._grid->oSites();
 
   commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
+  auto handle = st.HaloExchangeOptBegin(in,compressor);
+  st.HaloExchangeOptComplete(handle);
   commtime +=usecond();
 
   jointime -=usecond();
@@ -440,7 +440,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
   int nwork = U._grid->oSites();
 
   commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
+  auto handle = st.HaloExchangeOptBegin(in,compressor);
   commtime +=usecond();
 
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@@ -498,7 +498,7 @@ PARALLEL_FOR_LOOP
   dslashtime +=usecond();
 
   jointime -=usecond();
-  st.HaloExchangeComplete(handle);
+  st.HaloExchangeOptComplete(handle);
   jointime +=usecond();
 
   local = false;
@@ -44,8 +44,8 @@ template<class vsimd,class scalar>
 inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y,
                     std::vector<scalar *> &extracted,int offset){
   // FIXME: bounce off memory is painful
+  static const int Nsimd=vsimd::Nsimd();
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;
 
   scalar*buf = (scalar *)y;
@@ -59,8 +59,8 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y,
                   std::vector<scalar *> &extracted,int offset){
+  static const int Nsimd=vsimd::Nsimd();
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
   // replicate n-fold. Use to allow Integer masks to
   // predicate floating point of various width assignments and maintain conformable.
@@ -85,6 +85,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
   scalar *buf = (scalar *)&y;
   for(int i=0;i<Nextr;i++){
     extracted[i]=buf[i*s];
+#ifdef PARANOID
     for(int ii=1;ii<s;ii++){
       if ( buf[i*s]!=buf[i*s+ii] ){
         std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
@@ -96,6 +97,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
       }
       assert(buf[i*s]==buf[i*s+ii]);
     }
+#endif
   }
 
 };
@@ -106,7 +108,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type &y,std::vector<scalar> &extracted){
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
+  static const int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;
   scalar *buf = (scalar *)&y;
 
@@ -125,9 +127,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;
 
-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
   int Nextr=extracted.size();
-  const int words=sizeof(vobj)/sizeof(vector_type);
   int s=Nsimd/Nextr;
 
   std::vector<scalar_type *> pointers(Nextr);
@@ -148,8 +150,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;
 
-  const int words=sizeof(vobj)/sizeof(vector_type);
-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
 
   int Nextr=extracted.size();
   int s = Nsimd/Nextr;
@@ -172,8 +174,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;
 
-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
 
   int Nextr = extracted.size();
   int splat=Nsimd/Nextr;
@@ -224,8 +226,8 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;
 
-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
 
   scalar_type *vp = (scalar_type *)&vec;
 
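The recurring change in these final hunks is `int Nsimd = vsimd::Nsimd();` (or `const int`) becoming `static const int`, and likewise for `words`. Both quantities are fixed per template instantiation, so where the initialiser folds to a constant the loop trip counts in the extract/merge routines become known at compile time and the loops can be fully unrolled, rather than re-deriving the bounds as locals on every call. The shape of the change (sketch, not the library code):

    template<class vsimd, class scalar>
    inline void extract_sketch(scalar *out) {
      static const int Nsimd = vsimd::Nsimd();              // per-instantiation constant
      static const int words = sizeof(vsimd)/sizeof(scalar);
      for (int i = 0; i < Nsimd; i++) {                     // unrollable: bound is constant-foldable
        out[i] = scalar(0);
      }
    }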