mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
Massive compressor rework to support reduced precision comms
This commit is contained in:
parent
3b7de792d5
commit
4a340aa5ca
@ -34,8 +34,20 @@ template<class vobj>
|
|||||||
class SimpleCompressor {
|
class SimpleCompressor {
|
||||||
public:
|
public:
|
||||||
void Point(int) {};
|
void Point(int) {};
|
||||||
|
inline int CommDatumSize(void) { return sizeof(vobj); }
|
||||||
vobj operator() (const vobj &arg) {
|
inline bool DecompressionStep(void) { return false; }
|
||||||
|
inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; }
|
||||||
|
inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){
|
||||||
|
exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
|
||||||
|
}
|
||||||
|
inline void Decompress(vobj *out,vobj *in, int o){ assert(0); }
|
||||||
|
inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in,
|
||||||
|
int j,int k, int m,int type){
|
||||||
|
exchange(out0[j],out1[j],in[k],in[m],type);
|
||||||
|
}
|
||||||
|
// For cshift. Cshift should drop compressor coupling altogether
|
||||||
|
// because I had to decouple the code from the Stencil anyway
|
||||||
|
inline vobj operator() (const vobj &arg) {
|
||||||
return arg;
|
return arg;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -35,7 +35,6 @@ directory
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
// Template parameter class constructs to package
|
// Template parameter class constructs to package
|
||||||
// externally control Fermion implementations
|
// externally control Fermion implementations
|
||||||
@ -90,6 +89,52 @@ namespace QCD {
|
|||||||
// }
|
// }
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <class T> struct SamePrecisionMapper {
|
||||||
|
typedef T HigherPrecVector ;
|
||||||
|
typedef T LowerPrecVector ;
|
||||||
|
};
|
||||||
|
template <class T> struct LowerPrecisionMapper { };
|
||||||
|
template <> struct LowerPrecisionMapper<vRealF> {
|
||||||
|
typedef vRealF HigherPrecVector ;
|
||||||
|
typedef vRealH LowerPrecVector ;
|
||||||
|
};
|
||||||
|
template <> struct LowerPrecisionMapper<vRealD> {
|
||||||
|
typedef vRealD HigherPrecVector ;
|
||||||
|
typedef vRealF LowerPrecVector ;
|
||||||
|
};
|
||||||
|
template <> struct LowerPrecisionMapper<vComplexF> {
|
||||||
|
typedef vComplexF HigherPrecVector ;
|
||||||
|
typedef vComplexH LowerPrecVector ;
|
||||||
|
};
|
||||||
|
template <> struct LowerPrecisionMapper<vComplexD> {
|
||||||
|
typedef vComplexD HigherPrecVector ;
|
||||||
|
typedef vComplexF LowerPrecVector ;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct CoeffReal {
|
||||||
|
public:
|
||||||
|
typedef RealD _Coeff_t;
|
||||||
|
static const int Nhcs = 2;
|
||||||
|
template<class Simd> using PrecisionMapper = SamePrecisionMapper<Simd>;
|
||||||
|
};
|
||||||
|
struct CoeffRealHalfComms {
|
||||||
|
public:
|
||||||
|
typedef RealD _Coeff_t;
|
||||||
|
static const int Nhcs = 1;
|
||||||
|
template<class Simd> using PrecisionMapper = LowerPrecisionMapper<Simd>;
|
||||||
|
};
|
||||||
|
struct CoeffComplex {
|
||||||
|
public:
|
||||||
|
typedef ComplexD _Coeff_t;
|
||||||
|
static const int Nhcs = 2;
|
||||||
|
template<class Simd> using PrecisionMapper = SamePrecisionMapper<Simd>;
|
||||||
|
};
|
||||||
|
struct CoeffComplexHalfComms {
|
||||||
|
public:
|
||||||
|
typedef ComplexD _Coeff_t;
|
||||||
|
static const int Nhcs = 1;
|
||||||
|
template<class Simd> using PrecisionMapper = LowerPrecisionMapper<Simd>;
|
||||||
|
};
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Implementation dependent fermion types
|
// Implementation dependent fermion types
|
||||||
@ -114,37 +159,40 @@ namespace QCD {
|
|||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
// Single flavour four spinors with colour index
|
// Single flavour four spinors with colour index
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
|
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
|
||||||
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
static const int Dimension = Representation::Dimension;
|
static const int Dimension = Representation::Dimension;
|
||||||
|
static const bool LsVectorised=false;
|
||||||
|
static const int Nhcs = Options::Nhcs;
|
||||||
|
|
||||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||||
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
//Necessary?
|
//Necessary?
|
||||||
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
||||||
|
|
||||||
const bool LsVectorised=false;
|
typedef typename Options::_Coeff_t Coeff_t;
|
||||||
typedef _Coeff_t Coeff_t;
|
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
|
||||||
|
|
||||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
|
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
|
||||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
|
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
|
||||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||||
|
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||||
|
|
||||||
typedef iImplSpinor<Simd> SiteSpinor;
|
typedef iImplSpinor<Simd> SiteSpinor;
|
||||||
typedef iImplPropagator<Simd> SitePropagator;
|
typedef iImplPropagator<Simd> SitePropagator;
|
||||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||||
|
typedef iImplHalfCommSpinor<SimdL> SiteHalfCommSpinor;
|
||||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||||
|
|
||||||
typedef Lattice<SiteSpinor> FermionField;
|
typedef Lattice<SiteSpinor> FermionField;
|
||||||
typedef Lattice<SitePropagator> PropagatorField;
|
typedef Lattice<SitePropagator> PropagatorField;
|
||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||||
|
|
||||||
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
|
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
|
||||||
typedef WilsonImplParams ImplParams;
|
typedef WilsonImplParams ImplParams;
|
||||||
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
|
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
|
||||||
|
|
||||||
@ -209,21 +257,24 @@ namespace QCD {
|
|||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Single flavour four spinors with colour index, 5d redblack
|
// Single flavour four spinors with colour index, 5d redblack
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
|
||||||
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
|
|
||||||
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
|
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
static const int Dimension = Nrepresentation;
|
|
||||||
const bool LsVectorised=true;
|
|
||||||
typedef _Coeff_t Coeff_t;
|
|
||||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
|
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
|
||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
|
static const int Dimension = Nrepresentation;
|
||||||
|
static const bool LsVectorised=true;
|
||||||
|
static const int Nhcs = Options::Nhcs;
|
||||||
|
|
||||||
|
typedef typename Options::_Coeff_t Coeff_t;
|
||||||
|
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||||
|
|
||||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
||||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
|
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
|
||||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
|
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
|
||||||
|
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
|
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
|
||||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
|
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
|
||||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
|
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
|
||||||
@ -231,10 +282,10 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
|||||||
typedef iImplSpinor<Simd> SiteSpinor;
|
typedef iImplSpinor<Simd> SiteSpinor;
|
||||||
typedef iImplPropagator<Simd> SitePropagator;
|
typedef iImplPropagator<Simd> SitePropagator;
|
||||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||||
|
typedef iImplHalfCommSpinor<SimdL> SiteHalfCommSpinor;
|
||||||
typedef Lattice<SiteSpinor> FermionField;
|
typedef Lattice<SiteSpinor> FermionField;
|
||||||
typedef Lattice<SitePropagator> PropagatorField;
|
typedef Lattice<SitePropagator> PropagatorField;
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////
|
/////////////////////////////////////////////////
|
||||||
// Make the doubled gauge field a *scalar*
|
// Make the doubled gauge field a *scalar*
|
||||||
/////////////////////////////////////////////////
|
/////////////////////////////////////////////////
|
||||||
@ -243,7 +294,7 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
|||||||
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
|
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
|
||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||||
|
|
||||||
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
|
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
|
||||||
typedef WilsonImplParams ImplParams;
|
typedef WilsonImplParams ImplParams;
|
||||||
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
|
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
|
||||||
|
|
||||||
@ -311,35 +362,37 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Flavour doubled spinors; is Gparity the only? what about C*?
|
// Flavour doubled spinors; is Gparity the only? what about C*?
|
||||||
////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class S, int Nrepresentation, class Options=CoeffReal>
|
||||||
template <class S, int Nrepresentation,class _Coeff_t = RealD>
|
|
||||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
|
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
static const int Dimension = Nrepresentation;
|
static const int Dimension = Nrepresentation;
|
||||||
|
static const int Nhcs = Options::Nhcs;
|
||||||
|
static const bool LsVectorised=false;
|
||||||
|
|
||||||
const bool LsVectorised=false;
|
|
||||||
|
|
||||||
typedef _Coeff_t Coeff_t;
|
|
||||||
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
|
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
|
||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
|
typedef typename Options::_Coeff_t Coeff_t;
|
||||||
|
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||||
|
|
||||||
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
|
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||||
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp >;
|
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||||
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
|
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
|
||||||
|
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
|
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
|
||||||
|
|
||||||
typedef iImplSpinor<Simd> SiteSpinor;
|
typedef iImplSpinor<Simd> SiteSpinor;
|
||||||
typedef iImplPropagator<Simd> SitePropagator;
|
typedef iImplPropagator<Simd> SitePropagator;
|
||||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||||
|
typedef iImplHalfCommSpinor<SimdL> SiteHalfCommSpinor;
|
||||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||||
|
|
||||||
typedef Lattice<SiteSpinor> FermionField;
|
typedef Lattice<SiteSpinor> FermionField;
|
||||||
typedef Lattice<SitePropagator> PropagatorField;
|
typedef Lattice<SitePropagator> PropagatorField;
|
||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||||
|
|
||||||
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
|
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
|
||||||
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
|
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
|
||||||
|
|
||||||
typedef GparityWilsonImplParams ImplParams;
|
typedef GparityWilsonImplParams ImplParams;
|
||||||
@ -475,7 +528,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
|
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
|
||||||
|
|
||||||
// DhopDir provides U or Uconj depending on coor/flavour.
|
// DhopDir provides U or Uconj depending on coor/flavour.
|
||||||
@ -508,23 +560,22 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
// Single flavour one component spinors with colour index
|
||||||
// Single flavour one component spinors with colour index
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
template <class S, class Representation = FundamentalRepresentation >
|
||||||
template <class S, class Representation = FundamentalRepresentation >
|
class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||||
class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef RealD _Coeff_t ;
|
typedef RealD _Coeff_t ;
|
||||||
static const int Dimension = Representation::Dimension;
|
static const int Dimension = Representation::Dimension;
|
||||||
|
static const bool LsVectorised=false;
|
||||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||||
|
|
||||||
//Necessary?
|
//Necessary?
|
||||||
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
||||||
|
|
||||||
const bool LsVectorised=false;
|
|
||||||
typedef _Coeff_t Coeff_t;
|
typedef _Coeff_t Coeff_t;
|
||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
@ -641,8 +692,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
// Single flavour one component spinors with colour index. 5d vec
|
// Single flavour one component spinors with colour index. 5d vec
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
@ -651,16 +700,14 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef RealD _Coeff_t ;
|
|
||||||
static const int Dimension = Representation::Dimension;
|
static const int Dimension = Representation::Dimension;
|
||||||
|
static const bool LsVectorised=true;
|
||||||
|
typedef RealD Coeff_t ;
|
||||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||||
|
|
||||||
//Necessary?
|
//Necessary?
|
||||||
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
||||||
|
|
||||||
const bool LsVectorised=true;
|
|
||||||
|
|
||||||
typedef _Coeff_t Coeff_t;
|
|
||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
@ -823,43 +870,61 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR; // Real.. whichever prec
|
||||||
|
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float
|
||||||
|
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double
|
||||||
|
|
||||||
|
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRH; // Real.. whichever prec
|
||||||
|
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
|
||||||
|
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDH; // Double
|
||||||
|
|
||||||
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
|
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
|
||||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
|
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
|
||||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
|
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
|
||||||
|
|
||||||
typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
|
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRH; // Real.. whichever prec
|
||||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
|
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
|
||||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
|
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDH; // Double
|
||||||
|
|
||||||
typedef WilsonImpl<vComplex, AdjointRepresentation > WilsonAdjImplR; // Real.. whichever prec
|
typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec
|
||||||
typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF; // Float
|
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float
|
||||||
typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD; // Double
|
typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD; // Double
|
||||||
|
|
||||||
typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR; // Real.. whichever prec
|
typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplR; // Real.. whichever prec
|
||||||
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF; // Float
|
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF; // Float
|
||||||
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD; // Double
|
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD; // Double
|
||||||
|
|
||||||
typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
|
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
|
||||||
typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
|
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
|
||||||
typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
|
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
|
||||||
|
|
||||||
typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
|
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRH; // Real.. whichever prec
|
||||||
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
|
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
|
||||||
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
|
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDH; // Double
|
||||||
|
|
||||||
typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec
|
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
|
||||||
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
|
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
|
||||||
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
|
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
|
||||||
|
|
||||||
typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec
|
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRH; // Real.. whichever prec
|
||||||
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float
|
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
|
||||||
typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD; // Double
|
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDH; // Double
|
||||||
|
|
||||||
typedef StaggeredVec5dImpl<vComplex, FundamentalRepresentation > StaggeredVec5dImplR; // Real.. whichever prec
|
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
|
||||||
typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF; // Float
|
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF; // Float
|
||||||
typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD; // Double
|
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD; // Double
|
||||||
|
|
||||||
|
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRH; // Real.. whichever prec
|
||||||
|
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH; // Float
|
||||||
|
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDH; // Double
|
||||||
|
|
||||||
|
typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec
|
||||||
|
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float
|
||||||
|
typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD; // Double
|
||||||
|
|
||||||
|
typedef StaggeredVec5dImpl<vComplex, FundamentalRepresentation > StaggeredVec5dImplR; // Real.. whichever prec
|
||||||
|
typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF; // Float
|
||||||
|
typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD; // Double
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
@ -33,200 +33,265 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class WilsonCompressor {
|
// optimised versions supporting half precision too
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
|
||||||
|
class WilsonCompressorTemplate;
|
||||||
|
|
||||||
|
|
||||||
|
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
|
||||||
|
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
|
||||||
|
typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
int mu;
|
|
||||||
int dag;
|
|
||||||
|
|
||||||
WilsonCompressor(int _dag){
|
int mu,dag;
|
||||||
mu=0;
|
|
||||||
dag=_dag;
|
|
||||||
assert((dag==0)||(dag==1));
|
|
||||||
}
|
|
||||||
void Point(int p) {
|
|
||||||
mu=p;
|
|
||||||
};
|
|
||||||
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
void Point(int p) { mu=p; };
|
||||||
SiteHalfSpinor ret;
|
|
||||||
int mudag=mu;
|
WilsonCompressorTemplate(int _dag=0){
|
||||||
if (!dag) {
|
dag = _dag;
|
||||||
mudag=(mu+Nd)%(2*Nd);
|
|
||||||
}
|
}
|
||||||
switch(mudag) {
|
|
||||||
case Xp:
|
typedef _Spinor SiteSpinor;
|
||||||
spProjXp(ret,in);
|
typedef _Hspinor SiteHalfSpinor;
|
||||||
break;
|
typedef _HCspinor SiteHalfCommSpinor;
|
||||||
case Yp:
|
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
|
||||||
spProjYp(ret,in);
|
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
|
||||||
break;
|
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
|
||||||
case Zp:
|
|
||||||
spProjZp(ret,in);
|
inline int CommDatumSize(void) {
|
||||||
break;
|
return sizeof(SiteHalfCommSpinor);
|
||||||
case Tp:
|
}
|
||||||
spProjTp(ret,in);
|
|
||||||
break;
|
/*****************************************************/
|
||||||
case Xm:
|
/* Compress includes precision change if mpi data is not same */
|
||||||
spProjXm(ret,in);
|
/*****************************************************/
|
||||||
break;
|
inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
|
||||||
case Ym:
|
projector::Proj(buf[o],in,mu,dag);
|
||||||
spProjYm(ret,in);
|
}
|
||||||
break;
|
|
||||||
case Zm:
|
/*****************************************************/
|
||||||
spProjZm(ret,in);
|
/* Exchange includes precision change if mpi data is not same */
|
||||||
break;
|
/*****************************************************/
|
||||||
case Tm:
|
inline void Exchange(SiteHalfSpinor *mp,
|
||||||
spProjTm(ret,in);
|
SiteHalfSpinor *vp0,
|
||||||
break;
|
SiteHalfSpinor *vp1,
|
||||||
default:
|
Integer type,Integer o){
|
||||||
|
exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************/
|
||||||
|
/* Have a decompression step if mpi data is not same */
|
||||||
|
/*****************************************************/
|
||||||
|
inline void Decompress(SiteHalfSpinor *out,
|
||||||
|
SiteHalfSpinor *in, Integer o) {
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////
|
/*****************************************************/
|
||||||
// optimised versions
|
/* Compress Exchange */
|
||||||
/////////////////////////
|
/*****************************************************/
|
||||||
|
inline void CompressExchange(SiteHalfSpinor *out0,
|
||||||
|
SiteHalfSpinor *out1,
|
||||||
|
const SiteSpinor *in,
|
||||||
|
Integer j,Integer k, Integer m,Integer type){
|
||||||
|
SiteHalfSpinor temp1, temp2,temp3,temp4;
|
||||||
|
projector::Proj(temp1,in[k],mu,dag);
|
||||||
|
projector::Proj(temp2,in[m],mu,dag);
|
||||||
|
exchange(out0[j],out1[j],temp1,temp2,type);
|
||||||
|
}
|
||||||
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
/*****************************************************/
|
||||||
class WilsonXpCompressor {
|
/* Pass the info to the stencil */
|
||||||
public:
|
/*****************************************************/
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
inline bool DecompressionStep(void) { return false; }
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjXp(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
|
||||||
class WilsonYpCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjYp(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
|
||||||
class WilsonZpCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjZp(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
|
||||||
class WilsonTpCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjTp(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
};
|
||||||
class WilsonXmCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjXm(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
|
||||||
class WilsonYmCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjYm(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
|
||||||
class WilsonZmCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjZm(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class SiteHalfSpinor,class SiteSpinor>
|
|
||||||
class WilsonTmCompressor {
|
|
||||||
public:
|
|
||||||
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
|
||||||
SiteHalfSpinor ret;
|
|
||||||
spProjTm(ret,in);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Fast comms buffer manipulation which should inline right through (avoid direction
|
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
|
||||||
// dependent logic that prevents inlining
|
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
|
||||||
template<class vobj,class cobj>
|
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
|
||||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
int mu,dag;
|
||||||
|
|
||||||
|
void Point(int p) { mu=p; };
|
||||||
|
|
||||||
|
WilsonCompressorTemplate(int _dag=0){
|
||||||
|
dag = _dag;
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef _Spinor SiteSpinor;
|
||||||
|
typedef _Hspinor SiteHalfSpinor;
|
||||||
|
typedef _HCspinor SiteHalfCommSpinor;
|
||||||
|
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
|
||||||
|
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
|
||||||
|
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
|
||||||
|
|
||||||
|
inline int CommDatumSize(void) {
|
||||||
|
return sizeof(SiteHalfCommSpinor);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************/
|
||||||
|
/* Compress includes precision change if mpi data is not same */
|
||||||
|
/*****************************************************/
|
||||||
|
inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor hsp;
|
||||||
|
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
|
||||||
|
projector::Proj(hsp,in,mu,dag);
|
||||||
|
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************/
|
||||||
|
/* Exchange includes precision change if mpi data is not same */
|
||||||
|
/*****************************************************/
|
||||||
|
inline void Exchange(SiteHalfSpinor *mp,
|
||||||
|
SiteHalfSpinor *vp0,
|
||||||
|
SiteHalfSpinor *vp1,
|
||||||
|
Integer type,Integer o){
|
||||||
|
SiteHalfSpinor vt0,vt1;
|
||||||
|
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
|
||||||
|
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
|
||||||
|
precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
|
||||||
|
precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
|
||||||
|
exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************/
|
||||||
|
/* Have a decompression step if mpi data is not same */
|
||||||
|
/*****************************************************/
|
||||||
|
inline void Decompress(SiteHalfSpinor *out,
|
||||||
|
SiteHalfSpinor *in, Integer o){
|
||||||
|
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
|
||||||
|
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************/
|
||||||
|
/* Compress Exchange */
|
||||||
|
/*****************************************************/
|
||||||
|
inline void CompressExchange(SiteHalfSpinor *out0,
|
||||||
|
SiteHalfSpinor *out1,
|
||||||
|
const SiteSpinor *in,
|
||||||
|
Integer j,Integer k, Integer m,Integer type){
|
||||||
|
SiteHalfSpinor temp1, temp2,temp3,temp4;
|
||||||
|
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
|
||||||
|
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
|
||||||
|
projector::Proj(temp1,in[k],mu,dag);
|
||||||
|
projector::Proj(temp2,in[m],mu,dag);
|
||||||
|
exchange(temp3,temp4,temp1,temp2,type);
|
||||||
|
precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
|
||||||
|
precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************/
|
||||||
|
/* Pass the info to the stencil */
|
||||||
|
/*****************************************************/
|
||||||
|
inline bool DecompressionStep(void) { return true; }
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#define DECLARE_PROJ(Projector,Compressor,spProj) \
|
||||||
|
class Projector { \
|
||||||
|
public: \
|
||||||
|
template<class hsp,class fsp> \
|
||||||
|
static void Proj(hsp &result,const fsp &in,int mu,int dag){ \
|
||||||
|
spProj(result,in); \
|
||||||
|
} \
|
||||||
|
}; \
|
||||||
|
template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;
|
||||||
|
|
||||||
|
DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp);
|
||||||
|
DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp);
|
||||||
|
DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp);
|
||||||
|
DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp);
|
||||||
|
DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm);
|
||||||
|
DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm);
|
||||||
|
DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm);
|
||||||
|
DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm);
|
||||||
|
|
||||||
|
class WilsonProjector {
|
||||||
|
public:
|
||||||
|
template<class hsp,class fsp>
|
||||||
|
static void Proj(hsp &result,const fsp &in,int mu,int dag){
|
||||||
|
int mudag=dag? mu : (mu+Nd)%(2*Nd);
|
||||||
|
switch(mudag) {
|
||||||
|
case Xp: spProjXp(result,in); break;
|
||||||
|
case Yp: spProjYp(result,in); break;
|
||||||
|
case Zp: spProjZp(result,in); break;
|
||||||
|
case Tp: spProjTp(result,in); break;
|
||||||
|
case Xm: spProjXm(result,in); break;
|
||||||
|
case Ym: spProjYm(result,in); break;
|
||||||
|
case Zm: spProjZm(result,in); break;
|
||||||
|
case Tm: spProjTm(result,in); break;
|
||||||
|
default: assert(0); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCompressorTemplate<HCS,HS,S,WilsonProjector>;
|
||||||
|
|
||||||
|
// Fast comms buffer manipulation which should inline right through (avoid direction
|
||||||
|
// dependent logic that prevents inlining
|
||||||
|
template<class vobj,class cobj>
|
||||||
|
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
||||||
|
public:
|
||||||
|
|
||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||||
|
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances) : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
|
const std::vector<int> &distances)
|
||||||
{ };
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
|
||||||
|
{ /*Do nothing*/ };
|
||||||
|
|
||||||
template < class compressor>
|
template < class compressor>
|
||||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||||
HaloExchangeOptGather(source,compress);
|
this->HaloExchangeOptGather(source,compress);
|
||||||
this->CommunicateBegin(reqs);
|
this->CommunicateBegin(reqs);
|
||||||
this->calls++;
|
|
||||||
this->CommunicateComplete(reqs);
|
this->CommunicateComplete(reqs);
|
||||||
this->CommsMerge();
|
this->CommsMerge(compress);
|
||||||
}
|
}
|
||||||
|
|
||||||
template < class compressor>
|
template <class compressor>
|
||||||
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
this->calls++;
|
this->Prepare();
|
||||||
this->Mergers.resize(0);
|
|
||||||
this->Packets.resize(0);
|
|
||||||
this->HaloGatherOpt(source,compress);
|
this->HaloGatherOpt(source,compress);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class compressor>
|
||||||
template < class compressor>
|
|
||||||
void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
|
void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
|
// Strategy. Inherit types from Compressor.
|
||||||
|
// Use types to select the write direction by directon compressor
|
||||||
|
typedef typename compressor::SiteSpinor SiteSpinor;
|
||||||
|
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
|
||||||
|
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
|
||||||
|
|
||||||
this->_grid->StencilBarrier();
|
this->_grid->StencilBarrier();
|
||||||
// conformable(source._grid,_grid);
|
|
||||||
assert(source._grid==this->_grid);
|
assert(source._grid==this->_grid);
|
||||||
this->halogtime-=usecond();
|
this->halogtime-=usecond();
|
||||||
|
|
||||||
this->u_comm_offset=0;
|
this->u_comm_offset=0;
|
||||||
|
|
||||||
|
WilsonXpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XpCompress;
|
||||||
|
WilsonYpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YpCompress;
|
||||||
|
WilsonZpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZpCompress;
|
||||||
|
WilsonTpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TpCompress;
|
||||||
|
WilsonXmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XmCompress;
|
||||||
|
WilsonYmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YmCompress;
|
||||||
|
WilsonZmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZmCompress;
|
||||||
|
WilsonTmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TmCompress;
|
||||||
|
|
||||||
int dag = compress.dag;
|
int dag = compress.dag;
|
||||||
|
|
||||||
WilsonXpCompressor<cobj,vobj> XpCompress;
|
|
||||||
WilsonYpCompressor<cobj,vobj> YpCompress;
|
|
||||||
WilsonZpCompressor<cobj,vobj> ZpCompress;
|
|
||||||
WilsonTpCompressor<cobj,vobj> TpCompress;
|
|
||||||
WilsonXmCompressor<cobj,vobj> XmCompress;
|
|
||||||
WilsonYmCompressor<cobj,vobj> YmCompress;
|
|
||||||
WilsonZmCompressor<cobj,vobj> ZmCompress;
|
|
||||||
WilsonTmCompressor<cobj,vobj> TmCompress;
|
|
||||||
|
|
||||||
// Gather all comms buffers
|
|
||||||
// for(int point = 0 ; point < _npoints; point++) {
|
|
||||||
// compress.Point(point);
|
|
||||||
// HaloGatherDir(source,compress,point,face_idx);
|
|
||||||
// }
|
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
// std::cout << " Optimised Dagger compress " <<std::endl;
|
// std::cout << " Optimised Dagger compress " <<std::endl;
|
||||||
@ -255,6 +320,5 @@ namespace QCD {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
}} // namespace close
|
}} // namespace close
|
||||||
#endif
|
#endif
|
||||||
|
@ -439,7 +439,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
}
|
}
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMerge();
|
st.CommsMerge(compressor);
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
|
@ -74,6 +74,8 @@ struct RealPart<std::complex<T> > {
|
|||||||
typedef T type;
|
typedef T type;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#include <type_traits>
|
||||||
|
|
||||||
//////////////////////////////////////
|
//////////////////////////////////////
|
||||||
// demote a vector to real type
|
// demote a vector to real type
|
||||||
//////////////////////////////////////
|
//////////////////////////////////////
|
||||||
@ -91,12 +93,12 @@ template <> struct is_complex<std::complex<float> > : public std::true_type {};
|
|||||||
template <typename T> using IfReal = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
|
template <typename T> using IfReal = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
|
||||||
template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
|
template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
|
||||||
template <typename T> using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
|
template <typename T> using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
|
||||||
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<is_same<T1,T2>::value, int> >;
|
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;
|
||||||
|
|
||||||
template <typename T> using IfNotReal = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
|
template <typename T> using IfNotReal = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
|
||||||
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
|
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
|
||||||
template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
|
template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
|
||||||
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!is_same<T1,T2>::value, int> >;
|
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Define the operation templates functors
|
// Define the operation templates functors
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
#define GRID_STENCIL_H
|
#define GRID_STENCIL_H
|
||||||
|
|
||||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||||
#define NEW_XYZT_GATHER
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Must not lose sight that goal is to be able to construct really efficient
|
// Must not lose sight that goal is to be able to construct really efficient
|
||||||
// gather to a point stencil code. CSHIFT is not the best way, so need
|
// gather to a point stencil code. CSHIFT is not the best way, so need
|
||||||
@ -82,7 +82,7 @@ void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const La
|
|||||||
{
|
{
|
||||||
int num=table.size();
|
int num=table.size();
|
||||||
parallel_for(int i=0;i<num;i++){
|
parallel_for(int i=0;i<num;i++){
|
||||||
vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
|
compress.Compress(&buffer[off],table[i].first,rhs._odata[so+table[i].second]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,14 +102,8 @@ void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const L
|
|||||||
int num=table.size()/2;
|
int num=table.size()/2;
|
||||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
parallel_for(int j=0;j<num;j++){
|
parallel_for(int j=0;j<num;j++){
|
||||||
// buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
|
compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs._odata[0],
|
||||||
cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
|
j,so+table[2*j].second,so+table[2*j+1].second,type);
|
||||||
cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
|
|
||||||
cobj temp3;
|
|
||||||
cobj temp4;
|
|
||||||
exchange(temp3,temp4,temp1,temp2,type);
|
|
||||||
vstream(pointers[0][j],temp3);
|
|
||||||
vstream(pointers[1][j],temp4);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -121,8 +115,19 @@ struct StencilEntry {
|
|||||||
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
||||||
};
|
};
|
||||||
|
|
||||||
//extern int dump;
|
//////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//Lattice object type, compressed object type.
|
||||||
|
//
|
||||||
|
//These only need to be known outside of halo exchange subset of methods
|
||||||
|
//because byte offsets are precomputed
|
||||||
|
//
|
||||||
|
//It might help/be cleaner to add a "mobj" for the mpi transferred object to this list
|
||||||
|
//for the on the wire representation.
|
||||||
|
//
|
||||||
|
//This would clean up the "casts" in the WilsonCompressor.h file
|
||||||
|
//
|
||||||
|
//Other compressors (SimpleCompressor) retains mobj == cobj so no issue
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj,class cobj>
|
template<class vobj,class cobj>
|
||||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
public:
|
public:
|
||||||
@ -133,6 +138,60 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
typedef typename cobj::scalar_type scalar_type;
|
typedef typename cobj::scalar_type scalar_type;
|
||||||
typedef typename cobj::scalar_object scalar_object;
|
typedef typename cobj::scalar_object scalar_object;
|
||||||
|
|
||||||
|
/////////////////////////////////////////
|
||||||
|
// Timing info; ugly; possibly temporary
|
||||||
|
/////////////////////////////////////////
|
||||||
|
double commtime;
|
||||||
|
double gathertime;
|
||||||
|
double gathermtime;
|
||||||
|
double halogtime;
|
||||||
|
double mergetime;
|
||||||
|
double decompresstime;
|
||||||
|
double comms_bytes;
|
||||||
|
double splicetime;
|
||||||
|
double nosplicetime;
|
||||||
|
double calls;
|
||||||
|
|
||||||
|
void ZeroCounters(void) {
|
||||||
|
gathertime = 0.;
|
||||||
|
commtime = 0.;
|
||||||
|
halogtime = 0.;
|
||||||
|
mergetime = 0.;
|
||||||
|
decompresstime = 0.;
|
||||||
|
gathermtime = 0.;
|
||||||
|
splicetime = 0.;
|
||||||
|
nosplicetime = 0.;
|
||||||
|
comms_bytes = 0.;
|
||||||
|
calls = 0.;
|
||||||
|
};
|
||||||
|
|
||||||
|
void Report(void) {
|
||||||
|
#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
|
||||||
|
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
||||||
|
RealD NP = _grid->_Nprocessors;
|
||||||
|
RealD NN = _grid->NodeCount();
|
||||||
|
|
||||||
|
_grid->GlobalSum(commtime); commtime/=NP;
|
||||||
|
if ( calls > 0. ) {
|
||||||
|
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
|
||||||
|
PRINTIT(halogtime);
|
||||||
|
PRINTIT(gathertime);
|
||||||
|
PRINTIT(gathermtime);
|
||||||
|
PRINTIT(mergetime);
|
||||||
|
PRINTIT(decompresstime);
|
||||||
|
if(comms_bytes>1.0){
|
||||||
|
PRINTIT(comms_bytes);
|
||||||
|
PRINTIT(commtime);
|
||||||
|
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||||
|
}
|
||||||
|
PRINTIT(splicetime);
|
||||||
|
PRINTIT(nosplicetime);
|
||||||
|
}
|
||||||
|
#undef PRINTIT
|
||||||
|
#undef AVERAGE
|
||||||
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// Comms packet queue for asynch thread
|
// Comms packet queue for asynch thread
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
@ -182,28 +241,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
}
|
}
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
commtime+=usecond();
|
commtime+=usecond();
|
||||||
/*
|
|
||||||
int dump=1;
|
|
||||||
if(dump){
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
|
||||||
cobj * ptr = (cobj *) Packets[i].recv_buf;
|
|
||||||
uint64_t num=Packets[i].bytes/sizeof(cobj);
|
|
||||||
std::cout << " CommunicateComplete " << i<< " / " << Packets.size()<< " num " << num <<std::endl;
|
|
||||||
std::stringstream ss;
|
|
||||||
ss<<"recvbuf";
|
|
||||||
for(int d=0;d<_grid->_ndimension;d++){
|
|
||||||
ss<<"."<<_grid->_processor_coor[d];
|
|
||||||
}
|
|
||||||
ss<<"_mu_"<<i;
|
|
||||||
std::string fname(ss.str());
|
|
||||||
std::ofstream fout(fname);
|
|
||||||
for(int k=0;k<num;k++) {
|
|
||||||
fout << i<<" "<<k<<" "<<ptr[k]<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
dump =0;
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
@ -214,66 +251,64 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
std::vector<scalar_object *> rpointers;
|
std::vector<scalar_object *> rpointers;
|
||||||
std::vector<cobj *> vpointers;
|
std::vector<cobj *> vpointers;
|
||||||
Integer buffer_size;
|
Integer buffer_size;
|
||||||
Integer packet_id;
|
|
||||||
Integer exchange;
|
|
||||||
Integer type;
|
Integer type;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<Merge> Mergers;
|
std::vector<Merge> Mergers;
|
||||||
|
|
||||||
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
|
struct Decompress {
|
||||||
Merge m;
|
cobj * kernel_p;
|
||||||
m.exchange = 0;
|
cobj * mpi_p;
|
||||||
m.mpointer = merge_p;
|
Integer buffer_size;
|
||||||
m.rpointers= rpointers;
|
};
|
||||||
m.buffer_size = buffer_size;
|
|
||||||
m.packet_id = packet_id;
|
void Prepare(void)
|
||||||
Mergers.push_back(m);
|
{
|
||||||
|
Decompressions.resize(0);
|
||||||
|
Mergers.resize(0);
|
||||||
|
Packets.resize(0);
|
||||||
|
calls++;
|
||||||
|
}
|
||||||
|
std::vector<Decompress> Decompressions;
|
||||||
|
|
||||||
|
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size) {
|
||||||
|
Decompress d;
|
||||||
|
d.kernel_p = k_p;
|
||||||
|
d.mpi_p = m_p;
|
||||||
|
d.buffer_size = buffer_size;
|
||||||
|
Decompressions.push_back(d);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
|
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type) {
|
||||||
Merge m;
|
Merge m;
|
||||||
m.exchange = 1;
|
|
||||||
m.type = type;
|
m.type = type;
|
||||||
m.mpointer = merge_p;
|
m.mpointer = merge_p;
|
||||||
m.vpointers= rpointers;
|
m.vpointers= rpointers;
|
||||||
m.buffer_size = buffer_size;
|
m.buffer_size = buffer_size;
|
||||||
m.packet_id = packet_id;
|
|
||||||
Mergers.push_back(m);
|
Mergers.push_back(m);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommsMerge(void ) {
|
template<class decompressor>
|
||||||
|
void CommsMerge(decompressor decompress) {
|
||||||
|
|
||||||
|
// Also do a precision convert possibly on a receive buffer
|
||||||
for(int i=0;i<Mergers.size();i++){
|
for(int i=0;i<Mergers.size();i++){
|
||||||
mergetime-=usecond();
|
mergetime-=usecond();
|
||||||
|
|
||||||
// std::cout << "Merge " <<i << std::endl;
|
|
||||||
// std::stringstream ss;
|
|
||||||
// ss<<"mergebuf";
|
|
||||||
// for(int d=0;d<_grid->_ndimension;d++){
|
|
||||||
// ss<<"."<<_grid->_processor_coor[d];
|
|
||||||
// }
|
|
||||||
// ss<<"_m_"<<i;
|
|
||||||
// std::string fname(ss.str());
|
|
||||||
// std::ofstream fout(fname);
|
|
||||||
|
|
||||||
if ( Mergers[i].exchange == 0 ) {
|
|
||||||
parallel_for(int o=0;o<Mergers[i].buffer_size;o++){
|
|
||||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
|
||||||
// fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
|
parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
|
||||||
exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
|
decompress.Exchange(Mergers[i].mpointer,
|
||||||
Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
|
Mergers[i].vpointers[0],
|
||||||
// cobj temp1,temp2;
|
Mergers[i].vpointers[1],
|
||||||
// exchange(temp1,temp2,Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
|
Mergers[i].type,o);
|
||||||
// vstream(Mergers[i].mpointer[2*o],temp1);
|
|
||||||
// vstream(Mergers[i].mpointer[2*o+1],temp2);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
}
|
}
|
||||||
|
for(int i=0;i<Decompressions.size();i++){
|
||||||
|
decompresstime-=usecond();
|
||||||
|
parallel_for(int o=0;o<Decompressions[i].buffer_size;o++){
|
||||||
|
decompress.Decompress(Decompressions[i].kernel_p,Decompressions[i].mpi_p,o);
|
||||||
|
}
|
||||||
|
decompresstime+=usecond();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
@ -304,7 +339,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
|
inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
|
||||||
|
ptype = _permute_type[point]; return & _entries[point+_npoints*osite];
|
||||||
|
}
|
||||||
|
|
||||||
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
||||||
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
||||||
local = _entries[ent]._is_local;
|
local = _entries[ent]._is_local;
|
||||||
@ -316,6 +354,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
return cbase + _entries[ent]._byte_offset;
|
return cbase + _entries[ent]._byte_offset;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline uint64_t GetPFInfo(int ent,uint64_t base) {
|
inline uint64_t GetPFInfo(int ent,uint64_t base) {
|
||||||
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
||||||
int local = _entries[ent]._is_local;
|
int local = _entries[ent]._is_local;
|
||||||
@ -327,89 +366,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
// Unified Comms buffers for all directions
|
// Unified Comms buffers for all directions
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
// Vectors that live on the symmetric heap in case of SHMEM
|
// Vectors that live on the symmetric heap in case of SHMEM
|
||||||
// std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
|
|
||||||
// std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
|
|
||||||
// commVector<cobj> u_send_buf_hide;
|
|
||||||
// commVector<cobj> u_recv_buf_hide;
|
|
||||||
|
|
||||||
// These are used; either SHM objects or refs to the above symmetric heap vectors
|
// These are used; either SHM objects or refs to the above symmetric heap vectors
|
||||||
// depending on comms target
|
// depending on comms target
|
||||||
cobj* u_recv_buf_p;
|
cobj* u_recv_buf_p;
|
||||||
cobj* u_send_buf_p;
|
cobj* u_send_buf_p;
|
||||||
std::vector<cobj *> new_simd_send_buf;
|
std::vector<cobj *> u_simd_send_buf;
|
||||||
std::vector<cobj *> new_simd_recv_buf;
|
std::vector<cobj *> u_simd_recv_buf;
|
||||||
std::vector<scalar_object *> u_simd_send_buf;
|
|
||||||
std::vector<scalar_object *> u_simd_recv_buf;
|
|
||||||
|
|
||||||
int u_comm_offset;
|
int u_comm_offset;
|
||||||
int _unified_buffer_size;
|
int _unified_buffer_size;
|
||||||
|
|
||||||
cobj *CommBuf(void) { return u_recv_buf_p; }
|
cobj *CommBuf(void) { return u_recv_buf_p; }
|
||||||
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// Timing info; ugly; possibly temporary
|
|
||||||
/////////////////////////////////////////
|
|
||||||
#define TIMING_HACK
|
|
||||||
#ifdef TIMING_HACK
|
|
||||||
double jointime;
|
|
||||||
double gathertime;
|
|
||||||
double commtime;
|
|
||||||
double halogtime;
|
|
||||||
double mergetime;
|
|
||||||
double spintime;
|
|
||||||
double comms_bytes;
|
|
||||||
double gathermtime;
|
|
||||||
double splicetime;
|
|
||||||
double nosplicetime;
|
|
||||||
double t_data;
|
|
||||||
double t_table;
|
|
||||||
double calls;
|
|
||||||
|
|
||||||
void ZeroCounters(void) {
|
|
||||||
gathertime = 0.;
|
|
||||||
jointime = 0.;
|
|
||||||
commtime = 0.;
|
|
||||||
halogtime = 0.;
|
|
||||||
mergetime = 0.;
|
|
||||||
spintime = 0.;
|
|
||||||
gathermtime = 0.;
|
|
||||||
splicetime = 0.;
|
|
||||||
nosplicetime = 0.;
|
|
||||||
t_data = 0.0;
|
|
||||||
t_table= 0.0;
|
|
||||||
comms_bytes = 0.;
|
|
||||||
calls = 0.;
|
|
||||||
};
|
|
||||||
|
|
||||||
void Report(void) {
|
|
||||||
#define PRINTIT(A) \
|
|
||||||
std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
|
||||||
|
|
||||||
RealD NP = _grid->_Nprocessors;
|
|
||||||
RealD NN = _grid->NodeCount();
|
|
||||||
|
|
||||||
_grid->GlobalSum(commtime); commtime/=NP;
|
|
||||||
if ( calls > 0. ) {
|
|
||||||
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
|
|
||||||
PRINTIT(halogtime);
|
|
||||||
PRINTIT(gathertime);
|
|
||||||
PRINTIT(gathermtime);
|
|
||||||
PRINTIT(mergetime);
|
|
||||||
if(comms_bytes>1.0){
|
|
||||||
PRINTIT(comms_bytes);
|
|
||||||
PRINTIT(commtime);
|
|
||||||
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
|
|
||||||
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
|
||||||
}
|
|
||||||
PRINTIT(jointime);
|
|
||||||
PRINTIT(spintime);
|
|
||||||
PRINTIT(splicetime);
|
|
||||||
PRINTIT(nosplicetime);
|
|
||||||
PRINTIT(t_table);
|
|
||||||
PRINTIT(t_data);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
CartesianStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
@ -493,21 +461,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
|
|
||||||
u_simd_send_buf.resize(Nsimd);
|
u_simd_send_buf.resize(Nsimd);
|
||||||
u_simd_recv_buf.resize(Nsimd);
|
u_simd_recv_buf.resize(Nsimd);
|
||||||
new_simd_send_buf.resize(Nsimd);
|
|
||||||
new_simd_recv_buf.resize(Nsimd);
|
|
||||||
u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
#ifdef NEW_XYZT_GATHER
|
|
||||||
for(int l=0;l<2;l++){
|
for(int l=0;l<2;l++){
|
||||||
new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
for(int l=0;l<Nsimd;l++){
|
|
||||||
u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
|
|
||||||
u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
PrecomputeByteOffsets();
|
PrecomputeByteOffsets();
|
||||||
}
|
}
|
||||||
@ -611,7 +571,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
offnode = (comm_proc!= 0);
|
offnode = (comm_proc!= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int wraparound=0;
|
int wraparound=0;
|
||||||
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
||||||
wraparound = 1;
|
wraparound = 1;
|
||||||
@ -738,13 +697,11 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||||
calls++;
|
Prepare();
|
||||||
Mergers.resize(0);
|
|
||||||
Packets.resize(0);
|
|
||||||
HaloGather(source,compress);
|
HaloGather(source,compress);
|
||||||
this->CommunicateBegin(reqs);
|
CommunicateBegin(reqs);
|
||||||
this->CommunicateComplete(reqs);
|
CommunicateComplete(reqs);
|
||||||
CommsMerge();
|
CommsMerge(compress);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||||
@ -775,13 +732,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
if (splice_dim) {
|
if (splice_dim) {
|
||||||
splicetime-=usecond();
|
splicetime-=usecond();
|
||||||
// GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
|
||||||
// std::cout << "GatherSimdNew"<<std::endl;
|
|
||||||
#ifdef NEW_XYZT_GATHER
|
|
||||||
GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
|
|
||||||
#else
|
|
||||||
GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
||||||
#endif
|
|
||||||
splicetime+=usecond();
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
nosplicetime-=usecond();
|
nosplicetime-=usecond();
|
||||||
@ -791,14 +742,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
} else {
|
} else {
|
||||||
if(splice_dim){
|
if(splice_dim){
|
||||||
splicetime-=usecond();
|
splicetime-=usecond();
|
||||||
// std::cout << "GatherSimdNew2calls"<<std::endl;
|
|
||||||
#ifdef NEW_XYZT_GATHER
|
|
||||||
GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
|
|
||||||
GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
|
|
||||||
#else
|
|
||||||
GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
|
GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
|
||||||
GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
|
GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
|
||||||
#endif
|
|
||||||
splicetime+=usecond();
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
nosplicetime-=usecond();
|
nosplicetime-=usecond();
|
||||||
@ -868,17 +813,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
if (cbmask != 0x3) words=words>>1;
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
int bytes = words * sizeof(cobj);
|
int bytes = words * compress.CommDatumSize();
|
||||||
|
|
||||||
gathertime-=usecond();
|
|
||||||
int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
if ( !face_table_computed ) {
|
if ( !face_table_computed ) {
|
||||||
t_table-=usecond();
|
|
||||||
face_table.resize(face_idx+1);
|
face_table.resize(face_idx+1);
|
||||||
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
|
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
|
||||||
// std::cout << " face table size "<<face_idx <<" " << face_table[face_idx].size() <<" computed buffer size "<< words <<
|
|
||||||
// " bytes = " << bytes <<std::endl;
|
|
||||||
t_table+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int rank = _grid->_processor;
|
int rank = _grid->_processor;
|
||||||
@ -897,18 +837,33 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
send_buf = u_send_buf_p;
|
send_buf = u_send_buf_p;
|
||||||
}
|
}
|
||||||
|
|
||||||
t_data-=usecond();
|
cobj *recv_buf;
|
||||||
|
gathertime-=usecond();
|
||||||
assert(send_buf!=NULL);
|
assert(send_buf!=NULL);
|
||||||
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
|
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
|
||||||
t_data+=usecond();
|
gathertime+=usecond();
|
||||||
|
|
||||||
|
if ( compress.DecompressionStep() ) {
|
||||||
|
|
||||||
|
recv_buf = &u_simd_recv_buf[0][u_comm_offset];
|
||||||
|
|
||||||
|
AddDecompress(&u_recv_buf_p[u_comm_offset],
|
||||||
|
&recv_buf[u_comm_offset],
|
||||||
|
words);
|
||||||
|
|
||||||
|
AddPacket((void *)&send_buf[u_comm_offset],
|
||||||
|
(void *)&recv_buf[u_comm_offset],
|
||||||
|
xmit_to_rank,
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
|
||||||
|
} else {
|
||||||
AddPacket((void *)&send_buf[u_comm_offset],
|
AddPacket((void *)&send_buf[u_comm_offset],
|
||||||
(void *)&u_recv_buf_p[u_comm_offset],
|
(void *)&u_recv_buf_p[u_comm_offset],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
}
|
||||||
gathertime+=usecond();
|
|
||||||
u_comm_offset+=words;
|
u_comm_offset+=words;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -919,123 +874,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
{
|
{
|
||||||
const int Nsimd = _grid->Nsimd();
|
const int Nsimd = _grid->Nsimd();
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
|
||||||
int rd = _grid->_rdimensions[dimension];
|
|
||||||
int ld = _grid->_ldimensions[dimension];
|
|
||||||
int pd = _grid->_processors[dimension];
|
|
||||||
int simd_layout = _grid->_simd_layout[dimension];
|
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
|
||||||
|
|
||||||
assert(comm_dim==1);
|
|
||||||
// This will not work with a rotate dim
|
|
||||||
assert(simd_layout==2);
|
|
||||||
assert(shift>=0);
|
|
||||||
assert(shift<fd);
|
|
||||||
|
|
||||||
int permute_type=_grid->PermuteType(dimension);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
// Simd direction uses an extract/merge pair
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
|
||||||
int words = sizeof(cobj)/sizeof(vector_type);
|
|
||||||
|
|
||||||
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
|
||||||
|
|
||||||
std::vector<scalar_object *> rpointers(Nsimd);
|
|
||||||
std::vector<scalar_object *> spointers(Nsimd);
|
|
||||||
|
|
||||||
// std::cout << "GatherSimd " << dimension << " shift "<<shift<<std::endl;
|
|
||||||
|
|
||||||
///////////////////////////////////////////
|
|
||||||
// Work out what to send where
|
|
||||||
///////////////////////////////////////////
|
|
||||||
|
|
||||||
int cb = (cbmask==0x2)? Odd : Even;
|
|
||||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
|
||||||
|
|
||||||
// loop over outer coord planes orthog to dim
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
|
||||||
|
|
||||||
if ( any_offnode ) {
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
spointers[i] = &u_simd_send_buf[i][u_comm_offset];
|
|
||||||
}
|
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
|
|
||||||
gathermtime-=usecond();
|
|
||||||
Gather_plane_extract<cobj>(rhs,spointers,dimension,sx,cbmask,compress);
|
|
||||||
gathermtime+=usecond();
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
|
|
||||||
// FIXME - This logic is hard coded to simd_layout==2 and not allowing >2
|
|
||||||
// for(int w=0;w<buffer_size;w++){
|
|
||||||
// std::cout << "GatherSimd<"<<Nsimd<<"> : lane " << i <<" elem "<<w<<" "<< u_simd_send_buf[i ][u_comm_offset+w]<<std::endl;
|
|
||||||
// }
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
|
||||||
int ic= (i&inner_bit)? 1:0;
|
|
||||||
|
|
||||||
int my_coor = rd*ic + x;
|
|
||||||
int nbr_coor = my_coor+sshift;
|
|
||||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
|
||||||
int nbr_lcoor= (nbr_coor%ld);
|
|
||||||
int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer
|
|
||||||
int nbr_ox = (nbr_lcoor%rd); // outer coord of peer
|
|
||||||
int nbr_lane = (i&(~inner_bit));
|
|
||||||
|
|
||||||
if (nbr_ic) nbr_lane|=inner_bit;
|
|
||||||
assert (sx == nbr_ox);
|
|
||||||
|
|
||||||
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
|
||||||
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
|
|
||||||
|
|
||||||
if(nbr_proc){
|
|
||||||
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
|
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
|
|
||||||
// shm == receive pointer if offnode
|
|
||||||
// shm == Translate[send pointer] if on node -- my view of his send pointer
|
|
||||||
scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
|
||||||
if (shm==NULL) {
|
|
||||||
shm = rp;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
|
|
||||||
// assuming above pointer flip
|
|
||||||
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
|
|
||||||
|
|
||||||
rpointers[i] = shm;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
rpointers[i] = sp;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
|
|
||||||
|
|
||||||
u_comm_offset +=buffer_size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class compressor>
|
|
||||||
void GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
|
|
||||||
{
|
|
||||||
const int Nsimd = _grid->Nsimd();
|
|
||||||
|
|
||||||
const int maxl =2;// max layout in a direction
|
const int maxl =2;// max layout in a direction
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -1085,25 +923,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
|
|
||||||
|
|
||||||
for(int i=0;i<maxl;i++){
|
for(int i=0;i<maxl;i++){
|
||||||
spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
|
spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
// if ( cbmask==0x3 ) {
|
|
||||||
// std::vector<std::pair<int,int> > table;
|
|
||||||
t_table-=usecond();
|
|
||||||
if ( !face_table_computed ) {
|
if ( !face_table_computed ) {
|
||||||
face_table.resize(face_idx+1);
|
face_table.resize(face_idx+1);
|
||||||
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
|
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
|
||||||
// std::cout << " face table size "<<face_idx <<" " << face_table[face_idx].size() <<" computed buffer size "<< reduced_buffer_size <<
|
|
||||||
// " bytes = "<<bytes <<std::endl;
|
|
||||||
}
|
}
|
||||||
t_table+=usecond();
|
|
||||||
gathermtime-=usecond();
|
gathermtime-=usecond();
|
||||||
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++;
|
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++;
|
||||||
gathermtime+=usecond();
|
gathermtime+=usecond();
|
||||||
|
|
||||||
//spointers[0] -- low
|
//spointers[0] -- low
|
||||||
//spointers[1] -- high
|
//spointers[1] -- high
|
||||||
|
|
||||||
@ -1120,8 +951,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
int nbr_plane = nbr_ic;
|
int nbr_plane = nbr_ic;
|
||||||
assert (sx == nbr_ox);
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
auto rp = &new_simd_recv_buf[i ][u_comm_offset];
|
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
||||||
auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
|
auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset];
|
||||||
|
|
||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
|
|
||||||
@ -1150,7 +981,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,Packets.size()-1,permute_type);
|
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type);
|
||||||
|
|
||||||
u_comm_offset +=buffer_size;
|
u_comm_offset +=buffer_size;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user