mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Massive compressor rework to support reduced precision comms
This commit is contained in:
		@@ -34,8 +34,20 @@ template<class vobj>
 | 
			
		||||
class SimpleCompressor {
 | 
			
		||||
public:
 | 
			
		||||
  void Point(int) {};
 | 
			
		||||
 | 
			
		||||
  vobj operator() (const vobj &arg) {
 | 
			
		||||
  inline int  CommDatumSize(void) { return sizeof(vobj); }
 | 
			
		||||
  inline bool DecompressionStep(void) { return false; }
 | 
			
		||||
  inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; }
 | 
			
		||||
  inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){
 | 
			
		||||
    exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
 | 
			
		||||
  }
 | 
			
		||||
  inline void Decompress(vobj *out,vobj *in, int o){ assert(0); }
 | 
			
		||||
  inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in,
 | 
			
		||||
			       int j,int k, int m,int type){
 | 
			
		||||
    exchange(out0[j],out1[j],in[k],in[m],type);
 | 
			
		||||
  }
 | 
			
		||||
  // For cshift. Cshift should drop compressor coupling altogether 
 | 
			
		||||
  // because I had to decouple the code from the Stencil anyway
 | 
			
		||||
  inline vobj operator() (const vobj &arg) {
 | 
			
		||||
    return arg;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
@@ -35,7 +35,6 @@ directory
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Template parameter class constructs to package
 | 
			
		||||
  // externally control Fermion implementations
 | 
			
		||||
@@ -90,6 +89,52 @@ namespace QCD {
 | 
			
		||||
  //  }
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  template <class T> struct SamePrecisionMapper {
 | 
			
		||||
    typedef T HigherPrecVector ;
 | 
			
		||||
    typedef T LowerPrecVector ;
 | 
			
		||||
  };
 | 
			
		||||
  template <class T> struct LowerPrecisionMapper {  };
 | 
			
		||||
  template <> struct LowerPrecisionMapper<vRealF> {
 | 
			
		||||
    typedef vRealF HigherPrecVector ;
 | 
			
		||||
    typedef vRealH LowerPrecVector ;
 | 
			
		||||
  };
 | 
			
		||||
  template <> struct LowerPrecisionMapper<vRealD> {
 | 
			
		||||
    typedef vRealD HigherPrecVector ;
 | 
			
		||||
    typedef vRealF LowerPrecVector ;
 | 
			
		||||
  };
 | 
			
		||||
  template <> struct LowerPrecisionMapper<vComplexF> {
 | 
			
		||||
    typedef vComplexF HigherPrecVector ;
 | 
			
		||||
    typedef vComplexH LowerPrecVector ;
 | 
			
		||||
  };
 | 
			
		||||
  template <> struct LowerPrecisionMapper<vComplexD> {
 | 
			
		||||
    typedef vComplexD HigherPrecVector ;
 | 
			
		||||
    typedef vComplexF LowerPrecVector ;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct CoeffReal {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef RealD _Coeff_t;
 | 
			
		||||
    static const int Nhcs = 2;
 | 
			
		||||
    template<class Simd> using PrecisionMapper = SamePrecisionMapper<Simd>;
 | 
			
		||||
  };
 | 
			
		||||
  struct CoeffRealHalfComms {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef RealD _Coeff_t;
 | 
			
		||||
    static const int Nhcs = 1;
 | 
			
		||||
    template<class Simd> using PrecisionMapper = LowerPrecisionMapper<Simd>;
 | 
			
		||||
  };
 | 
			
		||||
  struct CoeffComplex {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef ComplexD _Coeff_t;
 | 
			
		||||
    static const int Nhcs = 2;
 | 
			
		||||
    template<class Simd> using PrecisionMapper = SamePrecisionMapper<Simd>;
 | 
			
		||||
  };
 | 
			
		||||
  struct CoeffComplexHalfComms {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef ComplexD _Coeff_t;
 | 
			
		||||
    static const int Nhcs = 1;
 | 
			
		||||
    template<class Simd> using PrecisionMapper = LowerPrecisionMapper<Simd>;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Implementation dependent fermion types
 | 
			
		||||
@@ -114,37 +159,40 @@ namespace QCD {
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Single flavour four spinors with colour index
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
 | 
			
		||||
  template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
 | 
			
		||||
  class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
    static const int Dimension = Representation::Dimension;
 | 
			
		||||
    static const bool LsVectorised=false;
 | 
			
		||||
    static const int Nhcs = Options::Nhcs;
 | 
			
		||||
 | 
			
		||||
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
 | 
			
		||||
    INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
      
 | 
			
		||||
    //Necessary?
 | 
			
		||||
    constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
 | 
			
		||||
    
 | 
			
		||||
    const bool LsVectorised=false;
 | 
			
		||||
    typedef _Coeff_t Coeff_t;
 | 
			
		||||
 | 
			
		||||
    INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
    typedef typename Options::_Coeff_t Coeff_t;
 | 
			
		||||
    typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
 | 
			
		||||
      
 | 
			
		||||
    template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
 | 
			
		||||
    template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
 | 
			
		||||
    template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
 | 
			
		||||
    template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
 | 
			
		||||
    template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
 | 
			
		||||
    
 | 
			
		||||
    typedef iImplSpinor<Simd>            SiteSpinor;
 | 
			
		||||
    typedef iImplPropagator<Simd>        SitePropagator;
 | 
			
		||||
    typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
 | 
			
		||||
    typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
 | 
			
		||||
    typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
 | 
			
		||||
    
 | 
			
		||||
    typedef Lattice<SiteSpinor>            FermionField;
 | 
			
		||||
    typedef Lattice<SitePropagator>        PropagatorField;
 | 
			
		||||
    typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
			
		||||
    
 | 
			
		||||
    typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
 | 
			
		||||
    typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
 | 
			
		||||
    typedef WilsonImplParams ImplParams;
 | 
			
		||||
    typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
 | 
			
		||||
    
 | 
			
		||||
@@ -209,31 +257,34 @@ namespace QCD {
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Single flavour four spinors with colour index, 5d redblack
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
 | 
			
		||||
template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
 | 
			
		||||
class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
  static const int Dimension = Nrepresentation;
 | 
			
		||||
  const bool LsVectorised=true;
 | 
			
		||||
  typedef _Coeff_t Coeff_t;      
 | 
			
		||||
  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
 | 
			
		||||
  
 | 
			
		||||
  INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
 | 
			
		||||
  static const int Dimension = Nrepresentation;
 | 
			
		||||
  static const bool LsVectorised=true;
 | 
			
		||||
  static const int Nhcs = Options::Nhcs;
 | 
			
		||||
      
 | 
			
		||||
  typedef typename Options::_Coeff_t Coeff_t;      
 | 
			
		||||
  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
 | 
			
		||||
  
 | 
			
		||||
  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
			
		||||
  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
 | 
			
		||||
  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
 | 
			
		||||
  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
 | 
			
		||||
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
 | 
			
		||||
  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
 | 
			
		||||
  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
 | 
			
		||||
  
 | 
			
		||||
  typedef iImplSpinor<Simd> SiteSpinor;
 | 
			
		||||
  typedef iImplPropagator<Simd> SitePropagator;
 | 
			
		||||
  typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
 | 
			
		||||
  typedef Lattice<SiteSpinor> FermionField;
 | 
			
		||||
  typedef Lattice<SitePropagator> PropagatorField;
 | 
			
		||||
  
 | 
			
		||||
  typedef iImplSpinor<Simd>            SiteSpinor;
 | 
			
		||||
  typedef iImplPropagator<Simd>        SitePropagator;
 | 
			
		||||
  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
 | 
			
		||||
  typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
 | 
			
		||||
  typedef Lattice<SiteSpinor>          FermionField;
 | 
			
		||||
  typedef Lattice<SitePropagator>      PropagatorField;
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////
 | 
			
		||||
  // Make the doubled gauge field a *scalar*
 | 
			
		||||
@@ -241,9 +292,9 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
 | 
			
		||||
  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
 | 
			
		||||
  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
 | 
			
		||||
  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
 | 
			
		||||
  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
			
		||||
  typedef Lattice<SiteDoubledGaugeField>                      DoubledGaugeField;
 | 
			
		||||
      
 | 
			
		||||
  typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
 | 
			
		||||
  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
 | 
			
		||||
  typedef WilsonImplParams ImplParams;
 | 
			
		||||
  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
 | 
			
		||||
  
 | 
			
		||||
@@ -311,35 +362,37 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    
 | 
			
		||||
template <class S, int Nrepresentation,class _Coeff_t = RealD>
 | 
			
		||||
template <class S, int Nrepresentation, class Options=CoeffReal>
 | 
			
		||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
 | 
			
		||||
 public:
 | 
			
		||||
 | 
			
		||||
 static const int Dimension = Nrepresentation;
 | 
			
		||||
 static const int Nhcs = Options::Nhcs;
 | 
			
		||||
 static const bool LsVectorised=false;
 | 
			
		||||
 | 
			
		||||
 const bool LsVectorised=false;
 | 
			
		||||
 | 
			
		||||
 typedef _Coeff_t Coeff_t;
 | 
			
		||||
 typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
 | 
			
		||||
 
 | 
			
		||||
 INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
 | 
			
		||||
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp >;
 | 
			
		||||
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
 | 
			
		||||
 typedef typename Options::_Coeff_t Coeff_t;
 | 
			
		||||
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
 | 
			
		||||
      
 | 
			
		||||
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>,   Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>,   Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>,  Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
 | 
			
		||||
 | 
			
		||||
 typedef iImplSpinor<Simd> SiteSpinor;
 | 
			
		||||
 typedef iImplPropagator<Simd> SitePropagator;
 | 
			
		||||
 typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
 | 
			
		||||
 typedef iImplSpinor<Simd>            SiteSpinor;
 | 
			
		||||
 typedef iImplPropagator<Simd>        SitePropagator;
 | 
			
		||||
 typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
 | 
			
		||||
 typedef iImplHalfCommSpinor<SimdL>   SiteHalfCommSpinor;
 | 
			
		||||
 typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
 | 
			
		||||
 | 
			
		||||
 typedef Lattice<SiteSpinor> FermionField;
 | 
			
		||||
 typedef Lattice<SitePropagator> PropagatorField;
 | 
			
		||||
 typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
			
		||||
 
 | 
			
		||||
 typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
 | 
			
		||||
 typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
 | 
			
		||||
 typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
 | 
			
		||||
 
 | 
			
		||||
 typedef GparityWilsonImplParams ImplParams;
 | 
			
		||||
@@ -356,8 +409,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
 | 
			
		||||
		      StencilImpl &St) {
 | 
			
		||||
 | 
			
		||||
  typedef SiteHalfSpinor vobj;
 | 
			
		||||
  typedef typename SiteHalfSpinor::scalar_object sobj;
 | 
			
		||||
   typedef SiteHalfSpinor vobj;
 | 
			
		||||
   typedef typename SiteHalfSpinor::scalar_object sobj;
 | 
			
		||||
	
 | 
			
		||||
   vobj vtmp;
 | 
			
		||||
   sobj stmp;
 | 
			
		||||
@@ -475,7 +528,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
   }
 | 
			
		||||
 }
 | 
			
		||||
      
 | 
			
		||||
      
 | 
			
		||||
 inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
 | 
			
		||||
 | 
			
		||||
   // DhopDir provides U or Uconj depending on coor/flavour.
 | 
			
		||||
@@ -508,23 +560,22 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Single flavour one component spinors with colour index
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template <class S, class Representation = FundamentalRepresentation >
 | 
			
		||||
  class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Single flavour one component spinors with colour index
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class S, class Representation = FundamentalRepresentation >
 | 
			
		||||
class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
    typedef RealD  _Coeff_t ;
 | 
			
		||||
    static const int Dimension = Representation::Dimension;
 | 
			
		||||
    static const bool LsVectorised=false;
 | 
			
		||||
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
 | 
			
		||||
      
 | 
			
		||||
    //Necessary?
 | 
			
		||||
    constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
 | 
			
		||||
    
 | 
			
		||||
    const bool LsVectorised=false;
 | 
			
		||||
    typedef _Coeff_t Coeff_t;
 | 
			
		||||
 | 
			
		||||
    INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
@@ -641,8 +692,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Single flavour one component spinors with colour index. 5d vec
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -651,16 +700,14 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
    typedef RealD  _Coeff_t ;
 | 
			
		||||
    static const int Dimension = Representation::Dimension;
 | 
			
		||||
    static const bool LsVectorised=true;
 | 
			
		||||
    typedef RealD   Coeff_t ;
 | 
			
		||||
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
 | 
			
		||||
      
 | 
			
		||||
    //Necessary?
 | 
			
		||||
    constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
 | 
			
		||||
 | 
			
		||||
    const bool LsVectorised=true;
 | 
			
		||||
 | 
			
		||||
    typedef _Coeff_t Coeff_t;
 | 
			
		||||
 | 
			
		||||
    INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
 | 
			
		||||
@@ -823,43 +870,61 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > WilsonImplR;  // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
 | 
			
		||||
 | 
			
		||||
typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRH;  // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDH;  // Double
 | 
			
		||||
 | 
			
		||||
 typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec
 | 
			
		||||
 typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
 | 
			
		||||
 typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double
 | 
			
		||||
typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
 | 
			
		||||
 | 
			
		||||
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
 | 
			
		||||
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
 | 
			
		||||
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
 | 
			
		||||
typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRH; // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDH; // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef WilsonImpl<vComplex,  AdjointRepresentation > WilsonAdjImplR;   // Real.. whichever prec
 | 
			
		||||
 typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF;  // Float
 | 
			
		||||
 typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD;  // Double
 | 
			
		||||
typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD;  // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR;   // Real.. whichever prec
 | 
			
		||||
 typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF;  // Float
 | 
			
		||||
 typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD;  // Double
 | 
			
		||||
typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplR;   // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
 typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
 | 
			
		||||
 typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
 typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
 | 
			
		||||
 typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRH; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDH; // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec
 | 
			
		||||
 typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float
 | 
			
		||||
 typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
 | 
			
		||||
 typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
 | 
			
		||||
 typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD;  // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRH; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDH; // Double
 | 
			
		||||
 
 | 
			
		||||
 typedef StaggeredVec5dImpl<vComplex,  FundamentalRepresentation > StaggeredVec5dImplR;   // Real.. whichever prec
 | 
			
		||||
 typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF;  // Float
 | 
			
		||||
 typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD;  // Double
 | 
			
		||||
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
 | 
			
		||||
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF;  // Float
 | 
			
		||||
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD;  // Double
 | 
			
		||||
 
 | 
			
		||||
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRH;  // Real.. whichever prec
 | 
			
		||||
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
 | 
			
		||||
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDH;  // Double
 | 
			
		||||
 | 
			
		||||
typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
 | 
			
		||||
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
 | 
			
		||||
typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD;  // Double
 | 
			
		||||
 | 
			
		||||
typedef StaggeredVec5dImpl<vComplex,  FundamentalRepresentation > StaggeredVec5dImplR;   // Real.. whichever prec
 | 
			
		||||
typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF;  // Float
 | 
			
		||||
typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD;  // Double
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -33,228 +33,292 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    int mu;
 | 
			
		||||
    int dag;
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// optimised versions supporting half precision too
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    WilsonCompressor(int _dag){
 | 
			
		||||
      mu=0;
 | 
			
		||||
      dag=_dag;
 | 
			
		||||
      assert((dag==0)||(dag==1));
 | 
			
		||||
    }
 | 
			
		||||
    void Point(int p) { 
 | 
			
		||||
      mu=p;
 | 
			
		||||
    };
 | 
			
		||||
template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
 | 
			
		||||
class WilsonCompressorTemplate;
 | 
			
		||||
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      int mudag=mu;
 | 
			
		||||
      if (!dag) {
 | 
			
		||||
	mudag=(mu+Nd)%(2*Nd);
 | 
			
		||||
      }
 | 
			
		||||
      switch(mudag) {
 | 
			
		||||
      case Xp:
 | 
			
		||||
	spProjXp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Yp:
 | 
			
		||||
	spProjYp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Zp:
 | 
			
		||||
	spProjZp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Tp:
 | 
			
		||||
	spProjTp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Xm:
 | 
			
		||||
	spProjXm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Ym:
 | 
			
		||||
	spProjYm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Zm:
 | 
			
		||||
	spProjZm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case Tm:
 | 
			
		||||
	spProjTm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      default: 
 | 
			
		||||
	assert(0);
 | 
			
		||||
	break;
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  /////////////////////////
 | 
			
		||||
  // optimised versions
 | 
			
		||||
  /////////////////////////
 | 
			
		||||
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 | 
			
		||||
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 | 
			
		||||
  typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
 | 
			
		||||
{
 | 
			
		||||
 public:
 | 
			
		||||
  
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonXpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjXp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonYpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjYp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonZpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjZp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonTpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjTp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  int mu,dag;  
 | 
			
		||||
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonXmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjXm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonYmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjYm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonZmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjZm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonTmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjTm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  void Point(int p) { mu=p; };
 | 
			
		||||
 | 
			
		||||
    // Fast comms buffer manipulation which should inline right through (avoid direction
 | 
			
		||||
    // dependent logic that prevents inlining
 | 
			
		||||
  template<class vobj,class cobj>
 | 
			
		||||
  class WilsonStencil : public CartesianStencil<vobj,cobj> {
 | 
			
		||||
  public:
 | 
			
		||||
  WilsonCompressorTemplate(int _dag=0){
 | 
			
		||||
    dag = _dag;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
    typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
  typedef _Spinor         SiteSpinor;
 | 
			
		||||
  typedef _Hspinor     SiteHalfSpinor;
 | 
			
		||||
  typedef _HCspinor SiteHalfCommSpinor;
 | 
			
		||||
  typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
 | 
			
		||||
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
 | 
			
		||||
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
 | 
			
		||||
 | 
			
		||||
    WilsonStencil(GridBase *grid,
 | 
			
		||||
  inline int CommDatumSize(void) {
 | 
			
		||||
    return sizeof(SiteHalfCommSpinor);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Compress includes precision change if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
 | 
			
		||||
    projector::Proj(buf[o],in,mu,dag);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Exchange includes precision change if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Exchange(SiteHalfSpinor *mp,
 | 
			
		||||
                       SiteHalfSpinor *vp0,
 | 
			
		||||
                       SiteHalfSpinor *vp1,
 | 
			
		||||
		       Integer type,Integer o){
 | 
			
		||||
    exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Have a decompression step if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Decompress(SiteHalfSpinor *out,
 | 
			
		||||
			 SiteHalfSpinor *in, Integer o) {    
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Compress Exchange                                 */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void CompressExchange(SiteHalfSpinor *out0,
 | 
			
		||||
			       SiteHalfSpinor *out1,
 | 
			
		||||
			       const SiteSpinor *in,
 | 
			
		||||
			       Integer j,Integer k, Integer m,Integer type){
 | 
			
		||||
    SiteHalfSpinor temp1, temp2,temp3,temp4;
 | 
			
		||||
    projector::Proj(temp1,in[k],mu,dag);
 | 
			
		||||
    projector::Proj(temp2,in[m],mu,dag);
 | 
			
		||||
    exchange(out0[j],out1[j],temp1,temp2,type);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Pass the info to the stencil */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline bool DecompressionStep(void) { return false; }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 | 
			
		||||
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 | 
			
		||||
  typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 | 
			
		||||
{
 | 
			
		||||
 public:
 | 
			
		||||
  
 | 
			
		||||
  int mu,dag;  
 | 
			
		||||
 | 
			
		||||
  void Point(int p) { mu=p; };
 | 
			
		||||
 | 
			
		||||
  WilsonCompressorTemplate(int _dag=0){
 | 
			
		||||
    dag = _dag;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  typedef _Spinor         SiteSpinor;
 | 
			
		||||
  typedef _Hspinor     SiteHalfSpinor;
 | 
			
		||||
  typedef _HCspinor SiteHalfCommSpinor;
 | 
			
		||||
  typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
 | 
			
		||||
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
 | 
			
		||||
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
 | 
			
		||||
 | 
			
		||||
  inline int CommDatumSize(void) {
 | 
			
		||||
    return sizeof(SiteHalfCommSpinor);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Compress includes precision change if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
 | 
			
		||||
    SiteHalfSpinor hsp;
 | 
			
		||||
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
 | 
			
		||||
    projector::Proj(hsp,in,mu,dag);
 | 
			
		||||
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Exchange includes precision change if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Exchange(SiteHalfSpinor *mp,
 | 
			
		||||
                       SiteHalfSpinor *vp0,
 | 
			
		||||
                       SiteHalfSpinor *vp1,
 | 
			
		||||
		       Integer type,Integer o){
 | 
			
		||||
    SiteHalfSpinor vt0,vt1;
 | 
			
		||||
    SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
 | 
			
		||||
    SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
 | 
			
		||||
    precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
 | 
			
		||||
    precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
 | 
			
		||||
    exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Have a decompression step if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Decompress(SiteHalfSpinor *out,
 | 
			
		||||
			 SiteHalfSpinor *in, Integer o){
 | 
			
		||||
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
 | 
			
		||||
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Compress Exchange                                 */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void CompressExchange(SiteHalfSpinor *out0,
 | 
			
		||||
			       SiteHalfSpinor *out1,
 | 
			
		||||
			       const SiteSpinor *in,
 | 
			
		||||
			       Integer j,Integer k, Integer m,Integer type){
 | 
			
		||||
    SiteHalfSpinor temp1, temp2,temp3,temp4;
 | 
			
		||||
    SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
 | 
			
		||||
    SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
 | 
			
		||||
    projector::Proj(temp1,in[k],mu,dag);
 | 
			
		||||
    projector::Proj(temp2,in[m],mu,dag);
 | 
			
		||||
    exchange(temp3,temp4,temp1,temp2,type);
 | 
			
		||||
    precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
 | 
			
		||||
    precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Pass the info to the stencil */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline bool DecompressionStep(void) { return true; }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#define DECLARE_PROJ(Projector,Compressor,spProj)			\
 | 
			
		||||
  class Projector {							\
 | 
			
		||||
  public:								\
 | 
			
		||||
    template<class hsp,class fsp>					\
 | 
			
		||||
    static void Proj(hsp &result,const fsp &in,int mu,int dag){			\
 | 
			
		||||
      spProj(result,in);						\
 | 
			
		||||
    }									\
 | 
			
		||||
  };									\
 | 
			
		||||
template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;
 | 
			
		||||
 | 
			
		||||
DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp);
 | 
			
		||||
DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp);
 | 
			
		||||
DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp);
 | 
			
		||||
DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp);
 | 
			
		||||
DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm);
 | 
			
		||||
DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm);
 | 
			
		||||
DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm);
 | 
			
		||||
DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm);
 | 
			
		||||
 | 
			
		||||
class WilsonProjector {
 | 
			
		||||
 public:
 | 
			
		||||
  template<class hsp,class fsp>
 | 
			
		||||
  static void Proj(hsp &result,const fsp &in,int mu,int dag){
 | 
			
		||||
    int mudag=dag? mu : (mu+Nd)%(2*Nd);
 | 
			
		||||
    switch(mudag) {
 | 
			
		||||
    case Xp:	spProjXp(result,in);	break;
 | 
			
		||||
    case Yp:	spProjYp(result,in);	break;
 | 
			
		||||
    case Zp:	spProjZp(result,in);	break;
 | 
			
		||||
    case Tp:	spProjTp(result,in);	break;
 | 
			
		||||
    case Xm:	spProjXm(result,in);	break;
 | 
			
		||||
    case Ym:	spProjYm(result,in);	break;
 | 
			
		||||
    case Zm:	spProjZm(result,in);	break;
 | 
			
		||||
    case Tm:	spProjTm(result,in);	break;
 | 
			
		||||
    default: 	assert(0);	        break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCompressorTemplate<HCS,HS,S,WilsonProjector>;
 | 
			
		||||
 | 
			
		||||
// Fast comms buffer manipulation which should inline right through (avoid direction
 | 
			
		||||
// dependent logic that prevents inlining
 | 
			
		||||
template<class vobj,class cobj>
 | 
			
		||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  WilsonStencil(GridBase *grid,
 | 
			
		||||
		int npoints,
 | 
			
		||||
		int checkerboard,
 | 
			
		||||
		const std::vector<int> &directions,
 | 
			
		||||
		const std::vector<int> &distances)  : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
 | 
			
		||||
      {    };
 | 
			
		||||
		const std::vector<int> &distances)  
 | 
			
		||||
   : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
 | 
			
		||||
  { /*Do nothing*/ };
 | 
			
		||||
 | 
			
		||||
    template < class compressor>
 | 
			
		||||
    void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
      HaloExchangeOptGather(source,compress);
 | 
			
		||||
      this->CommunicateBegin(reqs);
 | 
			
		||||
      this->calls++;
 | 
			
		||||
      this->CommunicateComplete(reqs);
 | 
			
		||||
      this->CommsMerge();
 | 
			
		||||
  template < class compressor>
 | 
			
		||||
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
    this->HaloExchangeOptGather(source,compress);
 | 
			
		||||
    this->CommunicateBegin(reqs);
 | 
			
		||||
    this->CommunicateComplete(reqs);
 | 
			
		||||
    this->CommsMerge(compress);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
  {
 | 
			
		||||
    this->Prepare();
 | 
			
		||||
    this->HaloGatherOpt(source,compress);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
  void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
 | 
			
		||||
  {
 | 
			
		||||
    // Strategy. Inherit types from Compressor.
 | 
			
		||||
    // Use types to select the write direction by directon compressor
 | 
			
		||||
    typedef typename compressor::SiteSpinor         SiteSpinor;
 | 
			
		||||
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
 | 
			
		||||
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 | 
			
		||||
 | 
			
		||||
    this->_grid->StencilBarrier();
 | 
			
		||||
 | 
			
		||||
    assert(source._grid==this->_grid);
 | 
			
		||||
    this->halogtime-=usecond();
 | 
			
		||||
    
 | 
			
		||||
    this->u_comm_offset=0;
 | 
			
		||||
      
 | 
			
		||||
    WilsonXpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XpCompress; 
 | 
			
		||||
    WilsonYpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YpCompress; 
 | 
			
		||||
    WilsonZpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZpCompress; 
 | 
			
		||||
    WilsonTpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TpCompress;
 | 
			
		||||
    WilsonXmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XmCompress; 
 | 
			
		||||
    WilsonYmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YmCompress; 
 | 
			
		||||
    WilsonZmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZmCompress; 
 | 
			
		||||
    WilsonTmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TmCompress;
 | 
			
		||||
 | 
			
		||||
    int dag = compress.dag;
 | 
			
		||||
    int face_idx=0;
 | 
			
		||||
    if ( dag ) { 
 | 
			
		||||
      //	std::cout << " Optimised Dagger compress " <<std::endl;
 | 
			
		||||
      this->HaloGatherDir(source,XpCompress,Xp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YpCompress,Yp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZpCompress,Zp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TpCompress,Tp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,XmCompress,Xm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YmCompress,Ym,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZmCompress,Zm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TmCompress,Tm,face_idx);
 | 
			
		||||
    } else {
 | 
			
		||||
      this->HaloGatherDir(source,XmCompress,Xp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YmCompress,Yp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZmCompress,Zp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TmCompress,Tp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,XpCompress,Xm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YpCompress,Ym,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZpCompress,Zm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TpCompress,Tm,face_idx);
 | 
			
		||||
    }
 | 
			
		||||
    this->face_table_computed=1;
 | 
			
		||||
    assert(this->u_comm_offset==this->_unified_buffer_size);
 | 
			
		||||
    this->halogtime+=usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
    template < class compressor>
 | 
			
		||||
    void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
    {
 | 
			
		||||
      this->calls++;
 | 
			
		||||
      this->Mergers.resize(0); 
 | 
			
		||||
      this->Packets.resize(0);
 | 
			
		||||
      this->HaloGatherOpt(source,compress);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template < class compressor>
 | 
			
		||||
    void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
 | 
			
		||||
    {
 | 
			
		||||
      this->_grid->StencilBarrier();
 | 
			
		||||
      // conformable(source._grid,_grid);
 | 
			
		||||
      assert(source._grid==this->_grid);
 | 
			
		||||
      this->halogtime-=usecond();
 | 
			
		||||
      
 | 
			
		||||
      this->u_comm_offset=0;
 | 
			
		||||
      
 | 
			
		||||
      int dag = compress.dag;
 | 
			
		||||
      
 | 
			
		||||
      WilsonXpCompressor<cobj,vobj> XpCompress; 
 | 
			
		||||
      WilsonYpCompressor<cobj,vobj> YpCompress; 
 | 
			
		||||
      WilsonZpCompressor<cobj,vobj> ZpCompress; 
 | 
			
		||||
      WilsonTpCompressor<cobj,vobj> TpCompress;
 | 
			
		||||
      WilsonXmCompressor<cobj,vobj> XmCompress;
 | 
			
		||||
      WilsonYmCompressor<cobj,vobj> YmCompress;
 | 
			
		||||
      WilsonZmCompressor<cobj,vobj> ZmCompress;
 | 
			
		||||
      WilsonTmCompressor<cobj,vobj> TmCompress;
 | 
			
		||||
 | 
			
		||||
      // Gather all comms buffers
 | 
			
		||||
      //    for(int point = 0 ; point < _npoints; point++) {
 | 
			
		||||
      //      compress.Point(point);
 | 
			
		||||
      //      HaloGatherDir(source,compress,point,face_idx);
 | 
			
		||||
      //    }
 | 
			
		||||
      int face_idx=0;
 | 
			
		||||
      if ( dag ) { 
 | 
			
		||||
	//	std::cout << " Optimised Dagger compress " <<std::endl;
 | 
			
		||||
	this->HaloGatherDir(source,XpCompress,Xp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,YpCompress,Yp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,ZpCompress,Zp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,TpCompress,Tp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,XmCompress,Xm,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,YmCompress,Ym,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,ZmCompress,Zm,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,TmCompress,Tm,face_idx);
 | 
			
		||||
      } else {
 | 
			
		||||
	this->HaloGatherDir(source,XmCompress,Xp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,YmCompress,Yp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,ZmCompress,Zp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,TmCompress,Tp,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,XpCompress,Xm,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,YpCompress,Ym,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,ZpCompress,Zm,face_idx);
 | 
			
		||||
	this->HaloGatherDir(source,TpCompress,Tm,face_idx);
 | 
			
		||||
      }
 | 
			
		||||
      this->face_table_computed=1;
 | 
			
		||||
      assert(this->u_comm_offset==this->_unified_buffer_size);
 | 
			
		||||
      this->halogtime+=usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 };
 | 
			
		||||
 | 
			
		||||
}} // namespace close
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -439,7 +439,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMerge();
 | 
			
		||||
  st.CommsMerge(compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
 
 | 
			
		||||
@@ -74,12 +74,14 @@ struct RealPart<std::complex<T> > {
 | 
			
		||||
  typedef T type;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#include <type_traits>
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////
 | 
			
		||||
// demote a vector to real type
 | 
			
		||||
//////////////////////////////////////
 | 
			
		||||
// type alias used to simplify the syntax of std::enable_if
 | 
			
		||||
template <typename T> using Invoke = typename T::type;
 | 
			
		||||
template <typename Condition, typename ReturnType> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType> >;
 | 
			
		||||
template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
 | 
			
		||||
template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
@@ -88,15 +90,15 @@ template <typename T> struct is_complex : public std::false_type {};
 | 
			
		||||
template <> struct is_complex<std::complex<double> > : public std::true_type {};
 | 
			
		||||
template <> struct is_complex<std::complex<float> > : public std::true_type {};
 | 
			
		||||
 | 
			
		||||
template <typename T> using IfReal       = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
 | 
			
		||||
template <typename T> using IfComplex    = Invoke<std::enable_if<is_complex<T>::value, int> >;
 | 
			
		||||
template <typename T> using IfInteger    = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
 | 
			
		||||
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<is_same<T1,T2>::value, int> >;
 | 
			
		||||
template <typename T>              using IfReal    = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
 | 
			
		||||
template <typename T>              using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
 | 
			
		||||
template <typename T>              using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
 | 
			
		||||
template <typename T1,typename T2> using IfSame    = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;
 | 
			
		||||
 | 
			
		||||
template <typename T> using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
 | 
			
		||||
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
 | 
			
		||||
template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
 | 
			
		||||
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!is_same<T1,T2>::value, int> >;
 | 
			
		||||
template <typename T>              using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
 | 
			
		||||
template <typename T>              using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
 | 
			
		||||
template <typename T>              using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
 | 
			
		||||
template <typename T1,typename T2> using IfNotSame    = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
// Define the operation templates functors
 | 
			
		||||
 
 | 
			
		||||
@@ -29,7 +29,7 @@
 | 
			
		||||
#define GRID_STENCIL_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/stencil/Lebesgue.h>   // subdir aggregate
 | 
			
		||||
#define NEW_XYZT_GATHER
 | 
			
		||||
 | 
			
		||||
 //////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 // Must not lose sight that goal is to be able to construct really efficient
 | 
			
		||||
 // gather to a point stencil code. CSHIFT is not the best way, so need
 | 
			
		||||
@@ -82,7 +82,7 @@ void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const La
 | 
			
		||||
{
 | 
			
		||||
  int num=table.size();
 | 
			
		||||
  parallel_for(int i=0;i<num;i++){
 | 
			
		||||
    vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
 | 
			
		||||
    compress.Compress(&buffer[off],table[i].first,rhs._odata[so+table[i].second]);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -102,14 +102,8 @@ void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const L
 | 
			
		||||
  int num=table.size()/2;
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  parallel_for(int j=0;j<num;j++){
 | 
			
		||||
    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
 | 
			
		||||
    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
 | 
			
		||||
    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
 | 
			
		||||
    cobj temp3;
 | 
			
		||||
    cobj temp4;
 | 
			
		||||
    exchange(temp3,temp4,temp1,temp2,type);
 | 
			
		||||
    vstream(pointers[0][j],temp3);
 | 
			
		||||
    vstream(pointers[1][j],temp4);
 | 
			
		||||
    compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs._odata[0],
 | 
			
		||||
			      j,so+table[2*j].second,so+table[2*j+1].second,type);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -121,8 +115,19 @@ struct StencilEntry {
 | 
			
		||||
  uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
//extern int dump;
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//Lattice object type, compressed object type.
 | 
			
		||||
//
 | 
			
		||||
//These only need to be known outside of halo exchange subset of methods 
 | 
			
		||||
//because byte offsets are precomputed
 | 
			
		||||
//
 | 
			
		||||
//It might help/be cleaner to add a "mobj" for the mpi transferred object to this list
 | 
			
		||||
//for the on the wire representation.
 | 
			
		||||
//
 | 
			
		||||
//This would clean up the "casts" in the WilsonCompressor.h file
 | 
			
		||||
//
 | 
			
		||||
//Other compressors (SimpleCompressor) retains mobj == cobj so no issue
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj,class cobj>
 | 
			
		||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
			
		||||
 public:
 | 
			
		||||
@@ -133,6 +138,60 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
  typedef typename cobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename cobj::scalar_object scalar_object;
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  // Timing info; ugly; possibly temporary
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  double commtime;
 | 
			
		||||
  double gathertime;
 | 
			
		||||
  double gathermtime;
 | 
			
		||||
  double halogtime;
 | 
			
		||||
  double mergetime;
 | 
			
		||||
  double decompresstime;
 | 
			
		||||
  double comms_bytes;
 | 
			
		||||
  double splicetime;
 | 
			
		||||
  double nosplicetime;
 | 
			
		||||
  double calls;
 | 
			
		||||
  
 | 
			
		||||
  void ZeroCounters(void) {
 | 
			
		||||
    gathertime = 0.;
 | 
			
		||||
    commtime = 0.;
 | 
			
		||||
    halogtime = 0.;
 | 
			
		||||
    mergetime = 0.;
 | 
			
		||||
    decompresstime = 0.;
 | 
			
		||||
    gathermtime = 0.;
 | 
			
		||||
    splicetime = 0.;
 | 
			
		||||
    nosplicetime = 0.;
 | 
			
		||||
    comms_bytes = 0.;
 | 
			
		||||
    calls = 0.;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  void Report(void) {
 | 
			
		||||
#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
 | 
			
		||||
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
 | 
			
		||||
    RealD NP = _grid->_Nprocessors;
 | 
			
		||||
    RealD NN = _grid->NodeCount();
 | 
			
		||||
    
 | 
			
		||||
    _grid->GlobalSum(commtime);    commtime/=NP;
 | 
			
		||||
    if ( calls > 0. ) {
 | 
			
		||||
      std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
 | 
			
		||||
      PRINTIT(halogtime);
 | 
			
		||||
      PRINTIT(gathertime);
 | 
			
		||||
      PRINTIT(gathermtime);
 | 
			
		||||
      PRINTIT(mergetime);
 | 
			
		||||
      PRINTIT(decompresstime);
 | 
			
		||||
      if(comms_bytes>1.0){
 | 
			
		||||
	PRINTIT(comms_bytes);
 | 
			
		||||
	PRINTIT(commtime);
 | 
			
		||||
	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      PRINTIT(splicetime);
 | 
			
		||||
      PRINTIT(nosplicetime);
 | 
			
		||||
    }
 | 
			
		||||
#undef PRINTIT
 | 
			
		||||
#undef AVERAGE
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////
 | 
			
		||||
  // Comms packet queue for asynch thread
 | 
			
		||||
  //////////////////////////////////////////
 | 
			
		||||
@@ -182,28 +241,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
    }
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
    commtime+=usecond();
 | 
			
		||||
    /*
 | 
			
		||||
    int dump=1;
 | 
			
		||||
    if(dump){
 | 
			
		||||
      for(int i=0;i<Packets.size();i++){
 | 
			
		||||
	cobj * ptr  = (cobj *) Packets[i].recv_buf;
 | 
			
		||||
	uint64_t num=Packets[i].bytes/sizeof(cobj);
 | 
			
		||||
	  std::cout << " CommunicateComplete " << i<< " / " << Packets.size()<< " num " << num <<std::endl;
 | 
			
		||||
	  std::stringstream ss;
 | 
			
		||||
	  ss<<"recvbuf";
 | 
			
		||||
	  for(int d=0;d<_grid->_ndimension;d++){
 | 
			
		||||
	    ss<<"."<<_grid->_processor_coor[d];
 | 
			
		||||
	  }
 | 
			
		||||
	  ss<<"_mu_"<<i;
 | 
			
		||||
	  std::string fname(ss.str());
 | 
			
		||||
	  std::ofstream fout(fname);
 | 
			
		||||
	  for(int k=0;k<num;k++) {
 | 
			
		||||
	    fout << i<<" "<<k<<" "<<ptr[k]<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    dump =0;
 | 
			
		||||
*/
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
@@ -214,66 +251,64 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
    std::vector<scalar_object *> rpointers;
 | 
			
		||||
    std::vector<cobj *> vpointers;
 | 
			
		||||
    Integer buffer_size;
 | 
			
		||||
    Integer packet_id;
 | 
			
		||||
    Integer exchange;
 | 
			
		||||
    Integer type;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  std::vector<Merge> Mergers;
 | 
			
		||||
 | 
			
		||||
  void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
 | 
			
		||||
    Merge m;
 | 
			
		||||
    m.exchange = 0;
 | 
			
		||||
    m.mpointer = merge_p;
 | 
			
		||||
    m.rpointers= rpointers;
 | 
			
		||||
    m.buffer_size = buffer_size;
 | 
			
		||||
    m.packet_id   = packet_id;
 | 
			
		||||
    Mergers.push_back(m);
 | 
			
		||||
  struct Decompress {
 | 
			
		||||
    cobj * kernel_p;
 | 
			
		||||
    cobj * mpi_p;
 | 
			
		||||
    Integer buffer_size;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  void Prepare(void)
 | 
			
		||||
  {
 | 
			
		||||
    Decompressions.resize(0); 
 | 
			
		||||
    Mergers.resize(0); 
 | 
			
		||||
    Packets.resize(0);
 | 
			
		||||
    calls++;
 | 
			
		||||
  }
 | 
			
		||||
  std::vector<Decompress> Decompressions;
 | 
			
		||||
 | 
			
		||||
  void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size) {
 | 
			
		||||
    Decompress d;
 | 
			
		||||
    d.kernel_p = k_p;
 | 
			
		||||
    d.mpi_p    = m_p;
 | 
			
		||||
    d.buffer_size = buffer_size;
 | 
			
		||||
    Decompressions.push_back(d);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
 | 
			
		||||
  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type) {
 | 
			
		||||
    Merge m;
 | 
			
		||||
    m.exchange = 1;
 | 
			
		||||
    m.type     = type;
 | 
			
		||||
    m.mpointer = merge_p;
 | 
			
		||||
    m.vpointers= rpointers;
 | 
			
		||||
    m.buffer_size = buffer_size;
 | 
			
		||||
    m.packet_id   = packet_id;
 | 
			
		||||
    Mergers.push_back(m);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void CommsMerge(void ) { 
 | 
			
		||||
  template<class decompressor>
 | 
			
		||||
  void CommsMerge(decompressor decompress) { 
 | 
			
		||||
 | 
			
		||||
    // Also do a precision convert possibly on a receive buffer
 | 
			
		||||
    for(int i=0;i<Mergers.size();i++){	
 | 
			
		||||
      mergetime-=usecond();
 | 
			
		||||
 | 
			
		||||
      //      std::cout << "Merge " <<i << std::endl;
 | 
			
		||||
      //      std::stringstream ss;
 | 
			
		||||
      //      ss<<"mergebuf";
 | 
			
		||||
      //      for(int d=0;d<_grid->_ndimension;d++){
 | 
			
		||||
      //	ss<<"."<<_grid->_processor_coor[d];
 | 
			
		||||
      //      }
 | 
			
		||||
      //      ss<<"_m_"<<i;
 | 
			
		||||
      //      std::string fname(ss.str());
 | 
			
		||||
      //      std::ofstream fout(fname);
 | 
			
		||||
 | 
			
		||||
      if ( Mergers[i].exchange == 0 ) { 
 | 
			
		||||
	parallel_for(int o=0;o<Mergers[i].buffer_size;o++){
 | 
			
		||||
	  merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 | 
			
		||||
	  //	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
      } else { 
 | 
			
		||||
	parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
 | 
			
		||||
	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
 | 
			
		||||
		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
 | 
			
		||||
	  //	  cobj temp1,temp2;
 | 
			
		||||
	  //	  exchange(temp1,temp2,Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
 | 
			
		||||
	  //	  vstream(Mergers[i].mpointer[2*o],temp1);
 | 
			
		||||
	  //	  vstream(Mergers[i].mpointer[2*o+1],temp2);
 | 
			
		||||
	}
 | 
			
		||||
      parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
 | 
			
		||||
	decompress.Exchange(Mergers[i].mpointer,
 | 
			
		||||
			    Mergers[i].vpointers[0],
 | 
			
		||||
			    Mergers[i].vpointers[1],
 | 
			
		||||
			    Mergers[i].type,o);
 | 
			
		||||
      }
 | 
			
		||||
      mergetime+=usecond();
 | 
			
		||||
    }
 | 
			
		||||
    for(int i=0;i<Decompressions.size();i++){	
 | 
			
		||||
      decompresstime-=usecond();
 | 
			
		||||
      parallel_for(int o=0;o<Decompressions[i].buffer_size;o++){
 | 
			
		||||
	decompress.Decompress(Decompressions[i].kernel_p,Decompressions[i].mpi_p,o);
 | 
			
		||||
      }      
 | 
			
		||||
      decompresstime+=usecond();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
@@ -304,7 +339,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
 | 
			
		||||
  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { 
 | 
			
		||||
    ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
 | 
			
		||||
    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
 | 
			
		||||
    local = _entries[ent]._is_local;
 | 
			
		||||
@@ -316,6 +354,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
      return cbase + _entries[ent]._byte_offset;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline uint64_t GetPFInfo(int ent,uint64_t base) {
 | 
			
		||||
    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
 | 
			
		||||
    int local = _entries[ent]._is_local;
 | 
			
		||||
@@ -327,89 +366,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
  // Unified Comms buffers for all directions
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // Vectors that live on the symmetric heap in case of SHMEM
 | 
			
		||||
  //  std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
 | 
			
		||||
  //  std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
 | 
			
		||||
  //  commVector<cobj>          u_send_buf_hide;
 | 
			
		||||
  //  commVector<cobj>          u_recv_buf_hide;
 | 
			
		||||
 | 
			
		||||
  // These are used; either SHM objects or refs to the above symmetric heap vectors
 | 
			
		||||
  // depending on comms target
 | 
			
		||||
  cobj* u_recv_buf_p;
 | 
			
		||||
  cobj* u_send_buf_p;
 | 
			
		||||
  std::vector<cobj *> new_simd_send_buf;
 | 
			
		||||
  std::vector<cobj *> new_simd_recv_buf;
 | 
			
		||||
  std::vector<scalar_object *> u_simd_send_buf;
 | 
			
		||||
  std::vector<scalar_object *> u_simd_recv_buf;
 | 
			
		||||
  std::vector<cobj *> u_simd_send_buf;
 | 
			
		||||
  std::vector<cobj *> u_simd_recv_buf;
 | 
			
		||||
 | 
			
		||||
  int u_comm_offset;
 | 
			
		||||
  int _unified_buffer_size;
 | 
			
		||||
  
 | 
			
		||||
  cobj *CommBuf(void) { return u_recv_buf_p; }
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  // Timing info; ugly; possibly temporary
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
#define TIMING_HACK
 | 
			
		||||
#ifdef TIMING_HACK
 | 
			
		||||
  double jointime;
 | 
			
		||||
  double gathertime;
 | 
			
		||||
  double commtime;
 | 
			
		||||
  double halogtime;
 | 
			
		||||
  double mergetime;
 | 
			
		||||
  double spintime;
 | 
			
		||||
  double comms_bytes;
 | 
			
		||||
  double gathermtime;
 | 
			
		||||
  double splicetime;
 | 
			
		||||
  double nosplicetime;
 | 
			
		||||
  double t_data;
 | 
			
		||||
  double t_table;
 | 
			
		||||
  double calls;
 | 
			
		||||
  
 | 
			
		||||
  void ZeroCounters(void) {
 | 
			
		||||
    gathertime = 0.;
 | 
			
		||||
    jointime = 0.;
 | 
			
		||||
    commtime = 0.;
 | 
			
		||||
    halogtime = 0.;
 | 
			
		||||
    mergetime = 0.;
 | 
			
		||||
    spintime = 0.;
 | 
			
		||||
    gathermtime = 0.;
 | 
			
		||||
    splicetime = 0.;
 | 
			
		||||
    nosplicetime = 0.;
 | 
			
		||||
    t_data = 0.0;
 | 
			
		||||
    t_table= 0.0;
 | 
			
		||||
    comms_bytes = 0.;
 | 
			
		||||
    calls = 0.;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  void Report(void) {
 | 
			
		||||
#define PRINTIT(A)	\
 | 
			
		||||
 std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
 | 
			
		||||
 | 
			
		||||
    RealD NP = _grid->_Nprocessors;
 | 
			
		||||
    RealD NN = _grid->NodeCount();
 | 
			
		||||
 | 
			
		||||
    _grid->GlobalSum(commtime);    commtime/=NP;
 | 
			
		||||
    if ( calls > 0. ) {
 | 
			
		||||
      std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
 | 
			
		||||
      PRINTIT(halogtime);
 | 
			
		||||
      PRINTIT(gathertime);
 | 
			
		||||
      PRINTIT(gathermtime);
 | 
			
		||||
      PRINTIT(mergetime);
 | 
			
		||||
      if(comms_bytes>1.0){
 | 
			
		||||
	PRINTIT(comms_bytes);
 | 
			
		||||
	PRINTIT(commtime);
 | 
			
		||||
	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      PRINTIT(jointime);
 | 
			
		||||
      PRINTIT(spintime);
 | 
			
		||||
      PRINTIT(splicetime);
 | 
			
		||||
      PRINTIT(nosplicetime);
 | 
			
		||||
      PRINTIT(t_table);
 | 
			
		||||
	   PRINTIT(t_data);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 #endif
 | 
			
		||||
 | 
			
		||||
 CartesianStencil(GridBase *grid,
 | 
			
		||||
		  int npoints,
 | 
			
		||||
@@ -493,21 +461,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
 | 
			
		||||
    u_simd_send_buf.resize(Nsimd);
 | 
			
		||||
    u_simd_recv_buf.resize(Nsimd);
 | 
			
		||||
    new_simd_send_buf.resize(Nsimd);
 | 
			
		||||
    new_simd_recv_buf.resize(Nsimd);
 | 
			
		||||
    u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
    u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
#ifdef NEW_XYZT_GATHER
 | 
			
		||||
 | 
			
		||||
    for(int l=0;l<2;l++){
 | 
			
		||||
      new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
      new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
      u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
      u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
    }
 | 
			
		||||
#else
 | 
			
		||||
    for(int l=0;l<Nsimd;l++){
 | 
			
		||||
      u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
 | 
			
		||||
      u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    PrecomputeByteOffsets();
 | 
			
		||||
  }
 | 
			
		||||
@@ -611,7 +571,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
	offnode = (comm_proc!= 0);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      
 | 
			
		||||
      int wraparound=0;
 | 
			
		||||
      if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 | 
			
		||||
	wraparound = 1;
 | 
			
		||||
@@ -738,13 +697,11 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
    calls++;
 | 
			
		||||
    Mergers.resize(0);
 | 
			
		||||
    Packets.resize(0);
 | 
			
		||||
    Prepare();
 | 
			
		||||
    HaloGather(source,compress);
 | 
			
		||||
    this->CommunicateBegin(reqs);
 | 
			
		||||
    this->CommunicateComplete(reqs);
 | 
			
		||||
    CommsMerge(); 
 | 
			
		||||
    CommunicateBegin(reqs);
 | 
			
		||||
    CommunicateComplete(reqs);
 | 
			
		||||
    CommsMerge(compress); 
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
 | 
			
		||||
@@ -775,13 +732,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
      if ( sshift[0] == sshift[1] ) {
 | 
			
		||||
	if (splice_dim) {
 | 
			
		||||
	  splicetime-=usecond();
 | 
			
		||||
	  //	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
 | 
			
		||||
	  //	  std::cout << "GatherSimdNew"<<std::endl;
 | 
			
		||||
#ifdef NEW_XYZT_GATHER
 | 
			
		||||
	  GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
 | 
			
		||||
#else 
 | 
			
		||||
	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
 | 
			
		||||
#endif
 | 
			
		||||
	  splicetime+=usecond();
 | 
			
		||||
	} else { 
 | 
			
		||||
	  nosplicetime-=usecond();
 | 
			
		||||
@@ -791,14 +742,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
      } else {
 | 
			
		||||
	if(splice_dim){
 | 
			
		||||
	  splicetime-=usecond();
 | 
			
		||||
	  //	  std::cout << "GatherSimdNew2calls"<<std::endl;
 | 
			
		||||
#ifdef NEW_XYZT_GATHER
 | 
			
		||||
	  GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 | 
			
		||||
	  GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 | 
			
		||||
#else 
 | 
			
		||||
	  GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 | 
			
		||||
	  GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 | 
			
		||||
#endif
 | 
			
		||||
	  splicetime+=usecond();
 | 
			
		||||
	} else {
 | 
			
		||||
	  nosplicetime-=usecond();
 | 
			
		||||
@@ -868,17 +813,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
	int words = buffer_size;
 | 
			
		||||
	if (cbmask != 0x3) words=words>>1;
 | 
			
		||||
	
 | 
			
		||||
	int bytes = words * sizeof(cobj);
 | 
			
		||||
	int bytes =  words * compress.CommDatumSize();
 | 
			
		||||
	
 | 
			
		||||
	gathertime-=usecond();
 | 
			
		||||
	int so  = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
	if ( !face_table_computed ) {
 | 
			
		||||
	  t_table-=usecond();
 | 
			
		||||
	  face_table.resize(face_idx+1);
 | 
			
		||||
	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
 | 
			
		||||
	  //	  std::cout << " face table size "<<face_idx <<" " <<  face_table[face_idx].size() <<" computed buffer size "<< words <<
 | 
			
		||||
	  //		    " bytes = " << bytes <<std::endl;
 | 
			
		||||
	  t_table+=usecond();
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
      	int rank           = _grid->_processor;
 | 
			
		||||
@@ -897,18 +837,33 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
	  send_buf = u_send_buf_p;
 | 
			
		||||
	} 
 | 
			
		||||
 | 
			
		||||
	t_data-=usecond();
 | 
			
		||||
	cobj *recv_buf;
 | 
			
		||||
	gathertime-=usecond();
 | 
			
		||||
	assert(send_buf!=NULL);
 | 
			
		||||
	Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);  face_idx++;
 | 
			
		||||
	t_data+=usecond();
 | 
			
		||||
	
 | 
			
		||||
	AddPacket((void *)&send_buf[u_comm_offset],
 | 
			
		||||
		  (void *)&u_recv_buf_p[u_comm_offset],
 | 
			
		||||
		  xmit_to_rank,
 | 
			
		||||
		  recv_from_rank,
 | 
			
		||||
		  bytes);
 | 
			
		||||
 | 
			
		||||
	gathertime+=usecond();
 | 
			
		||||
	
 | 
			
		||||
	if ( compress.DecompressionStep() ) {
 | 
			
		||||
 | 
			
		||||
	  recv_buf = &u_simd_recv_buf[0][u_comm_offset];
 | 
			
		||||
	  
 | 
			
		||||
	  AddDecompress(&u_recv_buf_p[u_comm_offset],
 | 
			
		||||
			&recv_buf[u_comm_offset],
 | 
			
		||||
			words);
 | 
			
		||||
 | 
			
		||||
	  AddPacket((void *)&send_buf[u_comm_offset],
 | 
			
		||||
		    (void *)&recv_buf[u_comm_offset],
 | 
			
		||||
		    xmit_to_rank,
 | 
			
		||||
		    recv_from_rank,
 | 
			
		||||
		    bytes);
 | 
			
		||||
 | 
			
		||||
	} else {
 | 
			
		||||
	  AddPacket((void *)&send_buf[u_comm_offset],
 | 
			
		||||
		    (void *)&u_recv_buf_p[u_comm_offset],
 | 
			
		||||
		    xmit_to_rank,
 | 
			
		||||
		    recv_from_rank,
 | 
			
		||||
		    bytes);
 | 
			
		||||
	}
 | 
			
		||||
	u_comm_offset+=words;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
@@ -919,123 +874,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
  {
 | 
			
		||||
    const int Nsimd = _grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
    int fd = _grid->_fdimensions[dimension];
 | 
			
		||||
    int rd = _grid->_rdimensions[dimension];
 | 
			
		||||
    int ld = _grid->_ldimensions[dimension];
 | 
			
		||||
    int pd              = _grid->_processors[dimension];
 | 
			
		||||
    int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
    int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
    
 | 
			
		||||
    assert(comm_dim==1);
 | 
			
		||||
    // This will not work with a rotate dim
 | 
			
		||||
    assert(simd_layout==2);
 | 
			
		||||
    assert(shift>=0);
 | 
			
		||||
    assert(shift<fd);
 | 
			
		||||
    
 | 
			
		||||
    int permute_type=_grid->PermuteType(dimension);
 | 
			
		||||
    
 | 
			
		||||
    ///////////////////////////////////////////////
 | 
			
		||||
    // Simd direction uses an extract/merge pair
 | 
			
		||||
    ///////////////////////////////////////////////
 | 
			
		||||
    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
			
		||||
    int words = sizeof(cobj)/sizeof(vector_type);
 | 
			
		||||
    
 | 
			
		||||
    assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 | 
			
		||||
    
 | 
			
		||||
    int bytes = buffer_size*sizeof(scalar_object);
 | 
			
		||||
    
 | 
			
		||||
    std::vector<scalar_object *> rpointers(Nsimd);
 | 
			
		||||
    std::vector<scalar_object *> spointers(Nsimd);
 | 
			
		||||
 | 
			
		||||
    //    std::cout << "GatherSimd " << dimension << " shift "<<shift<<std::endl;
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    // Work out what to send where
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    
 | 
			
		||||
    int cb    = (cbmask==0x2)? Odd : Even;
 | 
			
		||||
    int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
			
		||||
    
 | 
			
		||||
    // loop over outer coord planes orthog to dim
 | 
			
		||||
    for(int x=0;x<rd;x++){       
 | 
			
		||||
      
 | 
			
		||||
      int any_offnode = ( ((x+sshift)%fd) >= rd );
 | 
			
		||||
 | 
			
		||||
      if ( any_offnode ) {
 | 
			
		||||
	
 | 
			
		||||
	for(int i=0;i<Nsimd;i++){       
 | 
			
		||||
	  spointers[i] = &u_simd_send_buf[i][u_comm_offset];
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	int sx   = (x+sshift)%rd;
 | 
			
		||||
	
 | 
			
		||||
	gathermtime-=usecond();
 | 
			
		||||
	Gather_plane_extract<cobj>(rhs,spointers,dimension,sx,cbmask,compress);
 | 
			
		||||
	gathermtime+=usecond();
 | 
			
		||||
 | 
			
		||||
	for(int i=0;i<Nsimd;i++){
 | 
			
		||||
	  
 | 
			
		||||
	  // FIXME -  This logic is hard coded to simd_layout==2 and not allowing >2
 | 
			
		||||
	  //	  for(int w=0;w<buffer_size;w++){
 | 
			
		||||
	  //	    std::cout << "GatherSimd<"<<Nsimd<<"> : lane " << i <<" elem "<<w<<" "<< u_simd_send_buf[i ][u_comm_offset+w]<<std::endl;
 | 
			
		||||
	  //	  }
 | 
			
		||||
	  int inner_bit = (Nsimd>>(permute_type+1));
 | 
			
		||||
	  int ic= (i&inner_bit)? 1:0;
 | 
			
		||||
	  
 | 
			
		||||
	  int my_coor  = rd*ic + x;
 | 
			
		||||
	  int nbr_coor = my_coor+sshift;
 | 
			
		||||
	  int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
 | 
			
		||||
	  int nbr_lcoor= (nbr_coor%ld);
 | 
			
		||||
	  int nbr_ic   = (nbr_lcoor)/rd;    // inner coord of peer
 | 
			
		||||
	  int nbr_ox   = (nbr_lcoor%rd);    // outer coord of peer
 | 
			
		||||
	  int nbr_lane = (i&(~inner_bit));
 | 
			
		||||
	  
 | 
			
		||||
	  if (nbr_ic) nbr_lane|=inner_bit;
 | 
			
		||||
	  assert (sx == nbr_ox);
 | 
			
		||||
 | 
			
		||||
	  auto rp = &u_simd_recv_buf[i       ][u_comm_offset];
 | 
			
		||||
	  auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
 | 
			
		||||
 | 
			
		||||
	  if(nbr_proc){
 | 
			
		||||
	    
 | 
			
		||||
	    int recv_from_rank;
 | 
			
		||||
	    int xmit_to_rank;
 | 
			
		||||
	    
 | 
			
		||||
	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
			
		||||
 
 | 
			
		||||
	    // shm == receive pointer         if offnode
 | 
			
		||||
	    // shm == Translate[send pointer] if on node -- my view of his send pointer
 | 
			
		||||
	    scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
 | 
			
		||||
	    if (shm==NULL) { 
 | 
			
		||||
	      shm = rp;
 | 
			
		||||
	    }
 | 
			
		||||
 | 
			
		||||
	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
 | 
			
		||||
	    // assuming above pointer flip
 | 
			
		||||
	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
 | 
			
		||||
 | 
			
		||||
	    rpointers[i] = shm;
 | 
			
		||||
	    
 | 
			
		||||
	  } else { 
 | 
			
		||||
	    
 | 
			
		||||
	    rpointers[i] = sp;
 | 
			
		||||
	    
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
 | 
			
		||||
	
 | 
			
		||||
	u_comm_offset     +=buffer_size;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  template<class compressor>
 | 
			
		||||
  void  GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
 | 
			
		||||
  {
 | 
			
		||||
    const int Nsimd = _grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
    const int maxl =2;// max layout in a direction
 | 
			
		||||
    int fd = _grid->_fdimensions[dimension];
 | 
			
		||||
    int rd = _grid->_rdimensions[dimension];
 | 
			
		||||
@@ -1085,25 +923,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
 | 
			
		||||
	
 | 
			
		||||
	for(int i=0;i<maxl;i++){       
 | 
			
		||||
	  spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
 | 
			
		||||
	  spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset];
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	int sx   = (x+sshift)%rd;
 | 
			
		||||
 | 
			
		||||
	//	if ( cbmask==0x3 ) { 
 | 
			
		||||
	//	  std::vector<std::pair<int,int> > table;
 | 
			
		||||
	t_table-=usecond();
 | 
			
		||||
	if ( !face_table_computed ) {
 | 
			
		||||
	  face_table.resize(face_idx+1);
 | 
			
		||||
	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
 | 
			
		||||
	  //	  std::cout << " face table size "<<face_idx <<" " <<  face_table[face_idx].size() <<" computed buffer size "<< reduced_buffer_size <<
 | 
			
		||||
	  //		    " bytes = "<<bytes <<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
	t_table+=usecond();
 | 
			
		||||
	gathermtime-=usecond();
 | 
			
		||||
	Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);  face_idx++;
 | 
			
		||||
	gathermtime+=usecond();
 | 
			
		||||
      
 | 
			
		||||
	//spointers[0] -- low
 | 
			
		||||
	//spointers[1] -- high
 | 
			
		||||
 | 
			
		||||
@@ -1120,8 +951,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
	  int nbr_plane = nbr_ic;
 | 
			
		||||
	  assert (sx == nbr_ox);
 | 
			
		||||
 | 
			
		||||
	  auto rp = &new_simd_recv_buf[i        ][u_comm_offset];
 | 
			
		||||
	  auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
 | 
			
		||||
	  auto rp = &u_simd_recv_buf[i        ][u_comm_offset];
 | 
			
		||||
	  auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset];
 | 
			
		||||
 | 
			
		||||
	  if(nbr_proc){
 | 
			
		||||
 | 
			
		||||
@@ -1150,7 +981,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,Packets.size()-1,permute_type);
 | 
			
		||||
	AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type);
 | 
			
		||||
 | 
			
		||||
	u_comm_offset     +=buffer_size;
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user