mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	RRII gpu option
This commit is contained in:
		@@ -131,7 +131,7 @@ public:
 | 
			
		||||
  template<class obj> void GlobalSum(obj &o){
 | 
			
		||||
    typedef typename obj::scalar_type scalar_type;
 | 
			
		||||
    int words = sizeof(obj)/sizeof(scalar_type);
 | 
			
		||||
    scalar_type * ptr = (scalar_type *)& o;
 | 
			
		||||
    scalar_type * ptr = (scalar_type *)& o; // Safe alias 
 | 
			
		||||
    GlobalSumVector(ptr,words);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
 | 
			
		||||
  typename std::remove_const<vobj>::type ret;
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_object scalar_object;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  //  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  const int Nsimd = vobj::vector_type::Nsimd();
 | 
			
		||||
 
 | 
			
		||||
@@ -32,7 +32,6 @@ template<class vobj>
 | 
			
		||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
 | 
			
		||||
@@ -82,7 +81,6 @@ template<class vobj>
 | 
			
		||||
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
 | 
			
		||||
@@ -130,7 +128,6 @@ template<class vobj>
 | 
			
		||||
static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  
 | 
			
		||||
  GridBase *FullGrid  = lhs.Grid();
 | 
			
		||||
 
 | 
			
		||||
@@ -96,9 +96,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 | 
			
		||||
 | 
			
		||||
  GridBase *grid=l.Grid();
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
 | 
			
		||||
@@ -136,9 +133,6 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 | 
			
		||||
        
 | 
			
		||||
  GridBase *grid=l.Grid();
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
 | 
			
		||||
@@ -179,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 | 
			
		||||
  idx= grid->iIndex(site);
 | 
			
		||||
  odx= grid->oIndex(site);
 | 
			
		||||
  
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&l[odx];
 | 
			
		||||
  const vector_type *vp = (const vector_type *) &l[odx];
 | 
			
		||||
  scalar_type * pt = (scalar_type *)&s;
 | 
			
		||||
      
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    pt[w] = vp[idx+w*Nsimd];
 | 
			
		||||
    pt[w] = getlane(vp[w],idx);
 | 
			
		||||
  }
 | 
			
		||||
      
 | 
			
		||||
  return;
 | 
			
		||||
@@ -216,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 | 
			
		||||
  idx= grid->iIndex(site);
 | 
			
		||||
  odx= grid->oIndex(site);
 | 
			
		||||
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&l[odx];
 | 
			
		||||
  vector_type * vp = (vector_type *)&l[odx];
 | 
			
		||||
  scalar_type * pt = (scalar_type *)&s;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    vp[idx+w*Nsimd] = pt[w];
 | 
			
		||||
    putlane(vp[w],pt[w],idx);
 | 
			
		||||
  }
 | 
			
		||||
  return;
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
@@ -219,7 +219,6 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_typeD vector_type;
 | 
			
		||||
  ComplexD  nrm;
 | 
			
		||||
  
 | 
			
		||||
@@ -296,7 +295,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 | 
			
		||||
  conformable(z,x);
 | 
			
		||||
  conformable(x,y);
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  //  typedef typename vobj::vector_typeD vector_type;
 | 
			
		||||
  RealD  nrm;
 | 
			
		||||
  
 | 
			
		||||
@@ -341,7 +339,6 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 | 
			
		||||
{
 | 
			
		||||
  conformable(left,right);
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_typeD vector_type;
 | 
			
		||||
  Vector<ComplexD> tmp(2);
 | 
			
		||||
 | 
			
		||||
@@ -597,7 +594,8 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 | 
			
		||||
			    int orthogdim,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
{
 | 
			
		||||
  // perhaps easier to just promote A to a field and use regular madd
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
@@ -628,8 +626,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 | 
			
		||||
    for(int l=0;l<Nsimd;l++){
 | 
			
		||||
      grid->iCoorFromIindex(icoor,l);
 | 
			
		||||
      int ldx =r+icoor[orthogdim]*rd;
 | 
			
		||||
      scalar_type *as =(scalar_type *)&av;
 | 
			
		||||
      as[l] = scalar_type(a[ldx])*zscale;
 | 
			
		||||
      av.putlane(scalar_type(a[ldx])*zscale,l);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    tensor_reduced at; at=av;
 | 
			
		||||
@@ -669,7 +666,6 @@ template<class vobj>
 | 
			
		||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
 | 
			
		||||
@@ -723,7 +719,6 @@ template<class vobj>
 | 
			
		||||
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
 | 
			
		||||
@@ -777,7 +772,6 @@ template<class vobj>
 | 
			
		||||
static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  
 | 
			
		||||
  GridBase *FullGrid  = lhs.Grid();
 | 
			
		||||
 
 | 
			
		||||
@@ -250,8 +250,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::vector_type  vector;
 | 
			
		||||
  typedef typename vobj::scalar_typeD scalarD;
 | 
			
		||||
  typedef typename vobj::scalar_objectD sobj;
 | 
			
		||||
  sobj ret;
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -677,10 +677,10 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 | 
			
		||||
      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
 | 
			
		||||
      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
 | 
			
		||||
      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
 | 
			
		||||
      scalar_type * fp = (scalar_type *)&f_v[odx_f];
 | 
			
		||||
      scalar_type * tp = (scalar_type *)&t_v[odx_t];
 | 
			
		||||
      vector_type * fp = (vector_type *)&f_v[odx_f];
 | 
			
		||||
      vector_type * tp = (vector_type *)&t_v[odx_t];
 | 
			
		||||
      for(int w=0;w<words;w++){
 | 
			
		||||
	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
 | 
			
		||||
	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
 
 | 
			
		||||
@@ -905,88 +905,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
#undef TopRowWithSource
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 | 
			
		||||
						 Vector<iSinglet<Simd> > & Matp,
 | 
			
		||||
						 Vector<iSinglet<Simd> > & Matm)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = this->FermionRedBlackGrid();
 | 
			
		||||
  int LLs = grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
  if ( LLs == Ls ) {
 | 
			
		||||
    return; // Not vectorised in 5th direction
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
  
 | 
			
		||||
  for(int s=0;s<Ls;s++){
 | 
			
		||||
    Pplus(s,s) = bee[s];
 | 
			
		||||
    Pminus(s,s)= bee[s];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  for(int s=0;s<Ls-1;s++){
 | 
			
		||||
    Pminus(s,s+1) = -cee[s];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  for(int s=0;s<Ls-1;s++){
 | 
			
		||||
    Pplus(s+1,s) = -cee[s+1];
 | 
			
		||||
  }
 | 
			
		||||
  Pplus (0,Ls-1) = mass*cee[0];
 | 
			
		||||
  Pminus(Ls-1,0) = mass*cee[Ls-1];
 | 
			
		||||
  
 | 
			
		||||
  Eigen::MatrixXcd PplusMat ;
 | 
			
		||||
  Eigen::MatrixXcd PminusMat;
 | 
			
		||||
  
 | 
			
		||||
  if ( inv ) {
 | 
			
		||||
    PplusMat =Pplus.inverse();
 | 
			
		||||
    PminusMat=Pminus.inverse();
 | 
			
		||||
  } else { 
 | 
			
		||||
    PplusMat =Pplus;
 | 
			
		||||
    PminusMat=Pminus;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if(dag){
 | 
			
		||||
    PplusMat.adjointInPlace();
 | 
			
		||||
    PminusMat.adjointInPlace();
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
 | 
			
		||||
  const int Nsimd=Simd::Nsimd();
 | 
			
		||||
  Matp.resize(Ls*LLs);
 | 
			
		||||
  Matm.resize(Ls*LLs);
 | 
			
		||||
 | 
			
		||||
  for(int s2=0;s2<Ls;s2++){
 | 
			
		||||
    for(int s1=0;s1<LLs;s1++){
 | 
			
		||||
      int istride = LLs;
 | 
			
		||||
      int ostride = 1;
 | 
			
		||||
      Simd Vp;
 | 
			
		||||
      Simd Vm;
 | 
			
		||||
      scalar_type *sp = (scalar_type *)&Vp;
 | 
			
		||||
      scalar_type *sm = (scalar_type *)&Vm;
 | 
			
		||||
      for(int l=0;l<Nsimd;l++){
 | 
			
		||||
	if ( switcheroo<Coeff_t>::iscomplex() ) {
 | 
			
		||||
	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
	} else { 
 | 
			
		||||
	  // if real
 | 
			
		||||
	  scalar_type tmp;
 | 
			
		||||
	  tmp = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
	  sp[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
	  tmp = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
	  sm[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      Matp[LLs*s2+s1] = Vp;
 | 
			
		||||
      Matm[LLs*s2+s1] = Vm;
 | 
			
		||||
    }}
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -615,7 +615,6 @@ public:
 | 
			
		||||
    GridBase *grid = out.Grid();
 | 
			
		||||
 | 
			
		||||
    typedef typename LatticeMatrixType::vector_type vector_type;
 | 
			
		||||
    typedef typename LatticeMatrixType::scalar_type scalar_type;
 | 
			
		||||
 | 
			
		||||
    typedef iSinglet<vector_type> vTComplexType;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -501,7 +501,7 @@ struct Conj{
 | 
			
		||||
struct TimesMinusI{
 | 
			
		||||
  // Complex
 | 
			
		||||
  template <typename T>
 | 
			
		||||
  inline vec<T> operator()(vec<T> a, vec<T> b){
 | 
			
		||||
  inline vec<T> operator()(vec<T> a){
 | 
			
		||||
    vec<T> out;
 | 
			
		||||
    const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
 | 
			
		||||
    svbool_t pg1 = acle<T>::pg1();
 | 
			
		||||
@@ -520,7 +520,7 @@ struct TimesMinusI{
 | 
			
		||||
struct TimesI{
 | 
			
		||||
  // Complex
 | 
			
		||||
  template <typename T>
 | 
			
		||||
  inline vec<T> operator()(vec<T> a, vec<T> b){
 | 
			
		||||
  inline vec<T> operator()(vec<T> a){
 | 
			
		||||
    vec<T> out;
 | 
			
		||||
    const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
 | 
			
		||||
    svbool_t pg1 = acle<T>::pg1();
 | 
			
		||||
 
 | 
			
		||||
@@ -418,7 +418,7 @@ struct Conj{
 | 
			
		||||
 | 
			
		||||
struct TimesMinusI{
 | 
			
		||||
  // Complex float
 | 
			
		||||
  inline vecf operator()(vecf a, vecf b){
 | 
			
		||||
  inline vecf operator()(vecf a){
 | 
			
		||||
    lutf tbl_swap = acle<float>::tbl_swap();
 | 
			
		||||
    pred pg1 = acle<float>::pg1();
 | 
			
		||||
    pred pg_odd = acle<float>::pg_odd();
 | 
			
		||||
@@ -428,7 +428,7 @@ struct TimesMinusI{
 | 
			
		||||
    return svneg_m(a_v, pg_odd, a_v);
 | 
			
		||||
  }
 | 
			
		||||
  // Complex double
 | 
			
		||||
  inline vecd operator()(vecd a, vecd b){
 | 
			
		||||
  inline vecd operator()(vecd a){
 | 
			
		||||
    lutd tbl_swap = acle<double>::tbl_swap();
 | 
			
		||||
    pred pg1 = acle<double>::pg1();
 | 
			
		||||
    pred pg_odd = acle<double>::pg_odd();
 | 
			
		||||
@@ -441,7 +441,7 @@ struct TimesMinusI{
 | 
			
		||||
 | 
			
		||||
struct TimesI{
 | 
			
		||||
  // Complex float
 | 
			
		||||
  inline vecf operator()(vecf a, vecf b){
 | 
			
		||||
  inline vecf operator()(vecf a){
 | 
			
		||||
    lutf tbl_swap = acle<float>::tbl_swap();
 | 
			
		||||
    pred pg1 = acle<float>::pg1();
 | 
			
		||||
    pred pg_even = acle<float>::pg_even();
 | 
			
		||||
@@ -451,7 +451,7 @@ struct TimesI{
 | 
			
		||||
    return svneg_m(a_v, pg_even, a_v);
 | 
			
		||||
  }
 | 
			
		||||
  // Complex double
 | 
			
		||||
  inline vecd operator()(vecd a, vecd b){
 | 
			
		||||
  inline vecd operator()(vecd a){
 | 
			
		||||
    lutd tbl_swap = acle<double>::tbl_swap();
 | 
			
		||||
    pred pg1 = acle<double>::pg1();
 | 
			
		||||
    pred pg_even = acle<double>::pg_even();
 | 
			
		||||
 
 | 
			
		||||
@@ -405,12 +405,12 @@ struct Conj{
 | 
			
		||||
 | 
			
		||||
struct TimesMinusI{
 | 
			
		||||
  //Complex single
 | 
			
		||||
  inline __m256 operator()(__m256 in, __m256 ret){
 | 
			
		||||
  inline __m256 operator()(__m256 in){
 | 
			
		||||
    __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);   // r,-i
 | 
			
		||||
    return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
 | 
			
		||||
  }
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline __m256d operator()(__m256d in, __m256d ret){
 | 
			
		||||
  inline __m256d operator()(__m256d in){
 | 
			
		||||
    __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
 | 
			
		||||
    return _mm256_shuffle_pd(tmp,tmp,0x5);
 | 
			
		||||
  }
 | 
			
		||||
@@ -418,12 +418,12 @@ struct TimesMinusI{
 | 
			
		||||
 | 
			
		||||
struct TimesI{
 | 
			
		||||
  //Complex single
 | 
			
		||||
  inline __m256 operator()(__m256 in, __m256 ret){
 | 
			
		||||
  inline __m256 operator()(__m256 in){
 | 
			
		||||
    __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
 | 
			
		||||
    return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // i,-r
 | 
			
		||||
  }
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline __m256d operator()(__m256d in, __m256d ret){
 | 
			
		||||
  inline __m256d operator()(__m256d in){
 | 
			
		||||
    __m256d tmp = _mm256_shuffle_pd(in,in,0x5);
 | 
			
		||||
    return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -271,14 +271,14 @@ struct Conj{
 | 
			
		||||
 | 
			
		||||
struct TimesMinusI{
 | 
			
		||||
  //Complex single
 | 
			
		||||
  inline __m512 operator()(__m512 in, __m512 ret){
 | 
			
		||||
  inline __m512 operator()(__m512 in){
 | 
			
		||||
    //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
 | 
			
		||||
    //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
 | 
			
		||||
    __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
    return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
 | 
			
		||||
  }
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline __m512d operator()(__m512d in, __m512d ret){
 | 
			
		||||
  inline __m512d operator()(__m512d in){
 | 
			
		||||
    //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
 | 
			
		||||
    //return _mm512_shuffle_pd(tmp,tmp,0x55);
 | 
			
		||||
    __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
 | 
			
		||||
@@ -288,17 +288,16 @@ struct TimesMinusI{
 | 
			
		||||
 | 
			
		||||
struct TimesI{
 | 
			
		||||
  //Complex single
 | 
			
		||||
  inline __m512 operator()(__m512 in, __m512 ret){
 | 
			
		||||
  inline __m512 operator()(__m512 in){
 | 
			
		||||
    __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
    return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); 
 | 
			
		||||
  }
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline __m512d operator()(__m512d in, __m512d ret){
 | 
			
		||||
  inline __m512d operator()(__m512d in){
 | 
			
		||||
    __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
 | 
			
		||||
    return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
  
 | 
			
		||||
// Gpermute utilities consider coalescing into 1 Gpermute
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										878
									
								
								Grid/simd/Grid_gpu_rrii.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										878
									
								
								Grid/simd/Grid_gpu_rrii.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,878 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./Grid/simd/Grid_gpu_rrii.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2021
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
/*! @file Grid_gpu_rrii.h*/
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
//////////////////////////////
 | 
			
		||||
// fp16
 | 
			
		||||
//////////////////////////////
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#include <cuda_fp16.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
#include <hip/hip_fp16.h>
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_HIP) && !defined(GRID_CUDA) 
 | 
			
		||||
namespace Grid {
  // Host-side stand-in for the GPU half type, used when neither GRID_CUDA nor
  // GRID_HIP supplies one: a plain wrapper around the 16 stored bits.
  struct half {
    uint16_t x;
  };
}
 | 
			
		||||
#endif
 | 
			
		||||
namespace Grid {
 | 
			
		||||
  accelerator_inline float half2float(half h)
 | 
			
		||||
  {
 | 
			
		||||
    float f;
 | 
			
		||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
    f = __half2float(h);
 | 
			
		||||
#else 
 | 
			
		||||
    Grid_half hh; 
 | 
			
		||||
    hh.x = h.x;
 | 
			
		||||
    f=  sfw_half_to_float(hh);
 | 
			
		||||
#endif
 | 
			
		||||
    return f;
 | 
			
		||||
  }
 | 
			
		||||
  accelerator_inline half float2half(float f)
 | 
			
		||||
  {
 | 
			
		||||
    half h;
 | 
			
		||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
    h = __float2half(f);
 | 
			
		||||
#else
 | 
			
		||||
    Grid_half hh = sfw_float_to_half(f);
 | 
			
		||||
    h.x = hh.x;
 | 
			
		||||
#endif
 | 
			
		||||
    return h;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Real vector
 | 
			
		||||
////////////////////////////////////////////////////////////////////////  
 | 
			
		||||
// Fixed-width vector of real lanes: rrrr[i] is the value held in lane i.
// Template parameters:
//   Nlanes - number of lanes; re-exported as the static member N
//   Datum  - per-lane element type; re-exported as the member type datum
// The parameter names deliberately avoid the "_Upper" spelling, which is
// reserved for the implementation (and _N is a macro in some C library
// <ctype.h> headers).
template<int Nlanes, class Datum>
struct GpuVector {
  Datum rrrr[Nlanes];           // one value per lane
  static const int N = Nlanes;  // lane count, visible to generic code
  typedef Datum datum;          // lane element type, visible to generic code
};
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuVector<N,datum> operator*(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
 | 
			
		||||
  GpuVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]*r.rrrr[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuVector<N,datum> operator-(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
 | 
			
		||||
  GpuVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuVector<N,datum> operator+(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
 | 
			
		||||
  GpuVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
 | 
			
		||||
  GpuVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Complex vector
 | 
			
		||||
////////////////////////////////////////////////////////////////////////  
 | 
			
		||||
// Fixed-width vector of complex lanes stored as two separate arrays:
// rrrr[i] and iiii[i] together form the value of lane i (the arithmetic
// operators on this type treat rrrr as the real part and iiii as the
// imaginary part).
// Template parameters:
//   Nlanes - number of complex lanes; re-exported as the static member N
//   Datum  - element type of each part; re-exported as the member type datum
// The parameter names deliberately avoid the "_Upper" spelling, which is
// reserved for the implementation (and _N is a macro in some C library
// <ctype.h> headers).
template<int Nlanes, class Datum>
struct GpuComplexVector {
  Datum rrrr[Nlanes];           // first component of every lane
  Datum iiii[Nlanes];           // second component of every lane
  static const int N = Nlanes;  // lane count, visible to generic code
  typedef Datum datum;          // component element type, visible to generic code
};
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuComplexVector<N,datum> operator*(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
 | 
			
		||||
  GpuComplexVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]*r.rrrr[i] - l.iiii[i]*r.iiii[i];
 | 
			
		||||
    ret.iiii[i] = l.rrrr[i]*r.iiii[i] + l.iiii[i]*r.rrrr[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuComplexVector<N,datum> operator-(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
 | 
			
		||||
  GpuComplexVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
 | 
			
		||||
    ret.iiii[i] = l.iiii[i]-r.iiii[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuComplexVector<N,datum> operator+(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
 | 
			
		||||
  GpuComplexVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
 | 
			
		||||
    ret.iiii[i] = l.iiii[i]+r.iiii[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<int N,class datum>
 | 
			
		||||
inline accelerator GpuComplexVector<N,datum> operator/(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
 | 
			
		||||
  GpuComplexVector<N,datum> ret;
 | 
			
		||||
  for(int i=0;i<N;i++) { 
 | 
			
		||||
    ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
 | 
			
		||||
    ret.iiii[i] = l.iiii[i]/r.iiii[i];
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////
 | 
			
		||||
// SIMD counts
 | 
			
		||||
////////////////////////////////
 | 
			
		||||
 | 
			
		||||
constexpr int NSIMD_RealH    = COALESCE_GRANULARITY / sizeof(half);
 | 
			
		||||
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half);
 | 
			
		||||
constexpr int NSIMD_RealF    = COALESCE_GRANULARITY / sizeof(float);
 | 
			
		||||
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float);
 | 
			
		||||
constexpr int NSIMD_RealD    = COALESCE_GRANULARITY / sizeof(double);
 | 
			
		||||
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double);
 | 
			
		||||
constexpr int NSIMD_Integer  = COALESCE_GRANULARITY / sizeof(Integer);
 | 
			
		||||
 | 
			
		||||
typedef GpuVector<NSIMD_RealH   , half        > GpuVectorRH;
 | 
			
		||||
typedef GpuComplexVector<NSIMD_ComplexH, half > GpuVectorCH;
 | 
			
		||||
typedef GpuVector<NSIMD_RealF,    float       > GpuVectorRF;
 | 
			
		||||
typedef GpuComplexVector<NSIMD_ComplexF, float> GpuVectorCF;
 | 
			
		||||
typedef GpuVector<NSIMD_RealD,    double      > GpuVectorRD;
 | 
			
		||||
typedef GpuComplexVector<NSIMD_ComplexD,double> GpuVectorCD;
 | 
			
		||||
typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;
 | 
			
		||||
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
    //Complex float
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(float a, float b){
 | 
			
		||||
      GpuVectorCF ret;
 | 
			
		||||
      for(int i=0;i<GpuVectorCF::N;i++){
 | 
			
		||||
	ret.rrrr[i] = typename GpuVectorCF::datum(a);
 | 
			
		||||
	ret.iiii[i] = typename GpuVectorCF::datum(b);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    accelerator_inline GpuVectorRF operator()(float a){
 | 
			
		||||
      GpuVectorRF ret;
 | 
			
		||||
      for(int i=0;i<GpuVectorRF::N;i++){
 | 
			
		||||
	ret.rrrr[i] = typename GpuVectorRF::datum(a);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(double a, double b){
 | 
			
		||||
      GpuVectorCD ret;
 | 
			
		||||
      for(int i=0;i<GpuVectorCD::N;i++){
 | 
			
		||||
	ret.rrrr[i] = typename GpuVectorCD::datum(a);
 | 
			
		||||
	ret.iiii[i] = typename GpuVectorCD::datum(b);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    //Real double
 | 
			
		||||
    accelerator_inline GpuVectorRD operator()(double a){
 | 
			
		||||
      GpuVectorRD ret; 
 | 
			
		||||
      for(int i=0;i<GpuVectorRD::N;i++){
 | 
			
		||||
	ret.rrrr[i] = typename GpuVectorRD::datum(a);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    accelerator_inline GpuVectorI operator()(Integer a){
 | 
			
		||||
      GpuVectorI ret;
 | 
			
		||||
      for(int i=0;i<GpuVectorI::N;i++){
 | 
			
		||||
	ret.rrrr[i] = typename GpuVectorI::datum(a);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstore{
 | 
			
		||||
    template<int N,class datum,class P>
 | 
			
		||||
    accelerator_inline void operator()(GpuVector<N,datum> a, P* Fp){
 | 
			
		||||
      GpuVector<N,datum> *vF = (GpuVector<N,datum> *)Fp;
 | 
			
		||||
      *vF = a;
 | 
			
		||||
    }
 | 
			
		||||
    template<int N,class datum,class P>
 | 
			
		||||
    accelerator_inline void operator()(GpuComplexVector<N,datum> a, P* Fp){
 | 
			
		||||
      GpuComplexVector<N,datum> *vF = (GpuComplexVector<N,datum> *)Fp;
 | 
			
		||||
      *vF = a;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstream{
 | 
			
		||||
    template<int N,class datum, class P>
 | 
			
		||||
    accelerator_inline void operator()(P* F,GpuVector<N,datum> a){
 | 
			
		||||
      GpuVector<N,datum> *vF = (GpuVector<N,datum> *)F;
 | 
			
		||||
      *vF = a;
 | 
			
		||||
    }
 | 
			
		||||
    template<int N,class datum, class P>
 | 
			
		||||
    accelerator_inline void operator()(P* F,GpuComplexVector<N,datum> a){
 | 
			
		||||
      GpuComplexVector<N,datum> *vF = (GpuComplexVector<N,datum> *)F;
 | 
			
		||||
      *vF = a;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vset{
 | 
			
		||||
    // Complex float 
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = vec::datum(a[i].real());
 | 
			
		||||
	ret.iiii[i] = vec::datum(a[i].imag());
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double 
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = vec::datum(a[i].real());
 | 
			
		||||
	ret.iiii[i] = vec::datum(a[i].imag());
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float 
 | 
			
		||||
    accelerator_inline GpuVectorRF operator()(float *a){
 | 
			
		||||
      typedef GpuVectorRF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = vec::datum(a[i]);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    accelerator_inline GpuVectorRD operator()(double *a){
 | 
			
		||||
      typedef GpuVectorRD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = vec::datum(a[i]);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    accelerator_inline GpuVectorI operator()(Integer *a){
 | 
			
		||||
      typedef GpuVectorI vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = vec::datum(a[i]);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template <typename Out_type, typename In_type>
 | 
			
		||||
  struct Reduce{
 | 
			
		||||
    //Need templated class to overload output type
 | 
			
		||||
    //General form must generate error if compiled
 | 
			
		||||
    accelerator_inline Out_type operator()(In_type in){
 | 
			
		||||
      printf("Error, using wrong Reduce function\n");
 | 
			
		||||
      exit(1);
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  // Arithmetic operations
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  struct Sum{
 | 
			
		||||
    //Real float
 | 
			
		||||
    accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){
 | 
			
		||||
      return a+b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){
 | 
			
		||||
      return a+b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
 | 
			
		||||
      return a+b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
 | 
			
		||||
      return a+b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){
 | 
			
		||||
      return a+b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Sub{
 | 
			
		||||
    accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = a.rrrr[i]*b.rrrr[i];
 | 
			
		||||
	ret.iiii[i] = a.rrrr[i]*b.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = a.rrrr[i]*b.rrrr[i];
 | 
			
		||||
	ret.iiii[i] = a.rrrr[i]*b.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MaddRealPart{
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b,GpuVectorCF c){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = a.rrrr[i]*b.rrrr[i]+c.rrrr[i];
 | 
			
		||||
	ret.iiii[i] = a.rrrr[i]*b.iiii[i]+c.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b,GpuVectorCD c){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = a.rrrr[i]*b.rrrr[i]+c.rrrr[i];
 | 
			
		||||
	ret.iiii[i] = a.rrrr[i]*b.iiii[i]+c.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Mult{
 | 
			
		||||
    accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c){
 | 
			
		||||
      a= a+b*c;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c){
 | 
			
		||||
      a= a+b*c;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b){
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Div{
 | 
			
		||||
    // Real float
 | 
			
		||||
    accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b){
 | 
			
		||||
      return a/b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
 | 
			
		||||
      return a/b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
 | 
			
		||||
      return a/b;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Danger -- element wise divide fro complex, not complex div. 
 | 
			
		||||
    // See Grid_vector_types.h lines around 735, applied after "toReal"
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){
 | 
			
		||||
      return a/b;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b){
 | 
			
		||||
      return a/b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  struct Conj{
 | 
			
		||||
    // Complex single
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = in.rrrr[i];
 | 
			
		||||
	ret.iiii[i] =-in.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = in.rrrr[i];
 | 
			
		||||
	ret.iiii[i] =-in.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesMinusI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = in.iiii[i];
 | 
			
		||||
	ret.iiii[i] =-in.rrrr[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] = in.iiii[i];
 | 
			
		||||
	ret.iiii[i] =-in.rrrr[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] =-in.iiii[i];
 | 
			
		||||
	ret.iiii[i] = in.rrrr[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
	ret.rrrr[i] =-in.iiii[i];
 | 
			
		||||
	ret.iiii[i] = in.rrrr[i];
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Permute{
 | 
			
		||||
 | 
			
		||||
    template <int n,int _N, class _datum >
 | 
			
		||||
    static accelerator_inline GpuVector<_N,_datum> PermuteN(GpuVector<_N,_datum> &in) {   
 | 
			
		||||
      typedef GpuVector<_N,_datum> vec;
 | 
			
		||||
      vec out;					
 | 
			
		||||
      unsigned int _mask = vec::N >> (n + 1);	
 | 
			
		||||
      for(int i=0;i<vec::N;i++) {
 | 
			
		||||
	out.rrrr[i] = in.rrrr[i^_mask];
 | 
			
		||||
      }
 | 
			
		||||
      return out;	
 | 
			
		||||
    }
 | 
			
		||||
    template <int n,int _N, class _datum >
 | 
			
		||||
    static accelerator_inline GpuComplexVector<_N,_datum> PermuteN(GpuComplexVector<_N,_datum> &in) {   
 | 
			
		||||
      typedef GpuComplexVector<_N,_datum> vec;
 | 
			
		||||
      vec out;					
 | 
			
		||||
      unsigned int _mask = vec::N >> (n + 1);	
 | 
			
		||||
      for(int i=0;i<vec::N;i++) {
 | 
			
		||||
	out.rrrr[i] = in.rrrr[i^_mask];
 | 
			
		||||
	out.iiii[i] = in.iiii[i^_mask];
 | 
			
		||||
      }
 | 
			
		||||
      return out;	
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    template <typename vec>  static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec::N,typename vec::datum>(in);  }
 | 
			
		||||
    template <typename vec>  static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec::N,typename vec::datum>(in);  }
 | 
			
		||||
    template <typename vec>  static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec::N,typename vec::datum>(in);  }
 | 
			
		||||
    template <typename vec>  static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec::N,typename vec::datum>(in);  }
 | 
			
		||||
    
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct PrecisionChange {
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Single / Half
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
     static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
 | 
			
		||||
      int N = GpuVectorCF::N;
 | 
			
		||||
      GpuVectorCH h;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
        h.rrrr[i  ] = float2half(a.rrrr[i]);
 | 
			
		||||
        h.iiii[i  ] = float2half(a.iiii[i]);
 | 
			
		||||
	h.rrrr[i+N] = float2half(b.rrrr[i]);
 | 
			
		||||
	h.iiii[i+N] = float2half(b.iiii[i]);
 | 
			
		||||
      }
 | 
			
		||||
      return h;
 | 
			
		||||
    }
 | 
			
		||||
    static accelerator_inline void  HtoS (GpuVectorCH h,GpuVectorCF &sa,GpuVectorCF &sb) {
 | 
			
		||||
      int N = GpuVectorCF::N;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
	sa.rrrr[i] = half2float(h.rrrr[i  ]);
 | 
			
		||||
	sa.iiii[i] = half2float(h.iiii[i  ]);
 | 
			
		||||
	sb.rrrr[i] = half2float(h.rrrr[i+N]);
 | 
			
		||||
	sb.iiii[i] = half2float(h.iiii[i+N]);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static accelerator_inline GpuVectorRH StoH (GpuVectorRF a,GpuVectorRF b) {
 | 
			
		||||
      int N = GpuVectorRF::N;
 | 
			
		||||
      GpuVectorRH h;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
        h.rrrr[i  ] = float2half(a.rrrr[i]);
 | 
			
		||||
	h.rrrr[i+N] = float2half(b.rrrr[i]);
 | 
			
		||||
      }
 | 
			
		||||
      return h;
 | 
			
		||||
    }
 | 
			
		||||
    static accelerator_inline void  HtoS (GpuVectorRH h,GpuVectorRF &sa,GpuVectorRF &sb) {
 | 
			
		||||
      int N = GpuVectorRF::N;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
	sa.rrrr[i] = half2float(h.rrrr[i  ]);
 | 
			
		||||
	sb.rrrr[i] = half2float(h.rrrr[i+N]);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Double Single
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    static accelerator_inline GpuVectorCF DtoS (GpuVectorCD a,GpuVectorCD b) {
 | 
			
		||||
      int N = GpuVectorCD::N;
 | 
			
		||||
      GpuVectorCF h;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
        h.rrrr[i  ] = a.rrrr[i];
 | 
			
		||||
        h.iiii[i  ] = a.iiii[i];
 | 
			
		||||
	h.rrrr[i+N] = b.rrrr[i];
 | 
			
		||||
	h.iiii[i+N] = b.iiii[i];
 | 
			
		||||
      }
 | 
			
		||||
      return h;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static accelerator_inline void  StoD (GpuVectorCF h,GpuVectorCD &sa,GpuVectorCD &sb) {
 | 
			
		||||
      int N = GpuVectorCD::N;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
	sa.rrrr[i] = h.rrrr[i  ];
 | 
			
		||||
	sa.iiii[i] = h.iiii[i  ];
 | 
			
		||||
	sb.rrrr[i] = h.rrrr[i+N];
 | 
			
		||||
	sb.iiii[i] = h.iiii[i+N];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static accelerator_inline GpuVectorRF DtoS (GpuVectorRD a,GpuVectorRD b) {
 | 
			
		||||
      int N = GpuVectorRD::N;
 | 
			
		||||
      GpuVectorRF h;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
        h.rrrr[i  ] = a.rrrr[i];
 | 
			
		||||
	h.rrrr[i+N] = b.rrrr[i];
 | 
			
		||||
      }
 | 
			
		||||
      return h;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static accelerator_inline void  StoD (GpuVectorRF h,GpuVectorRD &sa,GpuVectorRD &sb) {
 | 
			
		||||
      int N = GpuVectorRD::N;
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
	sa.rrrr[i] = h.rrrr[i  ];
 | 
			
		||||
	sb.rrrr[i] = h.rrrr[i+N];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Double Half
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    static accelerator_inline GpuVectorCH DtoH (GpuVectorCD a,GpuVectorCD b,GpuVectorCD c,GpuVectorCD d) {
 | 
			
		||||
      GpuVectorCF sa,sb;
 | 
			
		||||
      sa = DtoS(a,b);
 | 
			
		||||
      sb = DtoS(c,d);
 | 
			
		||||
      return StoH(sa,sb);
 | 
			
		||||
    }
 | 
			
		||||
    static accelerator_inline void HtoD (GpuVectorCH h,GpuVectorCD &a,GpuVectorCD &b,GpuVectorCD &c,GpuVectorCD &d) {
 | 
			
		||||
      GpuVectorCF sa,sb;
 | 
			
		||||
      HtoS(h,sa,sb);
 | 
			
		||||
      StoD(sa,a,b);
 | 
			
		||||
      StoD(sb,c,d);
 | 
			
		||||
    }
 | 
			
		||||
    static accelerator_inline GpuVectorRH DtoH (GpuVectorRD a,GpuVectorRD b,GpuVectorRD c,GpuVectorRD d) {
 | 
			
		||||
      GpuVectorRF sa,sb;
 | 
			
		||||
      sa = DtoS(a,b);
 | 
			
		||||
      sb = DtoS(c,d);
 | 
			
		||||
      return StoH(sa,sb);
 | 
			
		||||
    }
 | 
			
		||||
    static accelerator_inline void HtoD (GpuVectorRH h,GpuVectorRD &a,GpuVectorRD &b,GpuVectorRD &c,GpuVectorRD &d) {
 | 
			
		||||
      GpuVectorRF sa,sb;
 | 
			
		||||
      HtoS(h,sa,sb);
 | 
			
		||||
      StoD(sa,a,b);
 | 
			
		||||
      StoD(sb,c,d);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
struct Exchange{
 | 
			
		||||
 | 
			
		||||
  template <int n,int _N, class _datum >
 | 
			
		||||
  static accelerator_inline void ExchangeN(GpuVector<_N,_datum> &out1,
 | 
			
		||||
					   GpuVector<_N,_datum> &out2,
 | 
			
		||||
					   GpuVector<_N,_datum> &in1,
 | 
			
		||||
					   GpuVector<_N,_datum> &in2 )
 | 
			
		||||
  {   
 | 
			
		||||
    typedef GpuVector<_N,_datum> vec;
 | 
			
		||||
    unsigned int mask = vec::N >> (n + 1);
 | 
			
		||||
    for(int i=0;i<vec::N;i++) {
 | 
			
		||||
      int j1 = i&(~mask);
 | 
			
		||||
      if  ( (i&mask) == 0 ) { out1.rrrr[i]=in1.rrrr[j1];}
 | 
			
		||||
      else                  { out1.rrrr[i]=in2.rrrr[j1];}
 | 
			
		||||
      int j2 = i|mask;
 | 
			
		||||
      if  ( (i&mask) == 0 ) { out2.rrrr[i]=in1.rrrr[j2];}
 | 
			
		||||
      else                  { out2.rrrr[i]=in2.rrrr[j2];}
 | 
			
		||||
    }      
 | 
			
		||||
  }
 | 
			
		||||
  template <int n,int _N, class _datum >
 | 
			
		||||
  static accelerator_inline void ExchangeN(GpuComplexVector<_N,_datum> &out1,
 | 
			
		||||
					   GpuComplexVector<_N,_datum> &out2,
 | 
			
		||||
					   GpuComplexVector<_N,_datum> &in1,
 | 
			
		||||
					   GpuComplexVector<_N,_datum> &in2 )
 | 
			
		||||
  {   
 | 
			
		||||
    typedef GpuComplexVector<_N,_datum> vec;
 | 
			
		||||
    unsigned int mask = vec::N >> (n + 1);
 | 
			
		||||
    for(int i=0;i<vec::N;i++) {
 | 
			
		||||
      int j1 = i&(~mask);
 | 
			
		||||
      if  ( (i&mask) == 0 ) {
 | 
			
		||||
	out1.rrrr[i]=in1.rrrr[j1];
 | 
			
		||||
	out1.iiii[i]=in1.iiii[j1];
 | 
			
		||||
      }
 | 
			
		||||
      else                  {
 | 
			
		||||
	out1.rrrr[i]=in2.rrrr[j1];
 | 
			
		||||
	out1.iiii[i]=in2.iiii[j1];
 | 
			
		||||
      }
 | 
			
		||||
      int j2 = i|mask;
 | 
			
		||||
      if  ( (i&mask) == 0 ) {
 | 
			
		||||
	out2.rrrr[i]=in1.rrrr[j2];
 | 
			
		||||
	out2.iiii[i]=in1.iiii[j2];
 | 
			
		||||
      }
 | 
			
		||||
      else                  {
 | 
			
		||||
	out2.rrrr[i]=in2.rrrr[j2];
 | 
			
		||||
	out2.iiii[i]=in2.iiii[j2];
 | 
			
		||||
      }
 | 
			
		||||
    }      
 | 
			
		||||
  }
 | 
			
		||||
  template <typename vec>
 | 
			
		||||
  static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){
 | 
			
		||||
    ExchangeN<0>(out1,out2,in1,in2);
 | 
			
		||||
  };
 | 
			
		||||
  template <typename vec>
 | 
			
		||||
  static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){
 | 
			
		||||
    ExchangeN<1>(out1,out2,in1,in2);
 | 
			
		||||
  };
 | 
			
		||||
  template <typename vec>
 | 
			
		||||
  static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){
 | 
			
		||||
    ExchangeN<2>(out1,out2,in1,in2);
 | 
			
		||||
  };
 | 
			
		||||
  template <typename vec>
 | 
			
		||||
  static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){
 | 
			
		||||
    ExchangeN<3>(out1,out2,in1,in2);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct Rotate{
 | 
			
		||||
 | 
			
		||||
  template <int n, typename vec> static accelerator_inline vec tRotate(vec in){
 | 
			
		||||
    return rotate(in, n);
 | 
			
		||||
  }
 | 
			
		||||
    
 | 
			
		||||
  template <int _N, class _datum >
 | 
			
		||||
  static accelerator_inline GpuComplexVector<_N,_datum> rotate_template(GpuComplexVector<_N,_datum> &in, int n)
 | 
			
		||||
  {
 | 
			
		||||
    typedef GpuComplexVector<_N,_datum> vec;
 | 
			
		||||
    vec out;
 | 
			
		||||
    for(int i=0;i<vec::N;i++){
 | 
			
		||||
      out.rrrr[i] = in.rrrr[(i + n)%vec::N];
 | 
			
		||||
      out.iiii[i] = in.iiii[(i + n)%vec::N];
 | 
			
		||||
    }
 | 
			
		||||
    return out;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <int _N, class _datum >
 | 
			
		||||
  static accelerator_inline GpuVector<_N,_datum> rotate_template(GpuVector<_N,_datum> &in, int n)
 | 
			
		||||
  {
 | 
			
		||||
    typedef GpuVector<_N,_datum> vec;
 | 
			
		||||
    vec out;
 | 
			
		||||
    for(int i=0;i<vec::N;i++){
 | 
			
		||||
      out.rrrr[i] = in.rrrr[(i + n)%vec::N];
 | 
			
		||||
    }
 | 
			
		||||
    return out;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  typedef GpuVectorRH  SIMD_Htype; // Single precision type
 | 
			
		||||
  typedef GpuVectorRF  SIMD_Ftype; // Single precision type
 | 
			
		||||
  typedef GpuVectorRD  SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef GpuVectorI   SIMD_Itype; // Integer type
 | 
			
		||||
 | 
			
		||||
  typedef GpuVectorCH  SIMD_CHtype; // Single precision type
 | 
			
		||||
  typedef GpuVectorCF  SIMD_CFtype; // Single precision type
 | 
			
		||||
  typedef GpuVectorCD  SIMD_CDtype; // Double precision type
 | 
			
		||||
 | 
			
		||||
  static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n){ return rotate_template(in,n);}
 | 
			
		||||
  static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n){ return rotate_template(in,n);}
 | 
			
		||||
  static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n){ return rotate_template(in,n);}
 | 
			
		||||
  static accelerator_inline GpuVectorI  rotate(GpuVectorI  in, int n){ return rotate_template(in,n);}
 | 
			
		||||
  static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n){ return rotate_template(in,n/2);} // Measure in complex not float
 | 
			
		||||
  static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n){ return rotate_template(in,n/2);}
 | 
			
		||||
  static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n){ return rotate_template(in,n/2);}
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// Some Template specialization
 | 
			
		||||
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  accelerator_inline Grid::ComplexF 
 | 
			
		||||
  Reduce<Grid::ComplexF, GpuVectorCF>::operator()(GpuVectorCF in)
 | 
			
		||||
  {
 | 
			
		||||
    Grid::ComplexF greduce(in.rrrr[0],in.iiii[0]);
 | 
			
		||||
    for(int i=1;i<GpuVectorCF::N;i++) {
 | 
			
		||||
      greduce = greduce+Grid::ComplexF(in.rrrr[i],in.iiii[i]);
 | 
			
		||||
    }
 | 
			
		||||
    return greduce;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<>
 | 
			
		||||
  accelerator_inline Grid::ComplexD
 | 
			
		||||
  Reduce<Grid::ComplexD, GpuVectorCD>::operator()(GpuVectorCD in)
 | 
			
		||||
  {
 | 
			
		||||
    Grid::ComplexD greduce(in.rrrr[0],in.iiii[0]);
 | 
			
		||||
    for(int i=1;i<GpuVectorCD::N;i++) {
 | 
			
		||||
      greduce = greduce+ Grid::ComplexD(in.rrrr[i],in.iiii[i]);
 | 
			
		||||
    }
 | 
			
		||||
    return greduce;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Real
 | 
			
		||||
  template<>
 | 
			
		||||
  accelerator_inline Grid::RealF 
 | 
			
		||||
  Reduce<RealF, GpuVectorRF>::operator()(GpuVectorRF in)
 | 
			
		||||
  {
 | 
			
		||||
    RealF ret = in.rrrr[0];
 | 
			
		||||
    for(int i=1;i<GpuVectorRF::N;i++) {
 | 
			
		||||
      ret = ret+in.rrrr[i];
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<>
 | 
			
		||||
  accelerator_inline Grid::RealD 
 | 
			
		||||
  Reduce<RealD, GpuVectorRD>::operator()(GpuVectorRD in)
 | 
			
		||||
  {
 | 
			
		||||
    RealD ret = in.rrrr[0];
 | 
			
		||||
    for(int i=1;i<GpuVectorRD::N;i++) {
 | 
			
		||||
      ret = ret+in.rrrr[i];
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<>
 | 
			
		||||
  accelerator_inline Integer
 | 
			
		||||
  Reduce<Integer, GpuVectorI>::operator()(GpuVectorI in)
 | 
			
		||||
  {
 | 
			
		||||
    Integer ret = in.rrrr[0];
 | 
			
		||||
    for(int i=1;i<GpuVectorI::N;i++) {
 | 
			
		||||
      ret = ret+in.rrrr[i];
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}// End optimization
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  typedef GpuVectorRH  SIMD_Htype; // Single precision type
 | 
			
		||||
  typedef GpuVectorRF  SIMD_Ftype; // Single precision type
 | 
			
		||||
  typedef GpuVectorRD  SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef GpuVectorI   SIMD_Itype; // Integer type
 | 
			
		||||
 | 
			
		||||
  typedef GpuVectorCH  SIMD_CHtype; // Single precision type
 | 
			
		||||
  typedef GpuVectorCF  SIMD_CFtype; // Single precision type
 | 
			
		||||
  typedef GpuVectorCD  SIMD_CDtype; // Double precision type
 | 
			
		||||
 | 
			
		||||
  // prefetch utilities
 | 
			
		||||
  // Prefetch hook: the body is empty, so this does nothing for this backend.
  accelerator_inline void v_prefetch0(int size, const char *ptr)
  {
  }
 | 
			
		||||
  // Prefetch hook: the body is empty, so this does nothing for this backend.
  accelerator_inline void prefetch_HINT_T0(const char *ptr)
  {
  }
 | 
			
		||||
 | 
			
		||||
  // Function name aliases
 | 
			
		||||
  // Load / store / broadcast functors under their *SIMD names
  using VsplatSIMD  = Optimization::Vsplat;
  using VstoreSIMD  = Optimization::Vstore;
  using VsetSIMD    = Optimization::Vset;
  using VstreamSIMD = Optimization::Vstream;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

  // Arithmetic operations
  using SumSIMD          = Optimization::Sum;
  using SubSIMD          = Optimization::Sub;
  using DivSIMD          = Optimization::Div;
  using MultSIMD         = Optimization::Mult;
  using MultComplexSIMD  = Optimization::MultComplex;
  using MultRealPartSIMD = Optimization::MultRealPart;
  using MaddRealPartSIMD = Optimization::MaddRealPart;
  using ConjSIMD         = Optimization::Conj;
  using TimesMinusISIMD  = Optimization::TimesMinusI;
  using TimesISIMD       = Optimization::TimesI;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
#include <hip/hip_fp16.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
 | 
			
		||||
namespace Grid {
 | 
			
		||||
  typedef struct { uint16_t x;} half;
 | 
			
		||||
  typedef struct { half   x; half   y;} half2;
 | 
			
		||||
@@ -486,7 +486,7 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  struct TimesMinusI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
@@ -495,7 +495,7 @@ namespace Optimization {
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in,GpuVectorCD dummy){
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
@@ -508,7 +508,7 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
 | 
			
		||||
    accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
 | 
			
		||||
      typedef GpuVectorCF vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
@@ -517,7 +517,7 @@ namespace Optimization {
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in,GpuVectorCD dummy){
 | 
			
		||||
    accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
 | 
			
		||||
      typedef GpuVectorCD vec;
 | 
			
		||||
      vec ret;
 | 
			
		||||
      for(int i=0;i<vec::N;i++){
 | 
			
		||||
 
 | 
			
		||||
@@ -356,7 +356,7 @@ struct Conj{
 | 
			
		||||
  
 | 
			
		||||
struct TimesMinusI{
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline vector4double operator()(vector4double v, vector4double ret){
 | 
			
		||||
  inline vector4double operator()(vector4double v){
 | 
			
		||||
    return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
 | 
			
		||||
			 (vector4double){0., 0., 0., 0.});
 | 
			
		||||
  }
 | 
			
		||||
@@ -367,7 +367,7 @@ struct TimesMinusI{
 | 
			
		||||
  
 | 
			
		||||
struct TimesI{
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline vector4double operator()(vector4double v, vector4double ret){
 | 
			
		||||
  inline vector4double operator()(vector4double v){
 | 
			
		||||
    return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
 | 
			
		||||
			 (vector4double){0., 0., 0., 0.});
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -273,27 +273,25 @@ struct Conj{
 | 
			
		||||
 | 
			
		||||
struct TimesMinusI{
 | 
			
		||||
  //Complex single
 | 
			
		||||
  inline __m128 operator()(__m128 in, __m128 ret){
 | 
			
		||||
  inline __m128 operator()(__m128 in){
 | 
			
		||||
    __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
 | 
			
		||||
    return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
  }
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline __m128d operator()(__m128d in, __m128d ret){
 | 
			
		||||
  inline __m128d operator()(__m128d in){
 | 
			
		||||
    __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i
 | 
			
		||||
    return _mm_shuffle_pd(tmp,tmp,0x1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct TimesI{
 | 
			
		||||
  //Complex single
 | 
			
		||||
  inline __m128 operator()(__m128 in, __m128 ret){
 | 
			
		||||
  inline __m128 operator()(__m128 in){
 | 
			
		||||
    __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
    return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
 | 
			
		||||
  }
 | 
			
		||||
  //Complex double
 | 
			
		||||
  inline __m128d operator()(__m128d in, __m128d ret){
 | 
			
		||||
  inline __m128d operator()(__m128d in){
 | 
			
		||||
    __m128d tmp = _mm_shuffle_pd(in,in,0x1);
 | 
			
		||||
    return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -110,11 +110,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 | 
			
		||||
#ifdef GPU_VEC
 | 
			
		||||
#include "Grid_gpu_vec.h"
 | 
			
		||||
#endif
 | 
			
		||||
/*
 | 
			
		||||
#ifdef GEN
 | 
			
		||||
#include "Grid_generic.h"
 | 
			
		||||
 | 
			
		||||
#ifdef GPU_RRII
 | 
			
		||||
#include "Grid_gpu_rrii.h"
 | 
			
		||||
#endif
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
#ifdef GEN
 | 
			
		||||
  #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here
 | 
			
		||||
@@ -131,7 +130,6 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 | 
			
		||||
      #include "Grid_a64fx-fixedsize.h"
 | 
			
		||||
    #endif
 | 
			
		||||
  #else
 | 
			
		||||
    //#pragma message("building GEN") // generic
 | 
			
		||||
    #include "Grid_generic.h"
 | 
			
		||||
  #endif
 | 
			
		||||
#endif
 | 
			
		||||
@@ -270,12 +268,14 @@ public:
 | 
			
		||||
  typedef Vector_type vector_type;
 | 
			
		||||
  typedef Scalar_type scalar_type;
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
  typedef union conv_t_union {
 | 
			
		||||
    Vector_type v;
 | 
			
		||||
    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
 | 
			
		||||
    accelerator_inline conv_t_union(){};
 | 
			
		||||
  } conv_t;
 | 
			
		||||
 | 
			
		||||
  */
 | 
			
		||||
  
 | 
			
		||||
  Vector_type v;
 | 
			
		||||
 | 
			
		||||
  static accelerator_inline constexpr int Nsimd(void) {
 | 
			
		||||
@@ -555,15 +555,13 @@ public:
 | 
			
		||||
  template <class functor>
 | 
			
		||||
  friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
 | 
			
		||||
    Grid_simd ret;
 | 
			
		||||
    Grid_simd::conv_t conv;
 | 
			
		||||
    Grid_simd::scalar_type s;
 | 
			
		||||
 | 
			
		||||
    conv.v = v.v;
 | 
			
		||||
    for (int i = 0; i < Nsimd(); i++) {
 | 
			
		||||
      s = conv.s[i];
 | 
			
		||||
      conv.s[i] = func(s);
 | 
			
		||||
      s = v.getlane(i);
 | 
			
		||||
      s = func(s);
 | 
			
		||||
      ret.putlane(s,i);
 | 
			
		||||
    }
 | 
			
		||||
    ret.v = conv.v;
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  template <class functor>
 | 
			
		||||
@@ -571,18 +569,14 @@ public:
 | 
			
		||||
                                         const Grid_simd &x,
 | 
			
		||||
                                         const Grid_simd &y) {
 | 
			
		||||
    Grid_simd ret;
 | 
			
		||||
    Grid_simd::conv_t cx;
 | 
			
		||||
    Grid_simd::conv_t cy;
 | 
			
		||||
    Grid_simd::scalar_type sx,sy;
 | 
			
		||||
 | 
			
		||||
    cx.v = x.v;
 | 
			
		||||
    cy.v = y.v;
 | 
			
		||||
    for (int i = 0; i < Nsimd(); i++) {
 | 
			
		||||
      sx = cx.s[i];
 | 
			
		||||
      sy = cy.s[i];
 | 
			
		||||
      cx.s[i] = func(sx,sy);
 | 
			
		||||
      sx = x.getlane(i);
 | 
			
		||||
      sy = y.getlane(i);
 | 
			
		||||
      sx = func(sx,sy);
 | 
			
		||||
      ret.putlane(sx,i);
 | 
			
		||||
    }
 | 
			
		||||
    ret.v = cx.v;
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
@@ -645,15 +639,36 @@ public:
 | 
			
		||||
  ///////////////////////////////
 | 
			
		||||
  // Getting single lanes
 | 
			
		||||
  ///////////////////////////////
 | 
			
		||||
  accelerator_inline Scalar_type getlane(int lane) {
 | 
			
		||||
#ifdef GPU_RRII
 | 
			
		||||
  template <class S = Scalar_type,IfComplex<S> = 0>
 | 
			
		||||
  accelerator_inline Scalar_type getlane(int lane) const {
 | 
			
		||||
    return Scalar_type(v.rrrr[lane],v.iiii[lane]);
 | 
			
		||||
  }
 | 
			
		||||
  template <class S = Scalar_type,IfComplex<S> = 0>
 | 
			
		||||
  accelerator_inline void putlane(const Scalar_type &_S, int lane){
 | 
			
		||||
    v.rrrr[lane] = real(_S);
 | 
			
		||||
    v.iiii[lane] = imag(_S);
 | 
			
		||||
  }
 | 
			
		||||
  template <class S = Scalar_type,IfNotComplex<S> = 0>
 | 
			
		||||
  accelerator_inline Scalar_type getlane(int lane) const {
 | 
			
		||||
    return ((S*)&v)[lane];
 | 
			
		||||
  }
 | 
			
		||||
  template <class S = Scalar_type,IfNotComplex<S> = 0>
 | 
			
		||||
  accelerator_inline void putlane(const S &_S, int lane){
 | 
			
		||||
    ((Scalar_type*)&v)[lane] = _S;
 | 
			
		||||
  }
 | 
			
		||||
#else // Can pun to an array of complex
 | 
			
		||||
  // Non-RRII layout: read lane `lane` by viewing the vector storage as an
  // array of Scalar_type. The view is const-qualified because this is a
  // const member and the lane is only read.
  accelerator_inline Scalar_type getlane(int lane) const {
    return ((const Scalar_type*)&v)[lane];
  }
 | 
			
		||||
 | 
			
		||||
  // Non-RRII layout: write scalar S into lane `lane` by viewing the vector
  // storage as an array of Scalar_type.
  accelerator_inline void putlane(const Scalar_type &S, int lane){
    Scalar_type *lanes = (Scalar_type*)&v;
    lanes[lane] = S;
  }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
};  // end of Grid_simd class definition
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////
 | 
			
		||||
// Define available types
 | 
			
		||||
///////////////////////////////
 | 
			
		||||
@@ -663,7 +678,7 @@ typedef Grid_simd<double , SIMD_Dtype> vRealD;
 | 
			
		||||
typedef Grid_simd<Integer, SIMD_Itype> vInteger;
 | 
			
		||||
typedef Grid_simd<uint16_t,SIMD_Htype> vRealH;
 | 
			
		||||
 | 
			
		||||
#ifdef GPU_VEC
 | 
			
		||||
#if defined(GPU_VEC) || defined(GPU_RRII)
 | 
			
		||||
typedef Grid_simd<complex<uint16_t>, SIMD_CHtype> vComplexH;
 | 
			
		||||
typedef Grid_simd<complex<float>   , SIMD_CFtype> vComplexF;
 | 
			
		||||
typedef Grid_simd<complex<double>  , SIMD_CDtype> vComplexD;
 | 
			
		||||
@@ -763,6 +778,7 @@ accelerator_inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>,
 | 
			
		||||
}
 | 
			
		||||
//////////////////////////
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////
 | 
			
		||||
// Initialise to 1,0,i for the correct types
 | 
			
		||||
///////////////////////////////////////////////
 | 
			
		||||
@@ -907,34 +923,6 @@ accelerator_inline Grid_simd<S, V> fxmac(Grid_simd<S, V> a, Grid_simd<S, V> b, G
 | 
			
		||||
// ----------------------------------------------
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Distinguish between complex types and others
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
  typedef Grid_simd<S, V> simd;
 | 
			
		||||
 | 
			
		||||
  simd ret;
 | 
			
		||||
  simd den;
 | 
			
		||||
  typename simd::conv_t conv;
 | 
			
		||||
 | 
			
		||||
  ret = a * conjugate(b) ;
 | 
			
		||||
  den = b * conjugate(b) ;
 | 
			
		||||
 | 
			
		||||
  // duplicates real part
 | 
			
		||||
  auto real_den  = toReal(den);
 | 
			
		||||
  simd zden;
 | 
			
		||||
  memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
 | 
			
		||||
  ret.v=binary<V>(ret.v, zden.v, DivSIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Real/Integer types
 | 
			
		||||
template <class S, class V, IfNotComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  ret.v = binary<V>(a.v, b.v, DivSIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
///////////////////////
 | 
			
		||||
// Conjugate
 | 
			
		||||
///////////////////////
 | 
			
		||||
@@ -959,30 +947,29 @@ accelerator_inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
 | 
			
		||||
///////////////////////
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
accelerator_inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
 | 
			
		||||
  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
 | 
			
		||||
  ret.v = unary<V>(in.v, TimesMinusISIMD());
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  timesMinusI(ret, in);
 | 
			
		||||
  ret.v=unary<V>(in.v, TimesMinusISIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V, IfNotComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
 | 
			
		||||
  return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////
 | 
			
		||||
// timesI
 | 
			
		||||
///////////////////////
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
accelerator_inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
 | 
			
		||||
  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
 | 
			
		||||
  ret.v = unary<V>(in.v, TimesISIMD());
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  timesI(ret, in);
 | 
			
		||||
  ret.v= unary<V>(in.v, TimesISIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V, IfNotComplex<S> = 0>
 | 
			
		||||
@@ -990,6 +977,35 @@ accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
 | 
			
		||||
  return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Distinguish between complex types and others
 | 
			
		||||
template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
  typedef Grid_simd<S, V> simd;
 | 
			
		||||
 | 
			
		||||
  simd ret;
 | 
			
		||||
  simd den;
 | 
			
		||||
 | 
			
		||||
  ret = a * conjugate(b) ;
 | 
			
		||||
  den = b * conjugate(b) ;
 | 
			
		||||
 | 
			
		||||
  // duplicates real part
 | 
			
		||||
  auto real_den  = toReal(den);
 | 
			
		||||
  simd zden;
 | 
			
		||||
  memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
 | 
			
		||||
  ret.v=binary<V>(ret.v, zden.v, DivSIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Real/Integer types
 | 
			
		||||
template <class S, class V, IfNotComplex<S> = 0>
 | 
			
		||||
accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  ret.v = binary<V>(a.v, b.v, DivSIMD());
 | 
			
		||||
  return ret;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/////////////////////
 | 
			
		||||
// Inner, outer
 | 
			
		||||
/////////////////////
 | 
			
		||||
@@ -1021,12 +1037,12 @@ template <class Csimd>  // must be a real arg
 | 
			
		||||
accelerator_inline typename toRealMapper<Csimd>::Realified toReal(const Csimd &in) {
 | 
			
		||||
  typedef typename toRealMapper<Csimd>::Realified Rsimd;
 | 
			
		||||
  Rsimd ret;
 | 
			
		||||
  typename Rsimd::conv_t conv;
 | 
			
		||||
  memcpy((void *)&conv.v,(void *)&in.v,sizeof(conv.v));
 | 
			
		||||
  int j=0;
 | 
			
		||||
  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
 | 
			
		||||
    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
 | 
			
		||||
    auto s = real(in.getlane(j++));
 | 
			
		||||
    ret.putlane(s,i);
 | 
			
		||||
    ret.putlane(s,i+1);
 | 
			
		||||
  }
 | 
			
		||||
  memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v));
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -1039,18 +1055,19 @@ template <class Rsimd>  // must be a real arg
 | 
			
		||||
accelerator_inline typename toComplexMapper<Rsimd>::Complexified toComplex(const Rsimd &in) {
 | 
			
		||||
 | 
			
		||||
  typedef typename toComplexMapper<Rsimd>::Complexified   Csimd;
 | 
			
		||||
  typename Rsimd::conv_t conv;  // address as real
 | 
			
		||||
 | 
			
		||||
  conv.v = in.v;
 | 
			
		||||
  typedef typename Csimd::scalar_type scalar_type;
 | 
			
		||||
  int j=0;
 | 
			
		||||
  Csimd ret;
 | 
			
		||||
  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
 | 
			
		||||
    assert(conv.s[i + 1] == conv.s[i]);
 | 
			
		||||
    auto rr = in.getlane(i);
 | 
			
		||||
    auto ri = in.getlane(i+1);
 | 
			
		||||
    assert(rr==ri);
 | 
			
		||||
    // trap any cases where real was not duplicated
 | 
			
		||||
    // indicating the SIMD grids of real and imag assignment did not correctly
 | 
			
		||||
    // match
 | 
			
		||||
    conv.s[i + 1] = 0.0;  // zero imaginary parts
 | 
			
		||||
    scalar_type s(rr,0.0);
 | 
			
		||||
    ret.putlane(s,j++);
 | 
			
		||||
  }
 | 
			
		||||
  Csimd ret;
 | 
			
		||||
  memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v));
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -1146,6 +1163,27 @@ template <> struct is_simd<vInteger>   : public std::true_type {};
 | 
			
		||||
template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
 | 
			
		||||
template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////
 | 
			
		||||
// Convenience insert / extract with complex support
 | 
			
		||||
///////////////////////////////////////////////
 | 
			
		||||
template <class S, class V>
 | 
			
		||||
accelerator_inline S getlane(const Grid_simd<S, V> &in,int lane) {
 | 
			
		||||
  return in.getlane(lane);
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V>
 | 
			
		||||
accelerator_inline void putlane(Grid_simd<S, V> &vec,const S &_S, int lane){
 | 
			
		||||
  vec.putlane(_S,lane);
 | 
			
		||||
}
 | 
			
		||||
template <class S,IfNotSimd<S> = 0 >
 | 
			
		||||
accelerator_inline S getlane(const S &in,int lane) {
 | 
			
		||||
  return in;
 | 
			
		||||
}
 | 
			
		||||
template <class S,IfNotSimd<S> = 0 >
 | 
			
		||||
accelerator_inline void putlane(S &vec,const S &_S, int lane){
 | 
			
		||||
  vec = _S;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -69,6 +69,7 @@ typedef RealF  Real;
 | 
			
		||||
typedef thrust::complex<RealF> ComplexF;
 | 
			
		||||
typedef thrust::complex<RealD> ComplexD;
 | 
			
		||||
typedef thrust::complex<Real>  Complex;
 | 
			
		||||
typedef thrust::complex<uint16_t>  ComplexH;
 | 
			
		||||
template<class T> using complex = thrust::complex<T>;
 | 
			
		||||
 | 
			
		||||
accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(thrust::pow(r,(double)y)); }
 | 
			
		||||
@@ -77,6 +78,7 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(thrust::pow(r
 | 
			
		||||
typedef std::complex<RealF> ComplexF;
 | 
			
		||||
typedef std::complex<RealD> ComplexD;
 | 
			
		||||
typedef std::complex<Real>  Complex;
 | 
			
		||||
typedef std::complex<uint16_t>  ComplexH; // Hack
 | 
			
		||||
template<class T> using complex = std::complex<T>;
 | 
			
		||||
 | 
			
		||||
accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(std::pow(r,y)); }
 | 
			
		||||
 
 | 
			
		||||
@@ -216,7 +216,6 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  typedef typename cobj::vector_type vector_type;
 | 
			
		||||
  typedef typename cobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename cobj::scalar_object scalar_object;
 | 
			
		||||
  typedef const CartesianStencilView<vobj,cobj,Parameters> View_type;
 | 
			
		||||
  typedef typename View_type::StencilVector StencilVector;
 | 
			
		||||
@@ -1014,7 +1013,6 @@ public:
 | 
			
		||||
  int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point)
 | 
			
		||||
  {
 | 
			
		||||
    typedef typename cobj::vector_type vector_type;
 | 
			
		||||
    typedef typename cobj::scalar_type scalar_type;
 | 
			
		||||
 | 
			
		||||
    int comms_send   = this->_comms_send[point] ;
 | 
			
		||||
    int comms_recv   = this->_comms_recv[point] ;
 | 
			
		||||
 
 | 
			
		||||
@@ -178,6 +178,7 @@ public:
 | 
			
		||||
    stream << "S {" << o._internal << "}";
 | 
			
		||||
    return stream;
 | 
			
		||||
  };
 | 
			
		||||
  // FIXME These will break with change of data layout
 | 
			
		||||
  strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(&_internal); }
 | 
			
		||||
  strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(&_internal); }
 | 
			
		||||
  strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
 | 
			
		||||
@@ -288,6 +289,7 @@ public:
 | 
			
		||||
  //      return _internal[i];
 | 
			
		||||
  //    }
 | 
			
		||||
 | 
			
		||||
  // FIXME These will break with change of data layout
 | 
			
		||||
  strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal); }
 | 
			
		||||
  strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(_internal); }
 | 
			
		||||
  strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
 | 
			
		||||
@@ -430,6 +432,7 @@ public:
 | 
			
		||||
  //    return _internal[i][j];
 | 
			
		||||
  //  }
 | 
			
		||||
 | 
			
		||||
  // FIXME These will break with change of data layout
 | 
			
		||||
  strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal[0]); }
 | 
			
		||||
  strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(_internal[0]); }
 | 
			
		||||
  strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
n
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/tensors/Tensor_extract_merge.h
 | 
			
		||||
@@ -62,8 +62,18 @@ void extract(const vobj &vec,ExtractBuffer<sobj> &extracted)
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  const int Nsimd=vector_type::Nsimd();
 | 
			
		||||
  const int Nextr=extracted.size();
 | 
			
		||||
  vector_type * vp = (vector_type *)&vec;
 | 
			
		||||
  const int s=Nsimd/Nextr;
 | 
			
		||||
  sobj_scalar_type *sp = (sobj_scalar_type *) &extracted[0];
 | 
			
		||||
  sobj_scalar_type stmp;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    for(int i=0;i<Nextr;i++){
 | 
			
		||||
      stmp = vp[w].getlane(i*s);
 | 
			
		||||
      sp[i*words+w] =stmp;
 | 
			
		||||
      //      memcpy((char *)&sp[i*words+w],(char *)&stmp,sizeof(stmp));
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  /*
 | 
			
		||||
  scalar_type *vp = (scalar_type *)&vec;
 | 
			
		||||
  scalar_type      vtmp;
 | 
			
		||||
  sobj_scalar_type stmp;
 | 
			
		||||
@@ -74,6 +84,8 @@ void extract(const vobj &vec,ExtractBuffer<sobj> &extracted)
 | 
			
		||||
      memcpy((char *)&sp[i*words+w],(char *)&stmp,sizeof(stmp));
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  */
 | 
			
		||||
  
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -93,7 +105,7 @@ void   merge(vobj &vec,ExtractBuffer<sobj> &extracted)
 | 
			
		||||
  const int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  sobj_scalar_type *sp = (sobj_scalar_type *)&extracted[0];
 | 
			
		||||
  scalar_type *vp = (scalar_type *)&vec;
 | 
			
		||||
  vector_type *vp = (vector_type *)&vec;
 | 
			
		||||
  scalar_type      vtmp;
 | 
			
		||||
  sobj_scalar_type stmp;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
@@ -101,7 +113,8 @@ void   merge(vobj &vec,ExtractBuffer<sobj> &extracted)
 | 
			
		||||
      for(int ii=0;ii<s;ii++){
 | 
			
		||||
	memcpy((char *)&stmp,(char *)&sp[i*words+w],sizeof(stmp));
 | 
			
		||||
	vtmp = stmp;
 | 
			
		||||
	memcpy((char *)&vp[w*Nsimd+i*s+ii],(char *)&vtmp,sizeof(vtmp));
 | 
			
		||||
	vp[w].putlane(vtmp,i*s+ii);
 | 
			
		||||
	//	memcpy((char *)&vp[w*Nsimd+i*s+ii],(char *)&vtmp,sizeof(vtmp));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
@@ -124,9 +137,9 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec
 | 
			
		||||
 | 
			
		||||
  scalar_object extracted;
 | 
			
		||||
  pointer __restrict__  sp = (pointer)&extracted; // Type pun
 | 
			
		||||
  pointer __restrict__  vp = (pointer)&vec;
 | 
			
		||||
  vector_type *vp = (vector_type *)&vec;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    sp[w]=vp[w*Nsimd+lane];
 | 
			
		||||
    sp[w]=vp[w].getlane(lane);
 | 
			
		||||
  }
 | 
			
		||||
  return extracted;
 | 
			
		||||
}
 | 
			
		||||
@@ -143,9 +156,9 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
 | 
			
		||||
  constexpr int Nsimd=vector_type::Nsimd();
 | 
			
		||||
 | 
			
		||||
  pointer __restrict__ sp = (pointer)&extracted;
 | 
			
		||||
  pointer __restrict__ vp = (pointer)&vec;
 | 
			
		||||
  vector_type *vp = (vector_type *)&vec;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    vp[w*Nsimd+lane]=sp[w];
 | 
			
		||||
    vp[w].putlane(sp[w],lane);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -164,15 +177,13 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
 | 
			
		||||
  const int Nextr=extracted.size();
 | 
			
		||||
  const int s = Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&vec;
 | 
			
		||||
  vector_type * vp = (vector_type *)&vec;
 | 
			
		||||
  scalar_type      vtmp;
 | 
			
		||||
  sobj_scalar_type stmp;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    for(int i=0;i<Nextr;i++){
 | 
			
		||||
      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
 | 
			
		||||
      memcpy((char *)&vtmp,(char *)&vp[w*Nsimd+i*s],sizeof(vtmp));
 | 
			
		||||
      stmp = vtmp;
 | 
			
		||||
      memcpy((char *)&pointer[w],(char *)&stmp,sizeof(stmp)); // may do a precision conversion
 | 
			
		||||
      pointer[w] = vp[w].getlane(i*s);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -192,23 +203,21 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
 | 
			
		||||
  const int Nextr=extracted.size();
 | 
			
		||||
  const int s = Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&vec;
 | 
			
		||||
  vector_type * vp = (vector_type *)&vec;
 | 
			
		||||
  scalar_type      vtmp;
 | 
			
		||||
  sobj_scalar_type stmp;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    for(int i=0;i<Nextr;i++){
 | 
			
		||||
      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
 | 
			
		||||
      for(int ii=0;ii<s;ii++){
 | 
			
		||||
	memcpy((char *)&stmp,(char *)&pointer[w],sizeof(stmp));
 | 
			
		||||
	vtmp=stmp;
 | 
			
		||||
	memcpy((char *)&vp[w*Nsimd+i*s+ii],(char *)&vtmp,sizeof(vtmp));
 | 
			
		||||
	vtmp=pointer[w];
 | 
			
		||||
	vp[w].putlane(vtmp,i*s+ii);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//Copy a single lane of a SIMD tensor type from one object to another
 | 
			
		||||
//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
 | 
			
		||||
@@ -239,12 +248,12 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
 | 
			
		||||
  iscalar_type itmp;
 | 
			
		||||
  oscalar_type otmp;
 | 
			
		||||
 | 
			
		||||
  opointer __restrict__  op = (opointer)&vecOut;
 | 
			
		||||
  ipointer __restrict__  ip = (ipointer)&vecIn;
 | 
			
		||||
  ovector_type * __restrict__ op = (ovector_type *)&vecOut;
 | 
			
		||||
  ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
 | 
			
		||||
  for(int w=0;w<owords;w++){
 | 
			
		||||
    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
 | 
			
		||||
    itmp = ip[iNsimd*w].getlane(lane_in);
 | 
			
		||||
    otmp = itmp; //potential precision change
 | 
			
		||||
    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
 | 
			
		||||
    op[oNsimd*w].putlane(otmp,lane_out);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										84
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										84
									
								
								configure.ac
									
									
									
									
									
								
							@@ -1,5 +1,5 @@
 | 
			
		||||
AC_PREREQ([2.63])
 | 
			
		||||
AC_INIT([Grid], [0.7.0], [https://github.com/paboyle/Grid], [Grid])
 | 
			
		||||
AC_PREREQ([2.71])
 | 
			
		||||
AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid])
 | 
			
		||||
AC_CANONICAL_BUILD
 | 
			
		||||
AC_CANONICAL_HOST
 | 
			
		||||
AC_CANONICAL_TARGET
 | 
			
		||||
@@ -20,7 +20,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 | 
			
		||||
 | 
			
		||||
################ Set flags
 | 
			
		||||
# do not move!
 | 
			
		||||
AC_ARG_ENABLE([debug],[AC_HELP_STRING([--enable-debug=yes|no], [enable debug compilation ])],
 | 
			
		||||
AC_ARG_ENABLE([debug],[AS_HELP_STRING([--enable-debug=yes|no],[enable debug compilation ])],
 | 
			
		||||
    [ac_DEBUG=${enable_debug}], [ac_DEBUG=no])
 | 
			
		||||
case ${ac_DEBUG} in
 | 
			
		||||
    yes)
 | 
			
		||||
@@ -114,7 +114,7 @@ AC_ARG_WITH([openssl],
 | 
			
		||||
 | 
			
		||||
############### lapack
 | 
			
		||||
AC_ARG_ENABLE([lapack],
 | 
			
		||||
    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-lapack=yes|no|prefix],[enable LAPACK])],
 | 
			
		||||
    [ac_LAPACK=${enable_lapack}], [ac_LAPACK=no])
 | 
			
		||||
 | 
			
		||||
case ${ac_LAPACK} in
 | 
			
		||||
@@ -130,7 +130,7 @@ esac
 | 
			
		||||
 | 
			
		||||
############### tracing
 | 
			
		||||
AC_ARG_ENABLE([tracing],
 | 
			
		||||
    [AC_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer], [enable tracing])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
 | 
			
		||||
    [ac_TRACING=${enable_tracing}], [ac_TRACING=none])
 | 
			
		||||
 | 
			
		||||
case ${ac_TRACING} in
 | 
			
		||||
@@ -150,19 +150,19 @@ esac
 | 
			
		||||
 | 
			
		||||
############### fermions
 | 
			
		||||
AC_ARG_ENABLE([fermion-reps],
 | 
			
		||||
     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
 | 
			
		||||
     [AS_HELP_STRING([--enable-fermion-reps=yes|no],[enable extra fermion representation support])],
 | 
			
		||||
     [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
 | 
			
		||||
 | 
			
		||||
AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" == "yesX" ])
 | 
			
		||||
 | 
			
		||||
AC_ARG_ENABLE([gparity],
 | 
			
		||||
     [AC_HELP_STRING([--enable-gparity=yes|no], [enable G-parity support])],
 | 
			
		||||
     [AS_HELP_STRING([--enable-gparity=yes|no],[enable G-parity support])],
 | 
			
		||||
     [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes])
 | 
			
		||||
 | 
			
		||||
AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" == "yesX" ])
 | 
			
		||||
 | 
			
		||||
AC_ARG_ENABLE([zmobius],
 | 
			
		||||
     [AC_HELP_STRING([--enable-zmobius=yes|no], [enable Zmobius support])],
 | 
			
		||||
     [AS_HELP_STRING([--enable-zmobius=yes|no],[enable Zmobius support])],
 | 
			
		||||
     [ac_ZMOBIUS=${enable_zmobius}], [ac_ZMOBIUS=yes])
 | 
			
		||||
 | 
			
		||||
AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ])
 | 
			
		||||
@@ -179,7 +179,7 @@ case ${ac_ZMOBIUS} in
 | 
			
		||||
esac
 | 
			
		||||
############### Nc
 | 
			
		||||
AC_ARG_ENABLE([Nc],
 | 
			
		||||
    [AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-Nc=2|3|4|5],[enable number of colours])],
 | 
			
		||||
    [ac_Nc=${enable_Nc}], [ac_Nc=3])
 | 
			
		||||
 | 
			
		||||
case ${ac_Nc} in
 | 
			
		||||
@@ -197,7 +197,7 @@ esac
 | 
			
		||||
 | 
			
		||||
############### FP16 conversions
 | 
			
		||||
AC_ARG_ENABLE([sfw-fp16],
 | 
			
		||||
    [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-sfw-fp16=yes|no],[enable software fp16 comms])],
 | 
			
		||||
    [ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes])
 | 
			
		||||
case ${ac_SFW_FP16} in
 | 
			
		||||
    yes)
 | 
			
		||||
@@ -209,11 +209,11 @@ esac
 | 
			
		||||
 | 
			
		||||
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
 | 
			
		||||
AC_ARG_ENABLE([accelerator-cshift],
 | 
			
		||||
    [AC_HELP_STRING([--enable-accelerator-cshift=yes|no], [run cshift on the device])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
 | 
			
		||||
    [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
 | 
			
		||||
 | 
			
		||||
AC_ARG_ENABLE([ucx-buggy],
 | 
			
		||||
    [AC_HELP_STRING([--enable-ucx-buggy=yes|no], [enable workaround for UCX device buffer bugs])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
 | 
			
		||||
    [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
 | 
			
		||||
 | 
			
		||||
case ${ac_UCXBUGGY} in
 | 
			
		||||
@@ -231,7 +231,7 @@ esac
 | 
			
		||||
 | 
			
		||||
############### SYCL/CUDA/HIP/none
 | 
			
		||||
AC_ARG_ENABLE([accelerator],
 | 
			
		||||
    [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
 | 
			
		||||
    [ac_ACCELERATOR=${enable_accelerator}], [ac_ACCELERATOR=none])
 | 
			
		||||
case ${ac_ACCELERATOR} in
 | 
			
		||||
    cuda)
 | 
			
		||||
@@ -254,7 +254,7 @@ esac
 | 
			
		||||
 | 
			
		||||
############### UNIFIED MEMORY
 | 
			
		||||
AC_ARG_ENABLE([unified],
 | 
			
		||||
    [AC_HELP_STRING([--enable-unified=yes|no], [enable unified address space for accelerator loops])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-unified=yes|no],[enable unified address space for accelerator loops])],
 | 
			
		||||
    [ac_UNIFIED=${enable_unified}], [ac_UNIFIED=yes])
 | 
			
		||||
case ${ac_UNIFIED} in
 | 
			
		||||
    yes)
 | 
			
		||||
@@ -268,10 +268,10 @@ esac
 | 
			
		||||
 | 
			
		||||
############### Intel libraries
 | 
			
		||||
AC_ARG_ENABLE([mkl],
 | 
			
		||||
    [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-mkl=yes|no|prefix],[enable Intel MKL for LAPACK & FFTW])],
 | 
			
		||||
    [ac_MKL=${enable_mkl}], [ac_MKL=no])
 | 
			
		||||
AC_ARG_ENABLE([ipp],
 | 
			
		||||
    [AC_HELP_STRING([--enable-ipp=yes|no|prefix], [enable Intel IPP for fast CRC32C])],
 | 
			
		||||
    [AS_HELP_STRING([--enable-ipp=yes|no|prefix],[enable Intel IPP for fast CRC32C])],
 | 
			
		||||
    [ac_IPP=${enable_ipp}], [ac_IPP=no])
 | 
			
		||||
 | 
			
		||||
case ${ac_MKL} in
 | 
			
		||||
@@ -369,8 +369,7 @@ CXXFLAGS=$CXXFLAGS_CPY
 | 
			
		||||
LDFLAGS=$LDFLAGS_CPY
 | 
			
		||||
 | 
			
		||||
############### SIMD instruction selection
 | 
			
		||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
 | 
			
		||||
              [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
 | 
			
		||||
AC_ARG_ENABLE([simd],[AS_HELP_STRING([--enable-simd=code],[select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
 | 
			
		||||
 | 
			
		||||
AC_ARG_ENABLE([gen-simd-width],
 | 
			
		||||
            [AS_HELP_STRING([--enable-gen-simd-width=size],
 | 
			
		||||
@@ -435,7 +434,13 @@ case ${ax_cv_cxx_compiler_vendor} in
 | 
			
		||||
  clang|gnu)
 | 
			
		||||
    case ${ac_SIMD} in
 | 
			
		||||
      GPU)
 | 
			
		||||
        AC_DEFINE([GPU_VEC],[1],[GPU vectorised 512bit])
 | 
			
		||||
        AC_DEFINE([GPU_VEC],[1],[GPU vectorised])
 | 
			
		||||
        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
 | 
			
		||||
                           [generic SIMD vector width (in bytes)])
 | 
			
		||||
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
 | 
			
		||||
	SIMD_FLAGS='';;
 | 
			
		||||
      GPU-RRII)
 | 
			
		||||
        AC_DEFINE([GPU_RRII],[1],[GPU vectorised with RRRR / IIII layout])
 | 
			
		||||
        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
 | 
			
		||||
                           [generic SIMD vector width (in bytes)])
 | 
			
		||||
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
 | 
			
		||||
@@ -504,6 +509,12 @@ case ${ax_cv_cxx_compiler_vendor} in
 | 
			
		||||
      GPU)
 | 
			
		||||
        AC_DEFINE([GPU_VEC],[1],[GPU vectorised ])
 | 
			
		||||
	SIMD_FLAGS='';;
 | 
			
		||||
      GPU-RRII)
 | 
			
		||||
        AC_DEFINE([GPU_RRII],[1],[GPU vectorised with RRRR / IIII layout])
 | 
			
		||||
        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
 | 
			
		||||
                           [generic SIMD vector width (in bytes)])
 | 
			
		||||
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
 | 
			
		||||
	SIMD_FLAGS='';;
 | 
			
		||||
      SSE4)
 | 
			
		||||
        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
 | 
			
		||||
        SIMD_FLAGS='-msse4.2 -xsse4.2';;
 | 
			
		||||
@@ -551,8 +562,7 @@ AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE]
 | 
			
		||||
#########################################################
 | 
			
		||||
######################  GRID ALLOCATOR ALIGNMENT ##
 | 
			
		||||
#########################################################
 | 
			
		||||
AC_ARG_ENABLE([alloc-align],[AC_HELP_STRING([--enable-alloc-align=2MB|4k],
 | 
			
		||||
              [Alignment in bytes of GRID Allocator ])],[ac_ALLOC_ALIGN=${enable_alloc_align}],[ac_ALLOC_ALIGN=2MB])
 | 
			
		||||
AC_ARG_ENABLE([alloc-align],[AS_HELP_STRING([--enable-alloc-align=2MB|4k],[Alignment in bytes of GRID Allocator ])],[ac_ALLOC_ALIGN=${enable_alloc_align}],[ac_ALLOC_ALIGN=2MB])
 | 
			
		||||
case ${ac_ALLOC_ALIGN} in
 | 
			
		||||
    4k)
 | 
			
		||||
     AC_DEFINE([GRID_ALLOC_ALIGN],[(4096)],[GRID_ALLOC_ALIGN]);;
 | 
			
		||||
@@ -561,8 +571,7 @@ case ${ac_ALLOC_ALIGN} in
 | 
			
		||||
    *);;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
AC_ARG_ENABLE([alloc-cache],[AC_HELP_STRING([--enable-alloc-cache ],
 | 
			
		||||
              [Cache a pool of recent "frees" to reuse])],[ac_ALLOC_CACHE=${enable_alloc_cache}],[ac_ALLOC_CACHE=yes])
 | 
			
		||||
AC_ARG_ENABLE([alloc-cache],[AS_HELP_STRING([--enable-alloc-cache ],[Cache a pool of recent "frees" to reuse])],[ac_ALLOC_CACHE=${enable_alloc_cache}],[ac_ALLOC_CACHE=yes])
 | 
			
		||||
case ${ac_ALLOC_CACHE} in
 | 
			
		||||
    yes)
 | 
			
		||||
     AC_DEFINE([ALLOCATION_CACHE],[1],[ALLOCATION_CACHE]);;
 | 
			
		||||
@@ -573,8 +582,7 @@ esac
 | 
			
		||||
#########################################################
 | 
			
		||||
######################  set GPU device to rank in node ##
 | 
			
		||||
#########################################################
 | 
			
		||||
AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice],
 | 
			
		||||
              [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no])
 | 
			
		||||
# --enable-setdevice / --disable-setdevice: Autoconf stores the option value in
# the lower-case shell variable enable_setdevice, so that is the one to read.
AC_ARG_ENABLE([setdevice],[AS_HELP_STRING([--enable-setdevice | --disable-setdevice],[Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_setdevice}],[ac_SETDEVICE=no])
 | 
			
		||||
case ${ac_SETDEVICE} in
 | 
			
		||||
    yes)
 | 
			
		||||
	echo ENABLE SET DEVICE
 | 
			
		||||
@@ -588,8 +596,7 @@ esac
 | 
			
		||||
#########################################################
 | 
			
		||||
######################  Shared memory intranode #########
 | 
			
		||||
#########################################################
 | 
			
		||||
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no|none],
 | 
			
		||||
              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])
 | 
			
		||||
AC_ARG_ENABLE([shm],[AS_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no|none],[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])
 | 
			
		||||
 | 
			
		||||
case ${ac_SHM} in
 | 
			
		||||
 | 
			
		||||
@@ -626,15 +633,13 @@ case ${ac_SHM} in
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
######################  Shared base path for SHMMMAP
 | 
			
		||||
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
 | 
			
		||||
              [Select SHM mmap base path for hugetlbfs])],
 | 
			
		||||
AC_ARG_ENABLE([shmpath],[AS_HELP_STRING([--enable-shmpath=path],[Select SHM mmap base path for hugetlbfs])],
 | 
			
		||||
	      [ac_SHMPATH=${enable_shmpath}],
 | 
			
		||||
	      [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
 | 
			
		||||
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
 | 
			
		||||
 | 
			
		||||
############### force MPI in SMP
 | 
			
		||||
AC_ARG_ENABLE([shm-force-mpi],[AC_HELP_STRING([--enable-shm-force-mpi],
 | 
			
		||||
              [Force MPI within shared memory])],[ac_SHM_FORCE_MPI=${enable_shm_force_mpi}],[ac_SHM_FORCE_MPI=no])
 | 
			
		||||
AC_ARG_ENABLE([shm-force-mpi],[AS_HELP_STRING([--enable-shm-force-mpi],[Force MPI within shared memory])],[ac_SHM_FORCE_MPI=${enable_shm_force_mpi}],[ac_SHM_FORCE_MPI=no])
 | 
			
		||||
case ${ac_SHM_FORCE_MPI} in
 | 
			
		||||
     yes)
 | 
			
		||||
        AC_DEFINE([GRID_SHM_FORCE_MPI],[1],[GRID_SHM_FORCE_MPI] )
 | 
			
		||||
@@ -643,8 +648,7 @@ case ${ac_SHM_FORCE_MPI} in
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
############### communication type selection
 | 
			
		||||
AC_ARG_ENABLE([comms-threads],[AC_HELP_STRING([--enable-comms-threads | --disable-comms-threads],
 | 
			
		||||
              [Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
 | 
			
		||||
AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
 | 
			
		||||
 | 
			
		||||
case ${ac_COMMS_THREADS} in
 | 
			
		||||
     yes)
 | 
			
		||||
@@ -654,8 +658,7 @@ case ${ac_COMMS_THREADS} in
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
############### communication type selection
 | 
			
		||||
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto],
 | 
			
		||||
              [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
 | 
			
		||||
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
case ${ac_COMMS} in
 | 
			
		||||
@@ -689,8 +692,8 @@ AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
 | 
			
		||||
# Use the POSIX string-equality operator; a double equals sign is a bash extension.
AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" = "noneX" ])
 | 
			
		||||
 | 
			
		||||
############### RNG selection
 | 
			
		||||
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\
 | 
			
		||||
	            [Select Random Number Generator to be used])],\
 | 
			
		||||
AC_ARG_ENABLE([rng],[AS_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],[\
 | 
			
		||||
	            Select Random Number Generator to be used])],\
 | 
			
		||||
	            [ac_RNG=${enable_rng}],[ac_RNG=sitmo])
 | 
			
		||||
 | 
			
		||||
case ${ac_RNG} in
 | 
			
		||||
@@ -709,8 +712,8 @@ case ${ac_RNG} in
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
############### Timer option
 | 
			
		||||
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
 | 
			
		||||
	            [Enable system dependent high res timers])],\
 | 
			
		||||
AC_ARG_ENABLE([timers],[AS_HELP_STRING([--enable-timers],[\
 | 
			
		||||
	            Enable system dependent high res timers])],\
 | 
			
		||||
	            [ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
 | 
			
		||||
 | 
			
		||||
case ${ac_TIMERS} in
 | 
			
		||||
@@ -726,8 +729,7 @@ case ${ac_TIMERS} in
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
############### Chroma regression test
 | 
			
		||||
AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],
 | 
			
		||||
              [Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
 | 
			
		||||
# Honour --disable-chroma and --enable-chroma=no: only an explicit "no" turns the
# option off, any other supplied value keeps the previous behaviour of enabling it.
AC_ARG_ENABLE([chroma],[AS_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],[AS_IF([test "x${enable_chroma}" = "xno"],[ac_CHROMA=no],[ac_CHROMA=yes])],[ac_CHROMA=no])
 | 
			
		||||
 | 
			
		||||
case ${ac_CHROMA} in
 | 
			
		||||
     yes|no)
 | 
			
		||||
 
 | 
			
		||||
@@ -1 +1,3 @@
 | 
			
		||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
 | 
			
		||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
 | 
			
		||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU-RRII --enable-comms=mpi
 | 
			
		||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU --enable-debug --enable-comms=mpi
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user