mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge branch 'master' into hadrons
This commit is contained in:
		
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							@@ -47,6 +47,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 | 
			
		||||
#define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
 | 
			
		||||
 | 
			
		||||
#define RotateBit (0x100)
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef uint32_t Integer;
 | 
			
		||||
 
 | 
			
		||||
@@ -321,6 +321,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
	int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
	int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
 | 
			
		||||
	int rotate_dim      = _grid->_simd_layout[dimension]>2;
 | 
			
		||||
 | 
			
		||||
	assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
 | 
			
		||||
 | 
			
		||||
	int sshift[2];
 | 
			
		||||
	
 | 
			
		||||
@@ -368,7 +371,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      int rd = _grid->_rdimensions[dimension];
 | 
			
		||||
      int ld = _grid->_ldimensions[dimension];
 | 
			
		||||
      int gd = _grid->_gdimensions[dimension];
 | 
			
		||||
      
 | 
			
		||||
      int ly = _grid->_simd_layout[dimension];
 | 
			
		||||
 | 
			
		||||
      // Map to always positive shift modulo global full dimension.
 | 
			
		||||
      int shift = (shiftpm+fd)%fd;
 | 
			
		||||
      
 | 
			
		||||
@@ -398,7 +402,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	  int wrap = sshift/rd;
 | 
			
		||||
	  int  num = sshift%rd;
 | 
			
		||||
	  if ( x< rd-num ) permute_slice=wrap;
 | 
			
		||||
	  else permute_slice = 1-wrap;
 | 
			
		||||
	  else permute_slice = (wrap+1)%ly;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
  	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
 | 
			
		||||
@@ -418,7 +422,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
      int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
      
 | 
			
		||||
      //      assert(simd_layout==1); // Why?
 | 
			
		||||
      assert(comm_dim==1);
 | 
			
		||||
      int shift = (shiftpm + fd) %fd;
 | 
			
		||||
      assert(shift>=0);
 | 
			
		||||
@@ -529,7 +532,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	      _entries[point][lo+o+b]._around_the_world=wrap;
 | 
			
		||||
	    }
 | 
			
		||||
	    
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	  o +=_grid->_slice_stride[dimension];
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
@@ -591,8 +594,11 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      template<class compressor>
 | 
			
		||||
      void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
      {
 | 
			
		||||
	auto thr = HaloExchangeBegin(source,compress);
 | 
			
		||||
        HaloExchangeComplete(thr);
 | 
			
		||||
	Mergers.resize(0); 
 | 
			
		||||
	Packets.resize(0);
 | 
			
		||||
	HaloGather(source,compress);
 | 
			
		||||
	Communicate();
 | 
			
		||||
	CommsMerge();
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      void HaloExchangeComplete(std::thread &thr) 
 | 
			
		||||
@@ -749,6 +755,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	  int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
 | 
			
		||||
	  assert(comm_dim==1);
 | 
			
		||||
	  // This will not work with a rotate dim
 | 
			
		||||
	  assert(simd_layout==2);
 | 
			
		||||
	  assert(shift>=0);
 | 
			
		||||
	  assert(shift<fd);
 | 
			
		||||
@@ -793,7 +800,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	      gathermtime+=usecond();
 | 
			
		||||
 | 
			
		||||
	      for(int i=0;i<Nsimd;i++){
 | 
			
		||||
 | 
			
		||||
		
 | 
			
		||||
		// FIXME 
 | 
			
		||||
		// This logic is hard coded to simd_layout ==2 and not allowing >2
 | 
			
		||||
		//		std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
 | 
			
		||||
 | 
			
		||||
		int inner_bit = (Nsimd>>(permute_type+1));
 | 
			
		||||
 
 | 
			
		||||
@@ -16,9 +16,13 @@
 | 
			
		||||
#define INCLUDED_ALG_REMEZ_H
 | 
			
		||||
 | 
			
		||||
#include <stddef.h>
 | 
			
		||||
#include <Config.h>
 | 
			
		||||
 | 
			
		||||
//#include <algorithms/approx/bigfloat.h>
 | 
			
		||||
#ifdef HAVE_GMP_H
 | 
			
		||||
#include <algorithms/approx/bigfloat.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <algorithms/approx/bigfloat_double.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
 | 
			
		||||
#define SUM_MAX 10 // Maximum number of terms in exponential
 | 
			
		||||
 
 | 
			
		||||
@@ -564,6 +564,7 @@ until convergence
 | 
			
		||||
	  
 | 
			
		||||
	  for(int j=k1-1; j<k2+1; ++j){
 | 
			
		||||
	    for(int k=0; k<Nm; ++k){
 | 
			
		||||
	    B[j].checkerboard = evec[k].checkerboard;
 | 
			
		||||
	      B[j] += Qt[k+Nm*j] * evec[k];
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
@@ -592,6 +593,7 @@ until convergence
 | 
			
		||||
	  
 | 
			
		||||
	  for(int j = 0; j<Nk; ++j){
 | 
			
		||||
	    for(int k = 0; k<Nk; ++k){
 | 
			
		||||
	    B[j].checkerboard = evec[k].checkerboard;
 | 
			
		||||
	      B[j] += Qt[k+j*Nm] * evec[k];
 | 
			
		||||
	    }
 | 
			
		||||
//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -138,6 +138,25 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
    inline int PermuteType(int dimension){
 | 
			
		||||
      int permute_type=0;
 | 
			
		||||
      //
 | 
			
		||||
      // FIXME:
 | 
			
		||||
      //
 | 
			
		||||
      // Best way to encode this would be to present a mask 
 | 
			
		||||
      // for which simd dimensions are rotated, and the rotation
 | 
			
		||||
      // size. If there is only one simd dimension rotated, this is just 
 | 
			
		||||
      // a permute. 
 | 
			
		||||
      //
 | 
			
		||||
      // Cases: PermuteType == 1,2,4,8
 | 
			
		||||
      // Distance should be either 0,1,2..
 | 
			
		||||
      //
 | 
			
		||||
      if ( _simd_layout[dimension] > 2 ) { 
 | 
			
		||||
	for(int d=0;d<_ndimension;d++){
 | 
			
		||||
	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
 | 
			
		||||
	}
 | 
			
		||||
	permute_type = RotateBit; // How to specify distance; this is not just direction.
 | 
			
		||||
	return permute_type;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      for(int d=_ndimension-1;d>dimension;d--){
 | 
			
		||||
	if (_simd_layout[d]>1 ) permute_type++;
 | 
			
		||||
      }
 | 
			
		||||
@@ -147,12 +166,12 @@ public:
 | 
			
		||||
    // Array sizing queries
 | 
			
		||||
    ////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    inline int iSites(void) { return _isites; };
 | 
			
		||||
    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
 | 
			
		||||
    inline int oSites(void) { return _osites; };
 | 
			
		||||
    inline int lSites(void) { return _isites*_osites; }; 
 | 
			
		||||
    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
 | 
			
		||||
    inline int Nd    (void) { return _ndimension;};
 | 
			
		||||
    inline int iSites(void) const { return _isites; };
 | 
			
		||||
    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
 | 
			
		||||
    inline int oSites(void) const { return _osites; };
 | 
			
		||||
    inline int lSites(void) const { return _isites*_osites; }; 
 | 
			
		||||
    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
 | 
			
		||||
    inline int Nd    (void) const { return _ndimension;};
 | 
			
		||||
 | 
			
		||||
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
 | 
			
		||||
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
 | 
			
		||||
@@ -165,6 +184,9 @@ public:
 | 
			
		||||
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
 | 
			
		||||
      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
 | 
			
		||||
    }
 | 
			
		||||
    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
 | 
			
		||||
      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
 | 
			
		||||
    }
 | 
			
		||||
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
 | 
			
		||||
      gidx=0;
 | 
			
		||||
      int mult=1;
 | 
			
		||||
 
 | 
			
		||||
@@ -324,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
			
		||||
  int rd = grid->_rdimensions[dimension];
 | 
			
		||||
  int ld = grid->_ldimensions[dimension];
 | 
			
		||||
  int gd = grid->_gdimensions[dimension];
 | 
			
		||||
  int ly = grid->_simd_layout[dimension];
 | 
			
		||||
 | 
			
		||||
  // Map to always positive shift modulo global full dimension.
 | 
			
		||||
  shift = (shift+fd)%fd;
 | 
			
		||||
@@ -332,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
			
		||||
  // the permute type
 | 
			
		||||
  int permute_dim =grid->PermuteDim(dimension);
 | 
			
		||||
  int permute_type=grid->PermuteType(dimension);
 | 
			
		||||
  int permute_type_dist;
 | 
			
		||||
 | 
			
		||||
  for(int x=0;x<rd;x++){       
 | 
			
		||||
 | 
			
		||||
@@ -343,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
			
		||||
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
			
		||||
    int sx     = (x+sshift)%rd;
 | 
			
		||||
 | 
			
		||||
    // FIXME : This must change where we have a 
 | 
			
		||||
    // Rotate slice.
 | 
			
		||||
    
 | 
			
		||||
    // Document how this works ; why didn't I do this when I first wrote it...
 | 
			
		||||
    // wrap is whether sshift > rd.
 | 
			
		||||
    //  num is sshift mod rd.
 | 
			
		||||
    // 
 | 
			
		||||
    int permute_slice=0;
 | 
			
		||||
    if(permute_dim){
 | 
			
		||||
      int wrap = sshift/rd;
 | 
			
		||||
      int  num = sshift%rd;
 | 
			
		||||
 | 
			
		||||
      if ( x< rd-num ) permute_slice=wrap;
 | 
			
		||||
      else permute_slice = 1-wrap;
 | 
			
		||||
      else permute_slice = (wrap+1)%ly;
 | 
			
		||||
 | 
			
		||||
      if ( (ly>2) && (permute_slice) ) {
 | 
			
		||||
	assert(permute_type & RotateBit);
 | 
			
		||||
	permute_type_dist = permute_type|permute_slice;
 | 
			
		||||
      } else {
 | 
			
		||||
	permute_type_dist = permute_type;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
 | 
			
		||||
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
 | 
			
		||||
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    // Peek a scalar object from the SIMD array
 | 
			
		||||
    //////////////////////////////////////////////////////////
 | 
			
		||||
    template<class vobj,class sobj>
 | 
			
		||||
    void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
 | 
			
		||||
    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
 | 
			
		||||
        
 | 
			
		||||
      GridBase *grid=l._grid;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -142,6 +142,10 @@ namespace Grid {
 | 
			
		||||
        mult(&phi(),&U(mu),&chi());
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      template<class ref>
 | 
			
		||||
      inline void loadLinkElement(Simd & reg,ref &memory){
 | 
			
		||||
	reg = memory;
 | 
			
		||||
      }
 | 
			
		||||
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
			
		||||
      {
 | 
			
		||||
        conformable(Uds._grid,GaugeGrid);
 | 
			
		||||
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ///////
 | 
			
		||||
    // Single flavour four spinors with colour index, 5d redblack
 | 
			
		||||
    ///////
 | 
			
		||||
    template<class S,int Nrepresentation=Nc>
 | 
			
		||||
    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 | 
			
		||||
 | 
			
		||||
      INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
      
 | 
			
		||||
      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
			
		||||
      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
 | 
			
		||||
      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
 | 
			
		||||
      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
 | 
			
		||||
      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
 | 
			
		||||
    
 | 
			
		||||
      typedef iImplSpinor    <Simd>           SiteSpinor;
 | 
			
		||||
      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
 | 
			
		||||
      typedef Lattice<SiteSpinor>             FermionField;
 | 
			
		||||
 | 
			
		||||
      // Make the doubled gauge field a *scalar*
 | 
			
		||||
      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
 | 
			
		||||
      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
 | 
			
		||||
      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
 | 
			
		||||
 | 
			
		||||
      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
 | 
			
		||||
 | 
			
		||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
			
		||||
      typedef WilsonImplParams ImplParams;
 | 
			
		||||
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
			
		||||
 | 
			
		||||
      ImplParams Params;
 | 
			
		||||
 | 
			
		||||
      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
			
		||||
 | 
			
		||||
      bool overlapCommsCompute(void) { return false; };
 | 
			
		||||
    
 | 
			
		||||
      template<class ref>
 | 
			
		||||
      inline void loadLinkElement(Simd & reg,ref &memory){
 | 
			
		||||
	vsplat(reg,memory);
 | 
			
		||||
      }
 | 
			
		||||
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
 | 
			
		||||
      {
 | 
			
		||||
	SiteGaugeLink UU;
 | 
			
		||||
	for(int i=0;i<Nrepresentation;i++){
 | 
			
		||||
	  for(int j=0;j<Nrepresentation;j++){
 | 
			
		||||
	    vsplat(UU()()(i,j),U(mu)()(i,j));
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
        mult(&phi(),&UU(),&chi());
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
			
		||||
      {
 | 
			
		||||
	SiteScalarGaugeField  ScalarUmu;
 | 
			
		||||
	SiteDoubledGaugeField ScalarUds;
 | 
			
		||||
 | 
			
		||||
        GaugeLinkField U   (Umu._grid);
 | 
			
		||||
	GaugeField     Uadj(Umu._grid);
 | 
			
		||||
        for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
  	  U = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
	  U = adj(Cshift(U,mu,-1));
 | 
			
		||||
	  PokeIndex<LorentzIndex>(Uadj,U,mu);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
 | 
			
		||||
	  std::vector<int> lcoor;
 | 
			
		||||
	  GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
 | 
			
		||||
 | 
			
		||||
	  peekLocalSite(ScalarUmu,Umu,lcoor);
 | 
			
		||||
	  for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
 | 
			
		||||
 | 
			
		||||
	  peekLocalSite(ScalarUmu,Uadj,lcoor);
 | 
			
		||||
	  for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
 | 
			
		||||
 | 
			
		||||
	  pokeLocalSite(ScalarUds,Uds,lcoor);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
	
 | 
			
		||||
      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }   
 | 
			
		||||
 | 
			
		||||
      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	conformable(Uds._grid,GaugeGrid);
 | 
			
		||||
	conformable(Umu._grid,GaugeGrid);
 | 
			
		||||
	
 | 
			
		||||
	GaugeLinkField Utmp(GaugeGrid);
 | 
			
		||||
	GaugeLinkField U(GaugeGrid);
 | 
			
		||||
	GaugeLinkField Utmp (GaugeGrid);
 | 
			
		||||
	GaugeLinkField U    (GaugeGrid);
 | 
			
		||||
	GaugeLinkField Uconj(GaugeGrid);
 | 
			
		||||
	
 | 
			
		||||
	Lattice<iScalar<vInteger> > coor(GaugeGrid);
 | 
			
		||||
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 | 
			
		||||
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
 | 
			
		||||
    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
 | 
			
		||||
    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
 | 
			
		||||
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
 | 
			
		||||
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
@@ -288,11 +288,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
					 const FermionField &in, FermionField &out,int dag) 
 | 
			
		||||
  {
 | 
			
		||||
    if ( Impl::overlapCommsCompute () ) { 
 | 
			
		||||
      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
 | 
			
		||||
    } else { 
 | 
			
		||||
      DhopInternalCommsThenCompute(st,U,in,out,dag);
 | 
			
		||||
    }
 | 
			
		||||
    DhopInternalCommsThenCompute(st,U,in,out,dag);
 | 
			
		||||
  }
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
@@ -330,15 +326,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag) {
 | 
			
		||||
 | 
			
		||||
    assert(0);
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
  FermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
  GparityFermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
 
 | 
			
		||||
@@ -116,9 +116,6 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
				    const FermionField &in, FermionField &out,int dag) ;
 | 
			
		||||
      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
				    const FermionField &in, FermionField &out,int dag) ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      // Constructor
 | 
			
		||||
      WilsonFermion(GaugeField &_Umu,
 | 
			
		||||
 
 | 
			
		||||
@@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  // some assertions
 | 
			
		||||
  assert(FiveDimGrid._ndimension==5);
 | 
			
		||||
  assert(FourDimGrid._ndimension==4);
 | 
			
		||||
  
 | 
			
		||||
  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
			
		||||
  assert(FourDimRedBlackGrid._ndimension==4);
 | 
			
		||||
 | 
			
		||||
  assert(FiveDimRedBlackGrid._checker_dim==1);
 | 
			
		||||
 | 
			
		||||
  // Dimension zero of the five-d is the Ls direction
 | 
			
		||||
@@ -106,11 +104,75 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  dslashtime=0;
 | 
			
		||||
  dslash1time=0;
 | 
			
		||||
}  
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 | 
			
		||||
				       GridCartesian         &FiveDimGrid,
 | 
			
		||||
				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
				       GridCartesian         &FourDimGrid,
 | 
			
		||||
				       GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
				       RealD _M5,const ImplParams &p) :
 | 
			
		||||
  Kernels(p),
 | 
			
		||||
  _FiveDimGrid        (&FiveDimGrid),
 | 
			
		||||
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
 | 
			
		||||
  _FourDimGrid        (&FourDimGrid),
 | 
			
		||||
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
 | 
			
		||||
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
 | 
			
		||||
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
 | 
			
		||||
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
 | 
			
		||||
  M5(_M5),
 | 
			
		||||
  Umu(_FourDimGrid),
 | 
			
		||||
  UmuEven(_FourDimRedBlackGrid),
 | 
			
		||||
  UmuOdd (_FourDimRedBlackGrid),
 | 
			
		||||
  Lebesgue(_FourDimGrid),
 | 
			
		||||
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 | 
			
		||||
{
 | 
			
		||||
  int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
  // some assertions
 | 
			
		||||
  assert(FiveDimGrid._ndimension==5);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
 | 
			
		||||
  assert(FourDimGrid._ndimension==4);
 | 
			
		||||
  assert(FourDimRedBlackGrid._ndimension==4);
 | 
			
		||||
 | 
			
		||||
  // Dimension zero of the five-d is the Ls direction
 | 
			
		||||
  Ls=FiveDimGrid._fdimensions[0];
 | 
			
		||||
  assert(FiveDimGrid._processors[0]         ==1);
 | 
			
		||||
  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
 | 
			
		||||
 | 
			
		||||
  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._processors[0] ==1);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 | 
			
		||||
 | 
			
		||||
  // Other dimensions must match the decomposition of the four-D fields 
 | 
			
		||||
  for(int d=0;d<4;d++){
 | 
			
		||||
    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
 | 
			
		||||
 | 
			
		||||
    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
 | 
			
		||||
 | 
			
		||||
    assert(FourDimGrid._simd_layout[d]=1);
 | 
			
		||||
    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
 | 
			
		||||
    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
 | 
			
		||||
 | 
			
		||||
    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
 | 
			
		||||
    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
 | 
			
		||||
    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  ImportGauge(_Umu);
 | 
			
		||||
}  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
			
		||||
{
 | 
			
		||||
    GaugeField HUmu(_Umu._grid);
 | 
			
		||||
    HUmu = _Umu*(-0.5);
 | 
			
		||||
  GaugeField HUmu(_Umu._grid);
 | 
			
		||||
  HUmu = _Umu*(-0.5);
 | 
			
		||||
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
 | 
			
		||||
  pickCheckerboard(Even,UmuEven,Umu);
 | 
			
		||||
  pickCheckerboard(Odd ,UmuOdd,Umu);
 | 
			
		||||
@@ -294,15 +356,12 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  int LLs = in._grid->_rdimensions[0];
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  //  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  //  st.HaloExchangeComplete(handle);
 | 
			
		||||
  st.HaloExchange(in,compressor);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
@@ -318,97 +377,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for 
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=ss+ ssoff;
 | 
			
		||||
 | 
			
		||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
      /*
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for schedule(static)
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  sU=ss+ ssoff;
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      */
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP     
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	  int sF = s+LLs*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
@@ -418,251 +428,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						 DoubledGaugeField & U,
 | 
			
		||||
						 const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=ss+ ssoff;
 | 
			
		||||
 | 
			
		||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						DoubledGaugeField & U,
 | 
			
		||||
						const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=0;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=0;
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						     DoubledGaugeField & U,
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
@@ -706,6 +471,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 | 
			
		||||
 | 
			
		||||
FermOpTemplateInstantiate(WilsonFermion5D);
 | 
			
		||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
 | 
			
		||||
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
 | 
			
		||||
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -87,6 +87,7 @@ namespace Grid {
 | 
			
		||||
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
			
		||||
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
			
		||||
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
 | 
			
		||||
      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 | 
			
		||||
 | 
			
		||||
      // These can be overridden by fancy 5d chiral action
 | 
			
		||||
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
 | 
			
		||||
@@ -121,32 +122,12 @@ namespace Grid {
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalOMPbench(StencilImpl & st,
 | 
			
		||||
				LebesgueOrder &lo,
 | 
			
		||||
				DoubledGaugeField &U,
 | 
			
		||||
				const FermionField &in, 
 | 
			
		||||
				FermionField &out,
 | 
			
		||||
				int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalL1bench(StencilImpl & st,
 | 
			
		||||
				LebesgueOrder &lo,
 | 
			
		||||
				DoubledGaugeField &U,
 | 
			
		||||
				const FermionField &in, 
 | 
			
		||||
				FermionField &out,
 | 
			
		||||
				int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalCommsThenCompute(StencilImpl & st,
 | 
			
		||||
			LebesgueOrder &lo,
 | 
			
		||||
			DoubledGaugeField &U,
 | 
			
		||||
			const FermionField &in, 
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
      void DhopInternalCommsOverlapCompute(StencilImpl & st,
 | 
			
		||||
			LebesgueOrder &lo,
 | 
			
		||||
			DoubledGaugeField &U,
 | 
			
		||||
			const FermionField &in, 
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
@@ -156,6 +137,15 @@ namespace Grid {
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      double _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      WilsonFermion5D(int simd, 
 | 
			
		||||
		      GaugeField &_Umu,
 | 
			
		||||
		      GridCartesian         &FiveDimGrid,
 | 
			
		||||
		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
		      GridCartesian         &FourDimGrid,
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      double _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      // DoubleStore
 | 
			
		||||
      void ImportGauge(const GaugeField &_Umu);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  FermOpTemplateInstantiate(WilsonKernels);
 | 
			
		||||
template class WilsonKernels<DomainWallRedBlack5dImplF>;		
 | 
			
		||||
template class WilsonKernels<DomainWallRedBlack5dImplD>;
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#if defined(AVX512) || defined (IMCI)
 | 
			
		||||
#if defined(AVX512) 
 | 
			
		||||
//#if defined (IMCI)
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512wilson.h>
 | 
			
		||||
@@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
  template class WilsonKernels<WilsonImplD>; 
 | 
			
		||||
  template class WilsonKernels<GparityWilsonImplF>;
 | 
			
		||||
  template class WilsonKernels<GparityWilsonImplD>;
 | 
			
		||||
  template class WilsonKernels<DomainWallRedBlack5dImplF>;
 | 
			
		||||
  template class WilsonKernels<DomainWallRedBlack5dImplD>;
 | 
			
		||||
}}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Chi_11 = ref()(1)(1);\
 | 
			
		||||
    Chi_12 = ref()(1)(2);
 | 
			
		||||
 | 
			
		||||
// To splat or not to splat depends on the implementation
 | 
			
		||||
#define MULT_2SPIN(A)\
 | 
			
		||||
   auto & ref(U._odata[sU](A));	\
 | 
			
		||||
    U_00 = ref()(0,0);\
 | 
			
		||||
    U_10 = ref()(1,0);\
 | 
			
		||||
    U_20 = ref()(2,0);\
 | 
			
		||||
    U_01 = ref()(0,1);\
 | 
			
		||||
    U_11 = ref()(1,1);				\
 | 
			
		||||
    U_21 = ref()(2,1);\
 | 
			
		||||
   Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
    UChi_00 = U_00*Chi_00;\
 | 
			
		||||
    UChi_10 = U_00*Chi_10;\
 | 
			
		||||
    UChi_01 = U_10*Chi_00;\
 | 
			
		||||
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    UChi_11+= U_11*Chi_11;\
 | 
			
		||||
    UChi_02+= U_21*Chi_01;\
 | 
			
		||||
    UChi_12+= U_21*Chi_11;\
 | 
			
		||||
    U_00 = ref()(0,2);\
 | 
			
		||||
    U_10 = ref()(1,2);\
 | 
			
		||||
    U_20 = ref()(2,2);\
 | 
			
		||||
    Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
    UChi_00+= U_00*Chi_02;\
 | 
			
		||||
    UChi_10+= U_00*Chi_12;\
 | 
			
		||||
    UChi_01+= U_10*Chi_02;\
 | 
			
		||||
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    UChi_02+= U_20*Chi_02;\
 | 
			
		||||
    UChi_12+= U_20*Chi_12;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR(dir)			\
 | 
			
		||||
      permute##dir(Chi_00,Chi_00);\
 | 
			
		||||
      permute##dir(Chi_01,Chi_01);\
 | 
			
		||||
@@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
 | 
			
		||||
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
 | 
			
		||||
  return 0;
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
@@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////
 | 
			
		||||
/*
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
@@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 | 
			
		||||
#define INHERIT_GIMPL_TYPES(GImpl) \
 | 
			
		||||
    typedef typename GImpl::Simd                           Simd;\
 | 
			
		||||
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
 | 
			
		||||
    typedef typename GImpl::GaugeField               GaugeField;	
 | 
			
		||||
    typedef typename GImpl::GaugeField               GaugeField;\
 | 
			
		||||
    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
 | 
			
		||||
    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // 
 | 
			
		||||
 
 | 
			
		||||
@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 | 
			
		||||
{
 | 
			
		||||
  return new GridRedBlackCartesian(FourDimGrid); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<int> simd(4,1);
 | 
			
		||||
  return makeFourDimGrid(latt,simd,mpi);
 | 
			
		||||
}
 | 
			
		||||
GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
			
		||||
{
 | 
			
		||||
  int N4=FourDimGrid->_ndimension;
 | 
			
		||||
@@ -58,6 +62,7 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
 | 
			
		||||
  return new GridCartesian(latt5,simd5,mpi5); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
			
		||||
{
 | 
			
		||||
  int N4=FourDimGrid->_ndimension;
 | 
			
		||||
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
 | 
			
		||||
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
			
		||||
{
 | 
			
		||||
  int N4=FourDimGrid->_ndimension;
 | 
			
		||||
  int nsimd = FourDimGrid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt5(1,Ls);
 | 
			
		||||
  std::vector<int> simd5(1,nsimd);
 | 
			
		||||
  std::vector<int>  mpi5(1,1);
 | 
			
		||||
  
 | 
			
		||||
  for(int d=0;d<N4;d++){
 | 
			
		||||
    latt5.push_back(FourDimGrid->_fdimensions[d]);
 | 
			
		||||
    simd5.push_back(1);
 | 
			
		||||
     mpi5.push_back(FourDimGrid->_processors[d]);
 | 
			
		||||
  }
 | 
			
		||||
  return new GridCartesian(latt5,simd5,mpi5); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
			
		||||
{
 | 
			
		||||
  int N4=FourDimGrid->_ndimension;
 | 
			
		||||
  int nsimd = FourDimGrid->Nsimd();
 | 
			
		||||
  int cbd=0;
 | 
			
		||||
  std::vector<int> latt5(1,Ls);
 | 
			
		||||
  std::vector<int> simd5(1,nsimd);
 | 
			
		||||
  std::vector<int>  mpi5(1,1);
 | 
			
		||||
  std::vector<int>   cb5(1,1);
 | 
			
		||||
    
 | 
			
		||||
  for(int d=0;d<N4;d++){
 | 
			
		||||
    latt5.push_back(FourDimGrid->_fdimensions[d]);
 | 
			
		||||
    simd5.push_back(1);
 | 
			
		||||
     mpi5.push_back(FourDimGrid->_processors[d]);
 | 
			
		||||
      cb5.push_back(1);
 | 
			
		||||
    }
 | 
			
		||||
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -35,9 +35,14 @@ class SpaceTimeGrid {
 | 
			
		||||
 | 
			
		||||
  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
 | 
			
		||||
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
 | 
			
		||||
 | 
			
		||||
  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
 | 
			
		||||
  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -410,22 +410,22 @@ namespace Optimization {
 | 
			
		||||
  struct Permute{
 | 
			
		||||
 | 
			
		||||
    static inline __m256 Permute0(__m256 in){
 | 
			
		||||
      return _mm256_permute2f128_ps(in,in,0x01);
 | 
			
		||||
      return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m256 Permute1(__m256 in){
 | 
			
		||||
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
			
		||||
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m256 Permute2(__m256 in){
 | 
			
		||||
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m256 Permute3(__m256 in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline __m256d Permute0(__m256d in){
 | 
			
		||||
      return _mm256_permute2f128_pd(in,in,0x01);
 | 
			
		||||
      return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m256d Permute1(__m256d in){
 | 
			
		||||
    static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
 | 
			
		||||
      return _mm256_shuffle_pd(in,in,0x5);
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m256d Permute2(__m256d in){
 | 
			
		||||
@@ -437,6 +437,111 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
#if defined (AVX2) || defined (AVXFMA4) 
 | 
			
		||||
#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
 | 
			
		||||
#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined (AVX1) 
 | 
			
		||||
 | 
			
		||||
#define _mm256_alignr_epi32(ret,a,b,n) {	\
 | 
			
		||||
    __m128 aa, bb;				\
 | 
			
		||||
						\
 | 
			
		||||
    aa  = _mm256_extractf128_ps(a,1);		\
 | 
			
		||||
    bb  = _mm256_extractf128_ps(b,1);		\
 | 
			
		||||
    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);	\
 | 
			
		||||
    ret = _mm256_insertf128_ps(ret,aa,1);	\
 | 
			
		||||
						\
 | 
			
		||||
    aa  = _mm256_extractf128_ps(a,0);		\
 | 
			
		||||
    bb  = _mm256_extractf128_ps(b,0);		\
 | 
			
		||||
    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);	\
 | 
			
		||||
    ret = _mm256_insertf128_ps(ret,aa,0);	\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define _mm256_alignr_epi64(ret,a,b,n) {	\
 | 
			
		||||
    __m128d aa, bb;				\
 | 
			
		||||
						\
 | 
			
		||||
    aa  = _mm256_extractf128_pd(a,1);		\
 | 
			
		||||
    bb  = _mm256_extractf128_pd(b,1);		\
 | 
			
		||||
    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);	\
 | 
			
		||||
    ret = _mm256_insertf128_pd(ret,aa,1);	\
 | 
			
		||||
						\
 | 
			
		||||
    aa  = _mm256_extractf128_pd(a,0);		\
 | 
			
		||||
    bb  = _mm256_extractf128_pd(b,0);		\
 | 
			
		||||
    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);	\
 | 
			
		||||
    ret = _mm256_insertf128_pd(ret,aa,0);	\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    inline std::ostream & operator << (std::ostream& stream, const __m256 a)
 | 
			
		||||
    {
 | 
			
		||||
      const float *p=(const float *)&a;
 | 
			
		||||
      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
 | 
			
		||||
      return stream;
 | 
			
		||||
    };
 | 
			
		||||
    inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
 | 
			
		||||
    {
 | 
			
		||||
      const double *p=(const double *)&a;
 | 
			
		||||
      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
 | 
			
		||||
      return stream;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m256 rotate(__m256 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m256d rotate(__m256d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  
 | 
			
		||||
    
 | 
			
		||||
    template<int n>
 | 
			
		||||
    static inline __m256 tRotate(__m256 in){ 
 | 
			
		||||
      __m256 tmp = Permute::Permute0(in);
 | 
			
		||||
      __m256 ret;
 | 
			
		||||
      if ( n > 3 ) { 
 | 
			
		||||
	_mm256_alignr_epi32(ret,in,tmp,n);  
 | 
			
		||||
      } else {
 | 
			
		||||
        _mm256_alignr_epi32(ret,tmp,in,n);          
 | 
			
		||||
      }
 | 
			
		||||
      //      std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
 | 
			
		||||
      return ret;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<int n>
 | 
			
		||||
    static inline __m256d tRotate(__m256d in){ 
 | 
			
		||||
      __m256d tmp = Permute::Permute0(in);
 | 
			
		||||
      __m256d ret;
 | 
			
		||||
      if ( n > 1 ) {
 | 
			
		||||
	_mm256_alignr_epi64(ret,in,tmp,n);          
 | 
			
		||||
      } else {
 | 
			
		||||
        _mm256_alignr_epi64(ret,tmp,in,n);          
 | 
			
		||||
      }
 | 
			
		||||
      //      std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
 | 
			
		||||
      return ret;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
 
 | 
			
		||||
@@ -309,6 +309,54 @@ namespace Optimization {
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m512 rotate(__m512 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
 | 
			
		||||
      case 8 : return tRotate<8>(in);break;
 | 
			
		||||
      case 9 : return tRotate<9>(in);break;
 | 
			
		||||
      case 10: return tRotate<10>(in);break;
 | 
			
		||||
      case 11: return tRotate<11>(in);break;
 | 
			
		||||
      case 12: return tRotate<12>(in);break;
 | 
			
		||||
      case 13: return tRotate<13>(in);break;
 | 
			
		||||
      case 14: return tRotate<14>(in);break;
 | 
			
		||||
      case 15: return tRotate<15>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m512d rotate(__m512d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512 tRotate(__m512 in){ 
 | 
			
		||||
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512d tRotate(__m512d in){ 
 | 
			
		||||
      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -55,51 +55,67 @@ namespace Optimization {
 | 
			
		||||
  
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
    //Complex float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(float a, float b){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = b;
 | 
			
		||||
      out.f[2] = a;
 | 
			
		||||
      out.f[3] = b;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float operator()(float a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(float a){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = a;
 | 
			
		||||
      out.f[2] = a;
 | 
			
		||||
      out.f[3] = a;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(double a, double b){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = b;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Real double
 | 
			
		||||
    inline double operator()(double a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(double a){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = a;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(Integer a){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstore{
 | 
			
		||||
    //Float 
 | 
			
		||||
    inline void operator()(float a, float* F){
 | 
			
		||||
      
 | 
			
		||||
    inline void operator()(u128f a, float* F){
 | 
			
		||||
      memcpy(F,a.f,4*sizeof(float));
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double a, double* D){
 | 
			
		||||
     
 | 
			
		||||
    inline void operator()(u128d a, double* D){
 | 
			
		||||
      memcpy(D,a.f,2*sizeof(double));
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline void operator()(int a, Integer* I){
 | 
			
		||||
      
 | 
			
		||||
      I[0] = a;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstream{
 | 
			
		||||
    //Float
 | 
			
		||||
    inline void operator()(float * a, float b){
 | 
			
		||||
     
 | 
			
		||||
    inline void operator()(float * a, u128f b){
 | 
			
		||||
      memcpy(a,b.f,4*sizeof(float));
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double * a, double b){
 | 
			
		||||
     
 | 
			
		||||
    inline void operator()(double * a, u128d b){
 | 
			
		||||
      memcpy(a,b.f,2*sizeof(double));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -107,24 +123,40 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  struct Vset{
 | 
			
		||||
    // Complex float 
 | 
			
		||||
    inline float operator()(Grid::ComplexF *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(Grid::ComplexF *a){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a[0].real();
 | 
			
		||||
      out.f[1] = a[0].imag();
 | 
			
		||||
      out.f[2] = a[1].real();
 | 
			
		||||
      out.f[3] = a[1].imag();
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double 
 | 
			
		||||
    inline double operator()(Grid::ComplexD *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(Grid::ComplexD *a){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a[0].real();
 | 
			
		||||
      out.f[1] = a[0].imag();
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float 
 | 
			
		||||
    inline float operator()(float *a){
 | 
			
		||||
      return  0;
 | 
			
		||||
    inline u128f operator()(float *a){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a[0];
 | 
			
		||||
      out.f[1] = a[1];
 | 
			
		||||
      out.f[2] = a[2];
 | 
			
		||||
      out.f[3] = a[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline double operator()(double *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(double *a){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a[0];
 | 
			
		||||
      out.f[1] = a[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline int operator()(Integer *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a[0];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -146,129 +178,198 @@ namespace Optimization {
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  struct Sum{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0] + b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] + b.f[1];
 | 
			
		||||
      out.f[2] = a.f[2] + b.f[2];
 | 
			
		||||
      out.f[3] = a.f[3] + b.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0] + b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] + b.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a + b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Sub{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0] - b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] - b.f[1];
 | 
			
		||||
      out.f[2] = a.f[2] - b.f[2];
 | 
			
		||||
      out.f[3] = a.f[3] - b.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0] - b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] - b.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
 | 
			
		||||
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
 | 
			
		||||
      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
 | 
			
		||||
      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
 | 
			
		||||
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Mult{
 | 
			
		||||
    inline float  mac(float a, float b,double c){
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
    inline double mac(double a, double b,double c){
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
    //CK: Appear unneeded
 | 
			
		||||
    // inline float  mac(float a, float b,double c){
 | 
			
		||||
    //   return 0;
 | 
			
		||||
    // }
 | 
			
		||||
    // inline double mac(double a, double b,double c){
 | 
			
		||||
    //   return 0;
 | 
			
		||||
    // }
 | 
			
		||||
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1]*b.f[1];
 | 
			
		||||
      out.f[2] = a.f[2]*b.f[2];
 | 
			
		||||
      out.f[3] = a.f[3]*b.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1]*b.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Conj{
 | 
			
		||||
    // Complex single
 | 
			
		||||
    inline float operator()(float in){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f in){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[0];
 | 
			
		||||
      out.f[1] = -in.f[1];
 | 
			
		||||
      out.f[2] = in.f[2];
 | 
			
		||||
      out.f[3] = -in.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline double operator()(double in){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d in){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = in.f[0];
 | 
			
		||||
      out.f[1] = -in.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // do not define for integer input
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesMinusI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float operator()(float in, float ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = -in.f[0];
 | 
			
		||||
      out.f[2] = in.f[3];
 | 
			
		||||
      out.f[3] = -in.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline double operator()(double in, double ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d in, u128d ret){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = -in.f[0];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float operator()(float in, float ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = -in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      out.f[2] = -in.f[3];
 | 
			
		||||
      out.f[3] = in.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline double operator()(double in, double ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d in, u128d ret){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = -in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
  struct Permute{
 | 
			
		||||
 | 
			
		||||
    static inline float Permute0(float in){
 | 
			
		||||
    //We just have to mirror the permutes of Grid_sse4.h
 | 
			
		||||
    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[2];
 | 
			
		||||
      out.f[1] = in.f[3];
 | 
			
		||||
      out.f[2] = in.f[0];
 | 
			
		||||
      out.f[3] = in.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      out.f[2] = in.f[3];
 | 
			
		||||
      out.f[3] = in.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128f Permute2(u128f in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline float Permute1(float in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline float Permute2(float in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline float Permute3(float in){
 | 
			
		||||
    static inline u128f Permute3(u128f in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline double Permute0(double in){
 | 
			
		||||
    static inline u128d Permute0(u128d in){ //AB -> BA
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      return out;      
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128d Permute1(u128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline double Permute1(double in){
 | 
			
		||||
    static inline u128d Permute2(u128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline double Permute2(double in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline double Permute3(double in){
 | 
			
		||||
    static inline u128d Permute3(u128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
@@ -280,26 +381,26 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
 | 
			
		||||
    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
 | 
			
		||||
  }
 | 
			
		||||
  //Real float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
 | 
			
		||||
    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  //Complex double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
 | 
			
		||||
    return Grid::ComplexD(in.f[0],in.f[1]);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Real double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
 | 
			
		||||
    return in.f[0] + in.f[1];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
@@ -314,8 +415,8 @@ namespace Optimization {
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
 | 
			
		||||
  typedef float SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef double SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef Optimization::u128d SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef int SIMD_Itype; // Integer type
 | 
			
		||||
 | 
			
		||||
  // prefetch utilities
 | 
			
		||||
 
 | 
			
		||||
@@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
#include <zmmintrin.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid{
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
  
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
@@ -316,6 +318,54 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m512 rotate(__m512 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
 | 
			
		||||
      case 8 : return tRotate<8>(in);break;
 | 
			
		||||
      case 9 : return tRotate<9>(in);break;
 | 
			
		||||
      case 10: return tRotate<10>(in);break;
 | 
			
		||||
      case 11: return tRotate<11>(in);break;
 | 
			
		||||
      case 12: return tRotate<12>(in);break;
 | 
			
		||||
      case 13: return tRotate<13>(in);break;
 | 
			
		||||
      case 14: return tRotate<14>(in);break;
 | 
			
		||||
      case 15: return tRotate<15>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m512d rotate(__m512d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512 tRotate(__m512 in){ 
 | 
			
		||||
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512d tRotate(__m512d in){ 
 | 
			
		||||
      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
@@ -358,7 +408,7 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef __m512d SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef __m512i SIMD_Itype; // Integer type
 | 
			
		||||
 
 | 
			
		||||
@@ -267,10 +267,10 @@ namespace Optimization {
 | 
			
		||||
  struct Permute{
 | 
			
		||||
 | 
			
		||||
    static inline __m128 Permute0(__m128 in){
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m128 Permute1(__m128 in){
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m128 Permute2(__m128 in){
 | 
			
		||||
      return in;
 | 
			
		||||
@@ -279,7 +279,7 @@ namespace Optimization {
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline __m128d Permute0(__m128d in){
 | 
			
		||||
    static inline __m128d Permute0(__m128d in){ //AB -> BA
 | 
			
		||||
      return _mm_shuffle_pd(in,in,0x1);
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m128d Permute1(__m128d in){
 | 
			
		||||
@@ -294,6 +294,32 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m128 rotate(__m128 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m128d rotate(__m128d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  
 | 
			
		||||
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 | 
			
		||||
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 | 
			
		||||
    
 | 
			
		||||
    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
 | 
			
		||||
    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -299,16 +299,44 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
 | 
			
		||||
    {
 | 
			
		||||
      if      (perm==3) permute3(y,b);
 | 
			
		||||
      else if (perm==2) permute2(y,b);
 | 
			
		||||
      else if (perm==1) permute1(y,b);
 | 
			
		||||
      else if (perm==0) permute0(y,b);
 | 
			
		||||
      if ( perm & RotateBit ) {
 | 
			
		||||
	int dist = perm&0xF;
 | 
			
		||||
        y=rotate(b,dist);
 | 
			
		||||
	return;
 | 
			
		||||
      }
 | 
			
		||||
      switch(perm){
 | 
			
		||||
      case 3: permute3(y,b); break;
 | 
			
		||||
      case 2: permute2(y,b); break;
 | 
			
		||||
      case 1: permute1(y,b); break;
 | 
			
		||||
      case 0: permute0(y,b); break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
  };// end of Grid_simd class definition 
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // General rotate
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template <class S, class V, IfNotComplex<S> =0> 
 | 
			
		||||
  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
 | 
			
		||||
  {
 | 
			
		||||
    nrot = nrot % Grid_simd<S,V>::Nsimd();
 | 
			
		||||
    Grid_simd<S,V> ret;
 | 
			
		||||
    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
 | 
			
		||||
    ret.v = Optimization::Rotate::rotate(b.v,nrot);
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  template <class S, class V, IfComplex<S> =0> 
 | 
			
		||||
  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
 | 
			
		||||
  {
 | 
			
		||||
    nrot = nrot % Grid_simd<S,V>::Nsimd();
 | 
			
		||||
    Grid_simd<S,V> ret;
 | 
			
		||||
    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
 | 
			
		||||
    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
  // Splat
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -44,7 +44,7 @@ template<class vsimd,class scalar>
 | 
			
		||||
inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y, 
 | 
			
		||||
		    std::vector<scalar *> &extracted,int offset){
 | 
			
		||||
  // FIXME: bounce off memory is painful
 | 
			
		||||
  static const int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
@@ -59,7 +59,9 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y, 
 | 
			
		||||
		  std::vector<scalar *> &extracted,int offset){
 | 
			
		||||
  static const int Nsimd=vsimd::Nsimd();
 | 
			
		||||
 | 
			
		||||
  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
 | 
			
		||||
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
 | 
			
		||||
                     // replicate n-fold. Use to allow Integer masks to 
 | 
			
		||||
@@ -127,7 +129,7 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
 | 
			
		||||
  static const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
@@ -174,7 +176,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  static const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  int Nextr = extracted.size();
 | 
			
		||||
@@ -199,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user