mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Hand unrolled version of dslash in a separate class.
Useful to compare; raises Intel compiler from 9GFlop/s to 17.5 Gflops.
                   on ivybridge core. Raises Clang form 14.5 to 17.5
			
			
This commit is contained in:
		@@ -31,11 +31,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);
 | 
			
		||||
  //  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  //  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  pRNG.SeedRandomDevice();
 | 
			
		||||
  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  //  pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src   (&Grid); random(pRNG,src);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
@@ -55,8 +53,10 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Complex cone(1.0,0.0);
 | 
			
		||||
  for(int nn=0;nn<Nd;nn++){
 | 
			
		||||
    random(pRNG,U[nn]);
 | 
			
		||||
    if (nn!=0) U[nn]=zero;
 | 
			
		||||
    else U[nn] = cone;
 | 
			
		||||
    if(0) {
 | 
			
		||||
      if (nn==-1) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
 | 
			
		||||
      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
 | 
			
		||||
    }
 | 
			
		||||
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -85,7 +85,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
 | 
			
		||||
  
 | 
			
		||||
  std::cout << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall=1000;
 | 
			
		||||
  int ncall=10000;
 | 
			
		||||
  double t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.Dhop(src,result,0);
 | 
			
		||||
 
 | 
			
		||||
@@ -65,7 +65,6 @@
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  void Grid_init(int *argc,char ***argv);
 | 
			
		||||
  void Grid_finalize(void);
 | 
			
		||||
  // internal, controled with --handle
 | 
			
		||||
 
 | 
			
		||||
@@ -142,6 +142,9 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
 | 
			
		||||
    Grid_quiesce_nodes();
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
 | 
			
		||||
    WilsonMatrix::HandOptDslash=1;
 | 
			
		||||
  }
 | 
			
		||||
  GridParseLayout(*argv,*argc,
 | 
			
		||||
		  Grid_default_latt,
 | 
			
		||||
		  Grid_default_mpi);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										100
									
								
								lib/Grid_simd.h
									
									
									
									
									
								
							
							
						
						
									
										100
									
								
								lib/Grid_simd.h
									
									
									
									
									
								
							@@ -162,6 +162,74 @@ namespace Grid {
 | 
			
		||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
 | 
			
		||||
// Permute 4 possible on half precision @512bit vectors.
 | 
			
		||||
//////////////////////////////////////////////////////////
 | 
			
		||||
template<class vsimd>
 | 
			
		||||
inline void Gpermute0(vsimd &y,const vsimd &b) {
 | 
			
		||||
  union { 
 | 
			
		||||
    fvec f;
 | 
			
		||||
    decltype(vsimd::v) v;
 | 
			
		||||
  } conv;
 | 
			
		||||
  conv.v = b.v;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
  conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1)||defined(AVX2)
 | 
			
		||||
  conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); 
 | 
			
		||||
#endif  
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
 conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); 
 | 
			
		||||
#endif
 | 
			
		||||
 y.v=conv.v;
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd>
 | 
			
		||||
inline void Gpermute1(vsimd &y,const vsimd &b) {
 | 
			
		||||
  union { 
 | 
			
		||||
    fvec f;
 | 
			
		||||
    decltype(vsimd::v) v;
 | 
			
		||||
  } conv;
 | 
			
		||||
  conv.v = b.v;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
  conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); 
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1)||defined(AVX2)
 | 
			
		||||
  conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); 
 | 
			
		||||
#endif  
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
 conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); 
 | 
			
		||||
#endif  
 | 
			
		||||
  y.v=conv.v;
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd>
 | 
			
		||||
inline void Gpermute2(vsimd &y,const vsimd &b) {
 | 
			
		||||
  union { 
 | 
			
		||||
    fvec f;
 | 
			
		||||
    decltype(vsimd::v) v;
 | 
			
		||||
  } conv;
 | 
			
		||||
  conv.v = b.v;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1)||defined(AVX2)
 | 
			
		||||
  conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); 
 | 
			
		||||
#endif  
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); 
 | 
			
		||||
#endif  
 | 
			
		||||
  y.v=conv.v;
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd>
 | 
			
		||||
inline void Gpermute3(vsimd &y,const vsimd &b) {
 | 
			
		||||
  union { 
 | 
			
		||||
    fvec f;
 | 
			
		||||
    decltype(vsimd::v) v;
 | 
			
		||||
  } conv;
 | 
			
		||||
  conv.v = b.v;
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); 
 | 
			
		||||
#endif  
 | 
			
		||||
  y.v=conv.v;
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class vsimd>
 | 
			
		||||
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
 | 
			
		||||
	union { 
 | 
			
		||||
@@ -170,36 +238,12 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
 | 
			
		||||
	} conv;
 | 
			
		||||
	conv.v = b.v;
 | 
			
		||||
      switch (perm){
 | 
			
		||||
#if defined(AVX1)||defined(AVX2)
 | 
			
		||||
      // 8x32 bits=>3 permutes
 | 
			
		||||
      case 2: 
 | 
			
		||||
	conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); 
 | 
			
		||||
	break;
 | 
			
		||||
      case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
 | 
			
		||||
      case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
      case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
 | 
			
		||||
      case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
	// 16 floats=> permutes
 | 
			
		||||
        // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo 
 | 
			
		||||
        // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn 
 | 
			
		||||
        // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
 | 
			
		||||
        // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
 | 
			
		||||
      case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
 | 
			
		||||
      case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
 | 
			
		||||
      case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
 | 
			
		||||
      case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#error not implemented
 | 
			
		||||
#endif
 | 
			
		||||
      case 3: Gpermute3(y,b); break;
 | 
			
		||||
      case 2: Gpermute2(y,b); break;
 | 
			
		||||
      case 1: Gpermute1(y,b); break;
 | 
			
		||||
      case 0: Gpermute0(y,b); break;
 | 
			
		||||
      default: assert(0); break;
 | 
			
		||||
      }
 | 
			
		||||
      y.v=conv.v;
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -18,6 +18,8 @@ libGrid_a_SOURCES =				\
 | 
			
		||||
	Grid_init.cc				\
 | 
			
		||||
	stencil/Grid_stencil_common.cc		\
 | 
			
		||||
	qcd/Grid_qcd_dirac.cc			\
 | 
			
		||||
	qcd/Grid_qcd_dhop.cc			\
 | 
			
		||||
	qcd/Grid_qcd_dhop_hand.cc		\
 | 
			
		||||
	qcd/Grid_qcd_wilson_dop.cc		\
 | 
			
		||||
	algorithms/approx/Zolotarev.cc		\
 | 
			
		||||
	algorithms/approx/Remez.cc		\
 | 
			
		||||
 
 | 
			
		||||
@@ -47,6 +47,11 @@ class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, publ
 | 
			
		||||
 LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &arg): std::pair<Op,std::tuple<T1,T2,T3> >(arg) {};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void inline conformable(GridBase *lhs,GridBase *rhs)
 | 
			
		||||
{
 | 
			
		||||
  assert(lhs == rhs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
class Lattice : public LatticeBase
 | 
			
		||||
{
 | 
			
		||||
@@ -61,6 +66,7 @@ public:
 | 
			
		||||
    typedef typename vobj::vector_type vector_type;
 | 
			
		||||
    typedef vobj vector_object;
 | 
			
		||||
 
 | 
			
		||||
   
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Expression Template closure support
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -276,17 +282,15 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include <lattice/Grid_lattice_conformable.h>
 | 
			
		||||
 | 
			
		||||
#include <lattice/Grid_lattice_conformable.h>
 | 
			
		||||
#define GRID_LATTICE_EXPRESSION_TEMPLATES
 | 
			
		||||
#ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
 | 
			
		||||
#include <lattice/Grid_lattice_ET.h>
 | 
			
		||||
#else 
 | 
			
		||||
#include <lattice/Grid_lattice_overload.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <lattice/Grid_lattice_arith.h>
 | 
			
		||||
 | 
			
		||||
#include <lattice/Grid_lattice_trace.h>
 | 
			
		||||
#include <lattice/Grid_lattice_transpose.h>
 | 
			
		||||
#include <lattice/Grid_lattice_local.h>
 | 
			
		||||
 
 | 
			
		||||
@@ -3,16 +3,11 @@
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
    template<class obj1,class obj2>
 | 
			
		||||
    void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
 | 
			
		||||
    template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
 | 
			
		||||
    {
 | 
			
		||||
        assert(lhs._grid == rhs._grid);
 | 
			
		||||
        assert(lhs.checkerboard == rhs.checkerboard);
 | 
			
		||||
    }
 | 
			
		||||
    void inline conformable(const GridBase *lhs,GridBase *rhs)
 | 
			
		||||
    {
 | 
			
		||||
        assert(lhs == rhs);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,3 @@
 | 
			
		||||
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
@@ -7,15 +6,7 @@ namespace QCD {
 | 
			
		||||
const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3});
 | 
			
		||||
const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1});
 | 
			
		||||
 | 
			
		||||
  // Should be in header?
 | 
			
		||||
const int WilsonMatrix::Xp = 0;
 | 
			
		||||
const int WilsonMatrix::Yp = 1;
 | 
			
		||||
const int WilsonMatrix::Zp = 2;
 | 
			
		||||
const int WilsonMatrix::Tp = 3;
 | 
			
		||||
const int WilsonMatrix::Xm = 4;
 | 
			
		||||
const int WilsonMatrix::Ym = 5;
 | 
			
		||||
const int WilsonMatrix::Zm = 6;
 | 
			
		||||
const int WilsonMatrix::Tm = 7;
 | 
			
		||||
  int WilsonMatrix::HandOptDslash;
 | 
			
		||||
 | 
			
		||||
  class WilsonCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
@@ -39,28 +30,28 @@ const int WilsonMatrix::Tm = 7;
 | 
			
		||||
	mudag=(mu+Nd)%(2*Nd);
 | 
			
		||||
      }
 | 
			
		||||
      switch(mudag) {
 | 
			
		||||
      case WilsonMatrix::Xp:
 | 
			
		||||
      case Xp:
 | 
			
		||||
	spProjXp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Yp:
 | 
			
		||||
      case Yp:
 | 
			
		||||
	spProjYp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Zp:
 | 
			
		||||
      case Zp:
 | 
			
		||||
	spProjZp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Tp:
 | 
			
		||||
      case Tp:
 | 
			
		||||
	spProjTp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Xm:
 | 
			
		||||
      case Xm:
 | 
			
		||||
	spProjXm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Ym:
 | 
			
		||||
      case Ym:
 | 
			
		||||
	spProjYm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Zm:
 | 
			
		||||
      case Zm:
 | 
			
		||||
	spProjZm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Tm:
 | 
			
		||||
      case Tm:
 | 
			
		||||
	spProjTm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      default: 
 | 
			
		||||
@@ -157,316 +148,36 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
  MooeeInv(in,out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
			    int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
    vHalfSpinColourVector  tmp;    
 | 
			
		||||
    vHalfSpinColourVector  chi;    
 | 
			
		||||
    vSpinColourVector result;
 | 
			
		||||
    vHalfSpinColourVector Uchi;
 | 
			
		||||
    int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
    //#define VERBOSE( A)  if ( ss<10 ) { std::cout << "site " <<ss << " " #A " neigh " << offset << " perm "<< perm <<std::endl;}    
 | 
			
		||||
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = st._offsets [Xp][ss];
 | 
			
		||||
    local  = st._is_local[Xp][ss];
 | 
			
		||||
    perm   = st._permute[Xp][ss];
 | 
			
		||||
 | 
			
		||||
    ptype  = st._permute_type[Xp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xp),&chi());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = st._offsets [Yp][ss];
 | 
			
		||||
    local  = st._is_local[Yp][ss];
 | 
			
		||||
    perm   = st._permute[Yp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Yp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Yp),&chi());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = st._offsets [Zp][ss];
 | 
			
		||||
    local  = st._is_local[Zp][ss];
 | 
			
		||||
    perm   = st._permute[Zp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zp),&chi());
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = st._offsets [Tp][ss];
 | 
			
		||||
    local  = st._is_local[Tp][ss];
 | 
			
		||||
    perm   = st._permute[Tp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tp),&chi());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = st._offsets [Xm][ss];
 | 
			
		||||
    local  = st._is_local[Xm][ss];
 | 
			
		||||
    perm   = st._permute[Xm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Xm];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
      spProjXm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xm),&chi());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = st._offsets [Ym][ss];
 | 
			
		||||
    local  = st._is_local[Ym][ss];
 | 
			
		||||
    perm   = st._permute[Ym][ss];
 | 
			
		||||
    ptype  = st._permute_type[Ym];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Ym),&chi());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = st._offsets [Zm][ss];
 | 
			
		||||
    local  = st._is_local[Zm][ss];
 | 
			
		||||
    perm   = st._permute[Zm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zm),&chi());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = st._offsets [Tm][ss];
 | 
			
		||||
    local  = st._is_local[Tm][ss];
 | 
			
		||||
    perm   = st._permute[Tm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tm),&chi());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    vstream(out._odata[ss],result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
			       int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
    vHalfSpinColourVector  tmp;    
 | 
			
		||||
    vHalfSpinColourVector  chi;    
 | 
			
		||||
    vSpinColourVector result;
 | 
			
		||||
    vHalfSpinColourVector Uchi;
 | 
			
		||||
    int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = st._offsets [Xm][ss];
 | 
			
		||||
    local  = st._is_local[Xm][ss];
 | 
			
		||||
    perm   = st._permute[Xm][ss];
 | 
			
		||||
 | 
			
		||||
    ptype  = st._permute_type[Xm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xm),&chi());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = st._offsets [Ym][ss];
 | 
			
		||||
    local  = st._is_local[Ym][ss];
 | 
			
		||||
    perm   = st._permute[Ym][ss];
 | 
			
		||||
    ptype  = st._permute_type[Ym];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Ym),&chi());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = st._offsets [Zm][ss];
 | 
			
		||||
    local  = st._is_local[Zm][ss];
 | 
			
		||||
    perm   = st._permute[Zm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zm),&chi());
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = st._offsets [Tm][ss];
 | 
			
		||||
    local  = st._is_local[Tm][ss];
 | 
			
		||||
    perm   = st._permute[Tm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tm),&chi());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = st._offsets [Xp][ss];
 | 
			
		||||
    local  = st._is_local[Xp][ss];
 | 
			
		||||
    perm   = st._permute[Xp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Xp];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
      spProjXm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xp),&chi());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = st._offsets [Yp][ss];
 | 
			
		||||
    local  = st._is_local[Yp][ss];
 | 
			
		||||
    perm   = st._permute[Yp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Yp];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Yp),&chi());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = st._offsets [Zp][ss];
 | 
			
		||||
    local  = st._is_local[Zp][ss];
 | 
			
		||||
    perm   = st._permute[Zp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zp),&chi());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = st._offsets [Tp][ss];
 | 
			
		||||
    local  = st._is_local[Tp][ss];
 | 
			
		||||
    perm   = st._permute[Tp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tp),&chi());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    vstream(out._odata[ss],result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
 | 
			
		||||
				const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  WilsonCompressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 | 
			
		||||
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
      DhopSiteDag(st,U,comm_buf,sss,in,out);
 | 
			
		||||
        DiracOptHand::DhopSiteDag(st,U,comm_buf,sss,in,out);
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
      DhopSite(st,U,comm_buf,sss,in,out);
 | 
			
		||||
        DiracOpt::DhopSiteDag(st,U,comm_buf,sss,in,out);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
        DiracOptHand::DhopSite(st,U,comm_buf,sss,in,out);
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
        DiracOpt::DhopSite(st,U,comm_buf,sss,in,out);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -6,10 +6,22 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  namespace QCD {
 | 
			
		||||
 | 
			
		||||
  // Should be in header?
 | 
			
		||||
    const int Xp = 0;
 | 
			
		||||
    const int Yp = 1;
 | 
			
		||||
    const int Zp = 2;
 | 
			
		||||
    const int Tp = 3;
 | 
			
		||||
    const int Xm = 4;
 | 
			
		||||
    const int Ym = 5;
 | 
			
		||||
    const int Zm = 6;
 | 
			
		||||
    const int Tm = 7;
 | 
			
		||||
 | 
			
		||||
    class WilsonMatrix : public CheckerBoardedSparseMatrixBase<LatticeFermion>
 | 
			
		||||
    {
 | 
			
		||||
      //NB r=1;
 | 
			
		||||
    public:
 | 
			
		||||
      static int HandOptDslash;
 | 
			
		||||
 | 
			
		||||
      double                        mass;
 | 
			
		||||
      //      GridBase                     *    grid; // Inherited
 | 
			
		||||
      //      GridBase                     *  cbgrid;
 | 
			
		||||
@@ -56,14 +68,6 @@ namespace Grid {
 | 
			
		||||
      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      void DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField &U,
 | 
			
		||||
			const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      // These ones will need to be package intelligently. WilsonType base class
 | 
			
		||||
      // for use by DWF etc..
 | 
			
		||||
      void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		    int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		       int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
      typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 | 
			
		||||
 | 
			
		||||
@@ -71,6 +75,31 @@ namespace Grid {
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    class DiracOpt {
 | 
			
		||||
    public:
 | 
			
		||||
      // These ones will need to be package intelligently. WilsonType base class
 | 
			
		||||
      // for use by DWF etc..
 | 
			
		||||
      static void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		    int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      static void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		       int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
    class DiracOptHand {
 | 
			
		||||
    public:
 | 
			
		||||
      // These ones will need to be package intelligently. WilsonType base class
 | 
			
		||||
      // for use by DWF etc..
 | 
			
		||||
      static void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		    int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      static void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		       int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user