mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Verified
This commit is contained in:
		@@ -338,12 +338,12 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
 | 
			
		||||
      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), sss, sss, in, out);
 | 
			
		||||
      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
 | 
			
		||||
      Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), sss, sss, in, out);
 | 
			
		||||
      Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
@@ -228,9 +228,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
						    const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
 | 
			
		||||
  int LLs = in._grid->_rdimensions[0];
 | 
			
		||||
  
 | 
			
		||||
  st.HaloExchange(in,compressor);
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
@@ -241,28 +239,11 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
#if 1
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
 | 
			
		||||
      int sU=ss;
 | 
			
		||||
	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
 | 
			
		||||
    }
 | 
			
		||||
#else
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
    for(int i=0;i<10;i++){
 | 
			
		||||
      int len = U._grid->oSites();
 | 
			
		||||
      int me,mywork,myoff;
 | 
			
		||||
      GridThread::GetWorkBarrier(len,me, mywork,myoff);
 | 
			
		||||
      for (int ss = myoff; ss < myoff+mywork; ss++) {
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	int sF=LLs*sU; 
 | 
			
		||||
	  Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
 | 
			
		||||
      }
 | 
			
		||||
      GridThread::ThreadBarrier();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -186,32 +186,31 @@ template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
 | 
			
		||||
						  SiteSpinor *buf, int LLs, int sU,
 | 
			
		||||
						  const FermionField &in, FermionField &out) {
 | 
			
		||||
  int dag(1);
 | 
			
		||||
  SiteSpinor naik;
 | 
			
		||||
  SiteSpinor naive;
 | 
			
		||||
  int oneLink  =0;
 | 
			
		||||
  int threeLink=1;
 | 
			
		||||
  Real scale;
 | 
			
		||||
  if(dag) scale = -1.0;
 | 
			
		||||
  else    scale = 1.0;
 | 
			
		||||
 | 
			
		||||
  int dag=1;
 | 
			
		||||
  switch(Opt) {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
  //FIXME; move the sign into the Asm routine
 | 
			
		||||
  case OptInlineAsm:
 | 
			
		||||
    DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
 | 
			
		||||
    for(int s=0;s<LLs;s++) {
 | 
			
		||||
      int sF=s+LLs*sU;
 | 
			
		||||
      out._odata[sF]=-out._odata[sF];
 | 
			
		||||
    }
 | 
			
		||||
    break;
 | 
			
		||||
#endif
 | 
			
		||||
  case OptHandUnroll:
 | 
			
		||||
    DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    break;
 | 
			
		||||
  case OptGeneric:
 | 
			
		||||
    for(int s=0;s<LLs;s++){
 | 
			
		||||
 | 
			
		||||
       int sF=s+LLs*sU;
 | 
			
		||||
 | 
			
		||||
       DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
 | 
			
		||||
       DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
 | 
			
		||||
       out._odata[sF] =scale*(naive+naik); 
 | 
			
		||||
       out._odata[sF] =-naive-naik; 
 | 
			
		||||
     }
 | 
			
		||||
    break;
 | 
			
		||||
  default:
 | 
			
		||||
@@ -223,17 +222,13 @@ void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
 | 
			
		||||
				      SiteSpinor *buf, int LLs,
 | 
			
		||||
				      int sU, const FermionField &in, FermionField &out) {
 | 
			
		||||
 | 
			
		||||
   int dag(0);
 | 
			
		||||
 | 
			
		||||
     int oneLink  =0;
 | 
			
		||||
     int threeLink=1;
 | 
			
		||||
     SiteSpinor naik;
 | 
			
		||||
     SiteSpinor naive;
 | 
			
		||||
     static int once;
 | 
			
		||||
     int sF=LLs*sU; 
 | 
			
		||||
 | 
			
		||||
				      int sU, const FermionField &in, FermionField &out) 
 | 
			
		||||
{
 | 
			
		||||
  int oneLink  =0;
 | 
			
		||||
  int threeLink=1;
 | 
			
		||||
  SiteSpinor naik;
 | 
			
		||||
  SiteSpinor naive;
 | 
			
		||||
  int dag=0;
 | 
			
		||||
  switch(Opt) {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
  case OptInlineAsm:
 | 
			
		||||
@@ -241,22 +236,23 @@ void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double
 | 
			
		||||
    break;
 | 
			
		||||
#endif
 | 
			
		||||
  case OptHandUnroll:
 | 
			
		||||
       DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
  break;
 | 
			
		||||
    DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    break;
 | 
			
		||||
  case OptGeneric:
 | 
			
		||||
 | 
			
		||||
     for(int s=0;s<LLs;s++){
 | 
			
		||||
         DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
 | 
			
		||||
         DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
 | 
			
		||||
         out._odata[sF] =naive+naik;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    for(int s=0;s<LLs;s++){
 | 
			
		||||
      int sF=LLs*sU+s;
 | 
			
		||||
      //      assert(sF<in._odata.size());
 | 
			
		||||
      //      assert(sU< U._odata.size());
 | 
			
		||||
      //      assert(sF>=0);      assert(sU>=0);
 | 
			
		||||
      DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
 | 
			
		||||
      DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
 | 
			
		||||
      out._odata[sF] =naive+naik;
 | 
			
		||||
    }
 | 
			
		||||
    break;
 | 
			
		||||
  default:
 | 
			
		||||
    assert(0);
 | 
			
		||||
    break;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
 
 | 
			
		||||
@@ -57,11 +57,11 @@ public:
 | 
			
		||||
		     int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
   void DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
 | 
			
		||||
   void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
 | 
			
		||||
		     int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
 | 
			
		||||
 | 
			
		||||
   void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
 | 
			
		||||
		     int Lls, int sU, const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
 | 
			
		||||
		     int LLs, int sU, const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
 | 
			
		||||
   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
 | 
			
		||||
			 int LLs, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
 
 | 
			
		||||
@@ -517,7 +517,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
       : : "r" (a0) : "%r8" );						\
 | 
			
		||||
 | 
			
		||||
#define PF_GAUGE_XYZT(a0)							
 | 
			
		||||
#define PF_GAUGE_XYZTa(a0)							\
 | 
			
		||||
#define PF_GAUGE_XYZTa(a0)						\
 | 
			
		||||
  asm (									\
 | 
			
		||||
       "movq %0, %%r8 \n\t"						\
 | 
			
		||||
       VPREFETCH1(0,%%r8)						\
 | 
			
		||||
@@ -578,10 +578,10 @@ namespace QCD {
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
					      DoubledGaugeField &U,
 | 
			
		||||
					      DoubledGaugeField &UUU,
 | 
			
		||||
					      SiteSpinor *buf, int LLs,
 | 
			
		||||
					      int sU, const FermionField &in, FermionField &out) 
 | 
			
		||||
					 DoubledGaugeField &U,
 | 
			
		||||
					 DoubledGaugeField &UUU,
 | 
			
		||||
					 SiteSpinor *buf, int LLs,
 | 
			
		||||
					 int sU, const FermionField &in, FermionField &out) 
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
 | 
			
		||||
@@ -611,35 +611,35 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  o0 = SE0->_offset;							\
 | 
			
		||||
  l0 = SE0->_is_local;							\
 | 
			
		||||
  p0 = SE0->_permute;							\
 | 
			
		||||
  CONDITIONAL_MOVE(l0,o0,addr0);							\
 | 
			
		||||
  CONDITIONAL_MOVE(l0,o0,addr0);					\
 | 
			
		||||
  PF_CHI(addr0);							\
 | 
			
		||||
									\
 | 
			
		||||
  SE1=st.GetEntry(ptype,Y+skew,sF);			\
 | 
			
		||||
  o1 = SE1->_offset;					\
 | 
			
		||||
  l1 = SE1->_is_local;					\
 | 
			
		||||
  p1 = SE1->_permute;					\
 | 
			
		||||
  CONDITIONAL_MOVE(l1,o1,addr1);							\
 | 
			
		||||
  									\
 | 
			
		||||
  SE1=st.GetEntry(ptype,Y+skew,sF);					\
 | 
			
		||||
  o1 = SE1->_offset;							\
 | 
			
		||||
  l1 = SE1->_is_local;							\
 | 
			
		||||
  p1 = SE1->_permute;							\
 | 
			
		||||
  CONDITIONAL_MOVE(l1,o1,addr1);					\
 | 
			
		||||
  PF_CHI(addr1);							\
 | 
			
		||||
									\
 | 
			
		||||
  SE2=st.GetEntry(ptype,Z+skew,sF);			\
 | 
			
		||||
  o2 = SE2->_offset;					\
 | 
			
		||||
  l2 = SE2->_is_local;					\
 | 
			
		||||
  p2 = SE2->_permute;					\
 | 
			
		||||
  CONDITIONAL_MOVE(l2,o2,addr2);							\
 | 
			
		||||
  									\
 | 
			
		||||
  SE2=st.GetEntry(ptype,Z+skew,sF);					\
 | 
			
		||||
  o2 = SE2->_offset;							\
 | 
			
		||||
  l2 = SE2->_is_local;							\
 | 
			
		||||
  p2 = SE2->_permute;							\
 | 
			
		||||
  CONDITIONAL_MOVE(l2,o2,addr2);					\
 | 
			
		||||
  PF_CHI(addr2);							\
 | 
			
		||||
									\
 | 
			
		||||
  SE3=st.GetEntry(ptype,T+skew,sF);			\
 | 
			
		||||
  o3 = SE3->_offset;					\
 | 
			
		||||
  l3 = SE3->_is_local;					\
 | 
			
		||||
  p3 = SE3->_permute;					\
 | 
			
		||||
  CONDITIONAL_MOVE(l3,o3,addr3);							\
 | 
			
		||||
  									\
 | 
			
		||||
  SE3=st.GetEntry(ptype,T+skew,sF);					\
 | 
			
		||||
  o3 = SE3->_offset;							\
 | 
			
		||||
  l3 = SE3->_is_local;							\
 | 
			
		||||
  p3 = SE3->_permute;							\
 | 
			
		||||
  CONDITIONAL_MOVE(l3,o3,addr3);					\
 | 
			
		||||
  PF_CHI(addr3);							\
 | 
			
		||||
  									\
 | 
			
		||||
  gauge0 =(uint64_t)&UU._odata[sU]( X ); \
 | 
			
		||||
  gauge1 =(uint64_t)&UU._odata[sU]( Y ); \
 | 
			
		||||
  gauge2 =(uint64_t)&UU._odata[sU]( Z ); \
 | 
			
		||||
  gauge0 =(uint64_t)&UU._odata[sU]( X );				\
 | 
			
		||||
  gauge1 =(uint64_t)&UU._odata[sU]( Y );				\
 | 
			
		||||
  gauge2 =(uint64_t)&UU._odata[sU]( Z );				\
 | 
			
		||||
  gauge3 =(uint64_t)&UU._odata[sU]( T ); 
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  // This is the single precision 5th direction vectorised kernel
 | 
			
		||||
#include <simd/Intel512single.h>
 | 
			
		||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
@@ -762,6 +762,14 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 | 
			
		||||
  VPERM0(Chi_11,Chi_11)	\
 | 
			
		||||
  VPERM0(Chi_12,Chi_12) );
 | 
			
		||||
 | 
			
		||||
// PERMUTE01: conditional permute step for the first pair of loaded neighbours.
// Expands to PERMUTE_DIR3 when p0 is non-zero and PERMUTE_DIR2 when p1 is non-zero.
// NOTE(review): p0/p1 look like the SE0/SE1 `_permute` flags assigned in the
// stencil-preparation macro earlier in this file -- confirm against the full source.
#define PERMUTE01 \
 | 
			
		||||
  if ( p0 ) { PERMUTE_DIR3; }\
 | 
			
		||||
  if ( p1 ) { PERMUTE_DIR2; }
 | 
			
		||||
 | 
			
		||||
// PERMUTE23: conditional permute step for the second pair of loaded neighbours.
// Expands to PERMUTE_DIR1 when p2 is non-zero and PERMUTE_DIR0 when p3 is non-zero.
// NOTE(review): p2/p3 look like the SE2/SE3 `_permute` flags assigned in the
// stencil-preparation macro earlier in this file -- confirm against the full source.
#define PERMUTE23 \
 | 
			
		||||
  if ( p2 ) { PERMUTE_DIR1; }\
 | 
			
		||||
  if ( p3 ) { PERMUTE_DIR0; }
 | 
			
		||||
 | 
			
		||||
  // This is the single precision 5th direction vectorised kernel
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512single.h>
 | 
			
		||||
@@ -785,35 +793,50 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 | 
			
		||||
  StencilEntry *SE2;
 | 
			
		||||
  StencilEntry *SE3;
 | 
			
		||||
 | 
			
		||||
   for(int s=0;s<LLs;s++){
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
    // Xp, Yp, Zp, Tp
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,0,U);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
 | 
			
		||||
    PREPARE(Xm,Ym,Zm,Tm,0,U);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_ADD_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 | 
			
		||||
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_ADD_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
    
 | 
			
		||||
    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_ADD_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
 | 
			
		||||
    addr0 = (uint64_t) &out._odata[sF];
 | 
			
		||||
    REDUCE(addr0);
 | 
			
		||||
   }
 | 
			
		||||
    REDUCEa(addr0);
 | 
			
		||||
  }
 | 
			
		||||
#else 
 | 
			
		||||
    assert(0);
 | 
			
		||||
  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512double.h>
 | 
			
		||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
								    DoubledGaugeField &U,
 | 
			
		||||
@@ -835,31 +858,47 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
 | 
			
		||||
  StencilEntry *SE2;
 | 
			
		||||
  StencilEntry *SE3;
 | 
			
		||||
 | 
			
		||||
   for(int s=0;s<LLs;s++){
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
    // Xp, Yp, Zp, Tp
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,0,U);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_LS(gauge0,gauge1,gauge2,gauge3);  
 | 
			
		||||
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
    
 | 
			
		||||
    PREPARE(Xm,Ym,Zm,Tm,0,U);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);  
 | 
			
		||||
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_ADD_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
    
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,8,UUU);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 | 
			
		||||
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_ADD_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
    
 | 
			
		||||
    PREPARE(Xm,Ym,Zm,Tm,8,UUU);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
    MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 | 
			
		||||
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
    PERMUTE01;
 | 
			
		||||
    MULT_ADD_XYZT(gauge0,gauge1);
 | 
			
		||||
    LOAD_CHIa(addr2,addr3);
 | 
			
		||||
    PERMUTE23;
 | 
			
		||||
    MULT_ADD_XYZT(gauge2,gauge3);  
 | 
			
		||||
    
 | 
			
		||||
    addr0 = (uint64_t) &out._odata[sF];
 | 
			
		||||
    REDUCE(addr0);
 | 
			
		||||
   }
 | 
			
		||||
    REDUCEa(addr0);
 | 
			
		||||
  }
 | 
			
		||||
#else 
 | 
			
		||||
    assert(0);
 | 
			
		||||
  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -91,34 +91,31 @@ namespace QCD {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
 | 
			
		||||
					   SiteSpinor *buf, int LLs,
 | 
			
		||||
					   int sU, const FermionField &in, FermionField &out, int dag) {
 | 
			
		||||
 | 
			
		||||
    SiteSpinor naik; 
 | 
			
		||||
    SiteSpinor naive;
 | 
			
		||||
    int oneLink  =0;
 | 
			
		||||
    int threeLink=1;
 | 
			
		||||
    int skew(0);
 | 
			
		||||
    Real scale(1.0);
 | 
			
		||||
 | 
			
		||||
    if(dag) scale = -1.0;
 | 
			
		||||
 | 
			
		||||
   for(int s=0;s<LLs;s++){
 | 
			
		||||
   
 | 
			
		||||
     int sF=s+LLs*sU;
 | 
			
		||||
       DhopSiteDepthHandLocal(st,lo,U,buf,sF,sU,in,naive,oneLink);
 | 
			
		||||
       DhopSiteDepthHandLocal(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
 | 
			
		||||
       out._odata[sF] =scale*(naive+naik);
 | 
			
		||||
   }
 | 
			
		||||
   
 | 
			
		||||
 | 
			
		||||
// DhopSiteHand: hand-unrolled kernel entry point for one site index sU.
//
// For every s in [0, LLs) it forms the fermion index sF = s + LLs*sU, evaluates
// the per-site DhopSiteDepthHand overload twice -- once with gauge field U and
// flag oneLink (0), once with gauge field UUU and flag threeLink (1) -- and
// writes scale*(naive+naik) into out._odata[sF].
//
// Parameters:
//   st, lo, buf : forwarded unchanged to the per-site DhopSiteDepthHand overload.
//   U, UUU      : gauge fields used for the first and second per-site call.
//   LLs         : number of s-slices handled per sU (presumably the fifth-dimension
//                 extent; a caller elsewhere in this diff passes 1 -- confirm).
//   sU          : site index used with LLs to form sF.
//   in          : input fermion field, read by the per-site kernel.
//   out         : output fermion field; entries sF in [LLs*sU, LLs*sU+LLs) are overwritten.
//   dag         : when non-zero the result is multiplied by -1.0, otherwise by 1.0.
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
 | 
			
		||||
					  SiteSpinor *buf, int LLs,
 | 
			
		||||
					  int sU, const FermionField &in, FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  // Per-site accumulators: `naik` receives the UUU (threeLink) result,
  // `naive` receives the U (oneLink) result.
  SiteSpinor naik; 
 | 
			
		||||
  SiteSpinor naive;
 | 
			
		||||
  // Values passed as the `threeLink` argument of the per-site kernel.
  int oneLink  =0;
 | 
			
		||||
  int threeLink=1;
 | 
			
		||||
  // NOTE(review): `skew` is not referenced anywhere else in the visible body.
  int skew(0);
 | 
			
		||||
  Real scale(1.0);
 | 
			
		||||
  
 | 
			
		||||
  // The dagger variant only differs by an overall sign on the output.
  if(dag) scale = -1.0;
 | 
			
		||||
  
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    // s is the fastest-running index within the LLs entries belonging to sU.
    int sF=s+LLs*sU;
 | 
			
		||||
    DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
 | 
			
		||||
    DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
 | 
			
		||||
    out._odata[sF] =scale*(naive+naik);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
					   SiteSpinor *buf, int sF,
 | 
			
		||||
					   int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
					       SiteSpinor *buf, int sF,
 | 
			
		||||
					       int sU, const FermionField &in, SiteSpinor &out,int threeLink) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
@@ -300,7 +297,6 @@ void StaggeredKernels<Impl>::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrd
 | 
			
		||||
  vstream(out()()(1),even_1+odd_1);
 | 
			
		||||
  vstream(out()()(2),even_2+odd_2);
 | 
			
		||||
 | 
			
		||||
 }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user