mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Major rework of stencil. Half precision and MPI3 now working.
This commit is contained in:
		@@ -241,13 +241,18 @@ public:
 | 
			
		||||
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> same_node;
 | 
			
		||||
 | 
			
		||||
  WilsonStencil(GridBase *grid,
 | 
			
		||||
		int npoints,
 | 
			
		||||
		int checkerboard,
 | 
			
		||||
		const std::vector<int> &directions,
 | 
			
		||||
		const std::vector<int> &distances)  
 | 
			
		||||
   : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
 | 
			
		||||
  { /*Do nothing*/ };
 | 
			
		||||
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
 | 
			
		||||
      same_node(npoints)
 | 
			
		||||
  { 
 | 
			
		||||
    assert(npoints==8);// or 10 if do naive DWF 5d red black ?
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template < class compressor>
 | 
			
		||||
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
@@ -257,6 +262,7 @@ public:
 | 
			
		||||
    this->CommunicateBegin(reqs);
 | 
			
		||||
    this->CommunicateComplete(reqs);
 | 
			
		||||
    this->CommsMerge(compress);
 | 
			
		||||
    this->CommsMergeSHM(compress);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
@@ -295,23 +301,23 @@ public:
 | 
			
		||||
    int face_idx=0;
 | 
			
		||||
    if ( dag ) { 
 | 
			
		||||
      //	std::cout << " Optimised Dagger compress " <<std::endl;
 | 
			
		||||
      this->HaloGatherDir(source,XpCompress,Xp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YpCompress,Yp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZpCompress,Zp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TpCompress,Tp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,XmCompress,Xm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YmCompress,Ym,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZmCompress,Zm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TmCompress,Tm,face_idx);
 | 
			
		||||
      same_node[Xp]=this->HaloGatherDir(source,XpCompress,Xp,face_idx);
 | 
			
		||||
      same_node[Yp]=this->HaloGatherDir(source,YpCompress,Yp,face_idx);
 | 
			
		||||
      same_node[Zp]=this->HaloGatherDir(source,ZpCompress,Zp,face_idx);
 | 
			
		||||
      same_node[Tp]=this->HaloGatherDir(source,TpCompress,Tp,face_idx);
 | 
			
		||||
      same_node[Xm]=this->HaloGatherDir(source,XmCompress,Xm,face_idx);
 | 
			
		||||
      same_node[Ym]=this->HaloGatherDir(source,YmCompress,Ym,face_idx);
 | 
			
		||||
      same_node[Zm]=this->HaloGatherDir(source,ZmCompress,Zm,face_idx);
 | 
			
		||||
      same_node[Tm]=this->HaloGatherDir(source,TmCompress,Tm,face_idx);
 | 
			
		||||
    } else {
 | 
			
		||||
      this->HaloGatherDir(source,XmCompress,Xp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YmCompress,Yp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZmCompress,Zp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TmCompress,Tp,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,XpCompress,Xm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,YpCompress,Ym,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,ZpCompress,Zm,face_idx);
 | 
			
		||||
      this->HaloGatherDir(source,TpCompress,Tm,face_idx);
 | 
			
		||||
      same_node[Xp]=this->HaloGatherDir(source,XmCompress,Xp,face_idx);
 | 
			
		||||
      same_node[Yp]=this->HaloGatherDir(source,YmCompress,Yp,face_idx);
 | 
			
		||||
      same_node[Zp]=this->HaloGatherDir(source,ZmCompress,Zp,face_idx);
 | 
			
		||||
      same_node[Tp]=this->HaloGatherDir(source,TmCompress,Tp,face_idx);
 | 
			
		||||
      same_node[Xm]=this->HaloGatherDir(source,XpCompress,Xm,face_idx);
 | 
			
		||||
      same_node[Ym]=this->HaloGatherDir(source,YpCompress,Ym,face_idx);
 | 
			
		||||
      same_node[Zm]=this->HaloGatherDir(source,ZpCompress,Zm,face_idx);
 | 
			
		||||
      same_node[Tm]=this->HaloGatherDir(source,TpCompress,Tm,face_idx);
 | 
			
		||||
    }
 | 
			
		||||
    this->face_table_computed=1;
 | 
			
		||||
    assert(this->u_comm_offset==this->_unified_buffer_size);
 | 
			
		||||
 
 | 
			
		||||
@@ -118,48 +118,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  ImportGauge(_Umu);
 | 
			
		||||
}
 | 
			
		||||
  /*
 | 
			
		||||
template<class Impl>
 | 
			
		||||
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 | 
			
		||||
               GridCartesian         &FiveDimGrid,
 | 
			
		||||
               GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
               GridCartesian         &FourDimGrid,
 | 
			
		||||
               RealD _M5,const ImplParams &p) :
 | 
			
		||||
{
 | 
			
		||||
  int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
  // some assertions
 | 
			
		||||
  assert(FiveDimGrid._ndimension==5);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
 | 
			
		||||
  assert(FourDimGrid._ndimension==4);
 | 
			
		||||
 | 
			
		||||
  // Dimension zero of the five-d is the Ls direction
 | 
			
		||||
  Ls=FiveDimGrid._fdimensions[0];
 | 
			
		||||
  assert(FiveDimGrid._processors[0]         ==1);
 | 
			
		||||
  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
 | 
			
		||||
 | 
			
		||||
  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._processors[0] ==1);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 | 
			
		||||
 | 
			
		||||
  // Other dimensions must match the decomposition of the four-D fields 
 | 
			
		||||
  for(int d=0;d<4;d++){
 | 
			
		||||
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
 | 
			
		||||
 | 
			
		||||
    assert(FourDimGrid._simd_layout[d]==1);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
 | 
			
		||||
 | 
			
		||||
    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
 | 
			
		||||
    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
 | 
			
		||||
    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
  }
 | 
			
		||||
}  
 | 
			
		||||
  */
 | 
			
		||||
     
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::Report(void)
 | 
			
		||||
@@ -415,6 +373,10 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
 | 
			
		||||
  // Rely on async comms; start comms before merge of local data
 | 
			
		||||
  st.CommunicateBegin(reqs);
 | 
			
		||||
  st.CommsMergeSHM(compressor);
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  { 
 | 
			
		||||
    int nthreads = omp_get_num_threads();
 | 
			
		||||
@@ -426,7 +388,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
 | 
			
		||||
    if ( me == 0 ) {
 | 
			
		||||
      DhopCommTime-=usecond();
 | 
			
		||||
      st.CommunicateBegin(reqs);
 | 
			
		||||
      st.CommunicateComplete(reqs);
 | 
			
		||||
      DhopCommTime+=usecond();
 | 
			
		||||
    } else { 
 | 
			
		||||
@@ -442,10 +403,13 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
  st.CommsMerge(compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  // Load imbalance alert. Should use dynamic schedule OMP for loop
 | 
			
		||||
  // Perhaps create a list of only those sites with face work, and 
 | 
			
		||||
  // load balance process the list.
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
    int nthreads = omp_get_num_threads();
 | 
			
		||||
    int me = omp_get_thread_num();
 | 
			
		||||
    int me       = omp_get_thread_num();
 | 
			
		||||
    int myoff, mywork;
 | 
			
		||||
 | 
			
		||||
    GridThread::GetWork(len,me,mywork,myoff,nthreads);
 | 
			
		||||
 
 | 
			
		||||
@@ -36,62 +36,78 @@ namespace QCD {
 | 
			
		||||
  int WilsonKernelsStatic::Opt   = WilsonKernelsStatic::OptGeneric;
 | 
			
		||||
  int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
 | 
			
		||||
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#include <spi/include/kernel/location.h>
 | 
			
		||||
#include <spi/include/l1p/types.h>
 | 
			
		||||
#include <hwi/include/bqc/l1p_mmio.h>
 | 
			
		||||
#include <hwi/include/bqc/A2_inlines.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/**
 * Writes a prefetch configuration word to the L1P_CFG_PF_USR register.
 *
 * The body is compiled only when QPX is defined; in every other build the
 * function does nothing.
 *
 * @param mode  non-zero: instruction-fetch depth 0 / footprint 1, stream
 *              establishment flags, adaptive throttle 0xF, data-fetch
 *              depth 2 / footprint 3.
 *              zero: data-fetch depth 1 / footprint 2, the same
 *              instruction-fetch and stream flags, plus
 *              pf_stream_prefetch_enable.
 */
void bgq_l1p_optimisation(int mode)
{
#ifdef QPX
#undef L1P_CFG_PF_USR
#define L1P_CFG_PF_USR  (0x3fde8000108ll)   /*  (64 bit reg, 23 bits wide, user/unpriv) */

  uint64_t cfg_pf_usr;
  if ( mode ) {
    cfg_pf_usr =
        L1P_CFG_PF_USR_ifetch_depth(0)
      | L1P_CFG_PF_USR_ifetch_max_footprint(1)
      | L1P_CFG_PF_USR_pf_stream_est_on_dcbt
      | L1P_CFG_PF_USR_pf_stream_establish_enable
      | L1P_CFG_PF_USR_pf_stream_optimistic
      | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ;
    // The data-fetch setting below was written for sizeof(Float)==sizeof(double)
    // and is currently applied unconditionally. The disabled alternative was:
    //    cfg_pf_usr |=  L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2)   ;
    cfg_pf_usr |=  L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3)   ;
  } else {
    cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1)
      | L1P_CFG_PF_USR_dfetch_max_footprint(2)
      | L1P_CFG_PF_USR_ifetch_depth(0)
      | L1P_CFG_PF_USR_ifetch_max_footprint(1)
      | L1P_CFG_PF_USR_pf_stream_est_on_dcbt
      | L1P_CFG_PF_USR_pf_stream_establish_enable
      | L1P_CFG_PF_USR_pf_stream_optimistic
      | L1P_CFG_PF_USR_pf_stream_prefetch_enable;
  }
  // The target is a fixed register address, not ordinary memory: store through
  // a volatile pointer so the compiler must emit the write.
  *reinterpret_cast<volatile uint64_t *>(L1P_CFG_PF_USR) = cfg_pf_usr;
#else
  (void)mode;  // nothing to configure when QPX is not defined
#endif
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
// Generic implementation; move to different file?
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
// One stencil leg, all sites ("comms then compute").
//   Dir    : stencil direction index passed to GetEntry and multLink
//   spProj : spin projector called as spProj(dst, src)
//   Recon  : reconstruction called as Recon(result, Uchi)
// Expects these names in the expanding scope: st, SE, ptype, sF, sU, in, buf,
// U, tmp, chi, chi_p, Uchi, result.
// Local neighbour: project (and permute when SE->_permute is set) into chi.
// Non-local neighbour: read the half spinor from buf instead.
// Expands to several statements, so do not use it as the unbraced body of an
// if/else.
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
  SE = st.GetEntry(ptype, Dir, sF);				\
  if (SE->_is_local) {						\
    chi_p = &chi;						\
    if (SE->_permute) {						\
      spProj(tmp, in._odata[SE->_offset]);			\
      permute(chi, tmp, ptype);					\
    } else {							\
      spProj(chi, in._odata[SE->_offset]);			\
    }								\
  } else {							\
    chi_p = &buf[SE->_offset];					\
  }								\
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
  Recon(result, Uchi);
 | 
			
		||||
  
 | 
			
		||||
// One stencil leg, interior part only.
// Parameters and required scope names are the same as for
// GENERIC_STENCIL_LEG; st.same_node[Dir] is read as well.
// The leg contributes when the neighbour is local, or when it is non-local
// but st.same_node[Dir] is set (data then comes from buf). Otherwise nothing
// is multiplied or reconstructed and chi_p is left untouched.
// Expands to several statements, so do not use it as the unbraced body of an
// if/else.
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
  if (SE->_is_local) {						\
    chi_p = &chi;						\
    if (SE->_permute) {						\
      spProj(tmp, in._odata[SE->_offset]);			\
      permute(chi, tmp, ptype);					\
    } else {							\
      spProj(chi, in._odata[SE->_offset]);			\
    }								\
  } else if ( st.same_node[Dir] ) {				\
      chi_p = &buf[SE->_offset];				\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
    Recon(result, Uchi);					\
  }
 | 
			
		||||
 | 
			
		||||
// One stencil leg, exterior part only.
// The leg contributes only when the neighbour is neither local nor flagged in
// st.same_node[Dir]; the half spinor is then read from buf and nmu is
// incremented so the caller can tell whether any exterior leg was applied.
// spProj is accepted for signature symmetry with the other leg macros but is
// not used. Expects in scope: st, SE, ptype, sF, sU, buf, U, chi_p, Uchi,
// result, nmu.
// Expands to several statements, so do not use it as the unbraced body of an
// if/else.
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    chi_p = &buf[SE->_offset];					\
    Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st);	\
    Recon(result, Uchi);					\
    nmu++;							\
  }
 | 
			
		||||
 | 
			
		||||
// Single-direction leg: does work only when the runtime value `gamma` equals
// Dir. SE is NOT fetched here; the caller must have set it already.
// Local neighbours are projected (and permuted when SE->_permute is set) into
// chi; non-local ones are copied from buf into chi. multLink is called with
// the caller's `dir` variable, not with Dir.
// Expects in scope: gamma, dir, SE, ptype, sU, in, buf, U, st, tmp, chi,
// Uchi, result.
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
  if (gamma == Dir) {						\
    if (SE->_is_local && SE->_permute) {			\
      spProj(tmp, in._odata[SE->_offset]);			\
      permute(chi, tmp, ptype);					\
    } else if (SE->_is_local) {					\
      spProj(chi, in._odata[SE->_offset]);			\
    } else {							\
      chi = buf[SE->_offset];					\
    }								\
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);	\
    Recon(result, Uchi);					\
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // All legs kernels ; comms then compute
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
						     SiteHalfSpinor *buf, int sF,
 | 
			
		||||
						     int sU, const FermionField &in, FermionField &out,
 | 
			
		||||
						     int interior,int exterior) {
 | 
			
		||||
					     SiteHalfSpinor *buf, int sF,
 | 
			
		||||
					     int sU, const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor tmp;
 | 
			
		||||
  SiteHalfSpinor chi;
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
@@ -100,174 +116,22 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Xp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjXp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st);
 | 
			
		||||
  spReconXp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Yp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjYp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st);
 | 
			
		||||
  accumReconYp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Zp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjZp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st);
 | 
			
		||||
  accumReconZp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Tp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjTp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st);
 | 
			
		||||
  accumReconTp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Xm, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjXm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st);
 | 
			
		||||
  accumReconXm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Ym, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjYm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st);
 | 
			
		||||
  accumReconYm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Zm, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjZm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st);
 | 
			
		||||
  accumReconZm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Tm, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjTm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st);
 | 
			
		||||
  accumReconTm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
 | 
			
		||||
  vstream(out._odata[sF], result);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Need controls to do interior, exterior, or both
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
						  SiteHalfSpinor *buf, int sF,
 | 
			
		||||
						  int sU, const FermionField &in, FermionField &out,int interior,int exterior) {
 | 
			
		||||
					  SiteHalfSpinor *buf, int sF,
 | 
			
		||||
					  int sU, const FermionField &in, FermionField &out) 
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor tmp;
 | 
			
		||||
  SiteHalfSpinor chi;
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
@@ -276,168 +140,123 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, Do
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Xm, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjXp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st);
 | 
			
		||||
  spReconXp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Ym, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjYp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st);
 | 
			
		||||
  accumReconYp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Zm, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjZp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st);
 | 
			
		||||
  accumReconZp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Tm, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjTp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st);
 | 
			
		||||
  accumReconTp(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Xp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjXm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st);
 | 
			
		||||
  accumReconXm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Yp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjYm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st);
 | 
			
		||||
  accumReconYm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Zp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjZm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st);
 | 
			
		||||
  accumReconZm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE = st.GetEntry(ptype, Tp, sF);
 | 
			
		||||
 | 
			
		||||
  if (SE->_is_local) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if (SE->_permute) {
 | 
			
		||||
      spProjTm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p = &buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st);
 | 
			
		||||
  accumReconTm(result, Uchi);
 | 
			
		||||
 | 
			
		||||
  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
 | 
			
		||||
  vstream(out._odata[sF], result);
 | 
			
		||||
};
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Interior kernels
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
						SiteHalfSpinor *buf, int sF,
 | 
			
		||||
						int sU, const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor tmp;
 | 
			
		||||
  SiteHalfSpinor chi;
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  result=zero;
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
 | 
			
		||||
  vstream(out._odata[sF], result);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
					     SiteHalfSpinor *buf, int sF,
 | 
			
		||||
					     int sU, const FermionField &in, FermionField &out) 
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor tmp;
 | 
			
		||||
  SiteHalfSpinor chi;
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
 | 
			
		||||
  vstream(out._odata[sF], result);
 | 
			
		||||
};
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Exterior kernels
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
						SiteHalfSpinor *buf, int sF,
 | 
			
		||||
						int sU, const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor tmp;
 | 
			
		||||
  SiteHalfSpinor chi;
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
 | 
			
		||||
  if ( nmu ) { 
 | 
			
		||||
    out._odata[sF] = out._odata[sF] + result; 
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
					     SiteHalfSpinor *buf, int sF,
 | 
			
		||||
					     int sU, const FermionField &in, FermionField &out) 
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor tmp;
 | 
			
		||||
  SiteHalfSpinor chi;
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
 | 
			
		||||
  if ( nmu ) { 
 | 
			
		||||
    out._odata[sF] = out._odata[sF] + result; 
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
 | 
			
		||||
@@ -451,119 +270,14 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  SE = st.GetEntry(ptype, dir, sF);
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  if (gamma == Xp) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjXp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjXp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconXp(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Yp
 | 
			
		||||
  if (gamma == Yp) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjYp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjYp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconYp(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  if (gamma == Zp) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjZp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjZp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconZp(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  if (gamma == Tp) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjTp(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjTp(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconTp(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Xm
 | 
			
		||||
  if (gamma == Xm) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjXm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjXm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconXm(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Ym
 | 
			
		||||
  if (gamma == Ym) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjYm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjYm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconYm(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  if (gamma == Zm) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjZm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjZm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconZm(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  if (gamma == Tm) {
 | 
			
		||||
    if (SE->_is_local && SE->_permute) {
 | 
			
		||||
      spProjTm(tmp, in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi, tmp, ptype);
 | 
			
		||||
    } else if (SE->_is_local) {
 | 
			
		||||
      spProjTm(chi, in._odata[SE->_offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      chi = buf[SE->_offset];
 | 
			
		||||
    }
 | 
			
		||||
    Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
 | 
			
		||||
    spReconTm(result, Uchi);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
 | 
			
		||||
  vstream(out._odata[sF], result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -34,8 +34,6 @@ directory
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
void bgq_l1p_optimisation(int mode);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Helper routines that implement Wilson stencil for a single site.
 | 
			
		||||
  // Common to both the WilsonFermion and WilsonFermion5D
 | 
			
		||||
@@ -44,9 +42,8 @@ class WilsonKernelsStatic {
 | 
			
		||||
 public:
 | 
			
		||||
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
 | 
			
		||||
  enum { CommsAndCompute, CommsThenCompute };
 | 
			
		||||
  // S-direction is INNERMOST and takes no part in the parity.
 | 
			
		||||
  static int Opt;  // these are a temporary hack
 | 
			
		||||
  static int Comms;  // these are a temporary hack
 | 
			
		||||
  static int Opt;  
 | 
			
		||||
  static int Comms;
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
 | 
			
		||||
@@ -75,7 +72,7 @@ public:
 | 
			
		||||
    case OptHandUnroll:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  if( exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior);
 | 
			
		||||
	  if( exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
@@ -84,7 +81,10 @@ public:
 | 
			
		||||
    case OptGeneric:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  if( exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior);
 | 
			
		||||
	  if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else assert(0);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
@@ -99,11 +99,14 @@ public:
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
 | 
			
		||||
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
 | 
			
		||||
	   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
 | 
			
		||||
    // no kernel choice  
 | 
			
		||||
    for (int site = 0; site < Ns; site++) {
 | 
			
		||||
      for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	if( exterior) WilsonKernels<Impl>::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior);
 | 
			
		||||
	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else assert(0);
 | 
			
		||||
	sF++;
 | 
			
		||||
      }
 | 
			
		||||
      sU++;
 | 
			
		||||
@@ -113,8 +116,8 @@ public:
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
 | 
			
		||||
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
 | 
			
		||||
 | 
			
		||||
	      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
 | 
			
		||||
{
 | 
			
		||||
    bgq_l1p_optimisation(1);
 | 
			
		||||
    switch(Opt) {
 | 
			
		||||
#if defined(AVX512) || defined (QPX)
 | 
			
		||||
@@ -128,7 +131,7 @@ public:
 | 
			
		||||
    case OptHandUnroll:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  if( exterior) WilsonKernels<Impl>::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior);
 | 
			
		||||
	  if( exterior) WilsonKernels<Impl>::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
@@ -137,7 +140,10 @@ public:
 | 
			
		||||
    case OptGeneric:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  if( exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior);
 | 
			
		||||
	  if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else if (interior)     WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else assert(0);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
@@ -156,7 +162,10 @@ public:
 | 
			
		||||
 | 
			
		||||
    for (int site = 0; site < Ns; site++) {
 | 
			
		||||
      for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	if( exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior);
 | 
			
		||||
	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else if (interior)     WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else assert(0);
 | 
			
		||||
	sF++;
 | 
			
		||||
      }
 | 
			
		||||
      sU++;
 | 
			
		||||
@@ -169,35 +178,47 @@ public:
 | 
			
		||||
private:
 | 
			
		||||
     // Specialised variants
 | 
			
		||||
  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			       int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
 | 
			
		||||
		       int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
  void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
				  int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
 | 
			
		||||
			  int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			  int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
  void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			     int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			  int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
  void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			     int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
		   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
		      int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
				 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
			 int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			    int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
 | 
			
		||||
		    int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
			       int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
 | 
			
		||||
		       int sF, int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -307,55 +307,106 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
  result_31-= UChi_11;	\
 | 
			
		||||
  result_32-= UChi_12;
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
// One complete stencil leg of the hand-unrolled kernel.
//
//   PROJ  : statement(s) run after LOAD_CHIMU when the neighbour is local
//   PERM  : argument forwarded to PERMUTE_DIR when the entry asks for a permute
//   DIR   : stencil direction, used for the entry lookup and for MULT_2SPIN
//   RECON : statement(s) run unconditionally after MULT_2SPIN
//
// The expansion assigns the caller's SE, ptype, offset, local and perm and
// reads st and ss from the enclosing scope. A non-local neighbour is fetched
// with LOAD_CHI instead of being loaded and projected.
// The expansion is a plain statement sequence (not wrapped in do/while), so
// it must not be used as the unbraced body of an if/else.
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)   \
  SE     = st.GetEntry(ptype,DIR,ss);           \
  offset = SE->_offset;                         \
  local  = SE->_is_local;                       \
  perm   = SE->_permute;                        \
  if ( !local ) {                               \
    LOAD_CHI;                                   \
  } else {                                      \
    LOAD_CHIMU;                                 \
    PROJ;                                       \
    if ( perm ) { PERMUTE_DIR(PERM); }          \
  }                                             \
  { MULT_2SPIN(DIR); }                          \
  RECON;
 | 
			
		||||
 | 
			
		||||
// Interior variant of a hand-unrolled stencil leg.
//
// Parameters have the same roles as in HAND_STENCIL_LEG. The leg only
// contributes (MULT_2SPIN followed by RECON) when the neighbour is local or
// st.same_node[DIR] is set; otherwise nothing beyond the entry lookup and
// the offset/local/perm assignments happens.
// A local neighbour goes through LOAD_CHIMU, PROJ and an optional
// PERMUTE_DIR; a same-node, non-local neighbour is fetched with LOAD_CHI.
// The expansion is a plain statement sequence (not wrapped in do/while), so
// it must not be used as the unbraced body of an if/else.
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)       \
  SE     = st.GetEntry(ptype,DIR,ss);                   \
  offset = SE->_offset;                                 \
  local  = SE->_is_local;                               \
  perm   = SE->_permute;                                \
  if ( local ) {                                        \
    LOAD_CHIMU;                                         \
    PROJ;                                               \
    if ( perm ) { PERMUTE_DIR(PERM); }                  \
  } else if ( st.same_node[DIR] ) {                     \
    LOAD_CHI;                                           \
  }                                                     \
  if ( local || st.same_node[DIR] ) {                   \
    MULT_2SPIN(DIR);                                    \
    RECON;                                              \
  }
 | 
			
		||||
 | 
			
		||||
// Exterior variant of a hand-unrolled stencil leg.
//
//   PROJ, PERM : accepted for signature parity with the other leg macros;
//                not used in the expansion
//   DIR        : stencil direction, used for the entry lookup, the
//                st.same_node test and MULT_2SPIN
//   RECON      : statement(s) run after MULT_2SPIN when the leg contributes
//
// The leg contributes only when the neighbour is neither local nor on the
// same node: it is then fetched with LOAD_CHI, multiplied and reconstructed.
// The expansion assigns the caller's SE, ptype, offset, local and perm and
// reads st and ss from the enclosing scope.
//
// Fix: the body previously spelled the direction as `Dir`, which is not the
// macro parameter (`DIR`), so the argument was never substituted and any
// expansion referred to an undeclared identifier.
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)       \
  SE=st.GetEntry(ptype,DIR,ss);                         \
  offset = SE->_offset;                                 \
  local  = SE->_is_local;                               \
  perm   = SE->_permute;                                \
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {         \
    LOAD_CHI;                                           \
    MULT_2SPIN(DIR);                                    \
    RECON;                                              \
  }
 | 
			
		||||
 | 
			
		||||
// Streams the twelve result_<spin><colour> accumulators of the hand-unrolled
// kernel into out._odata[ss], one vstream call per (spin, colour) component,
// in spin-major order. `out` and the result_* variables are taken from the
// enclosing scope.
#define HAND_RESULT(ss)                         \
  {                                             \
    SiteSpinor & site_out (out._odata[ss]);     \
    vstream(site_out()(0)(0),result_00);        \
    vstream(site_out()(0)(1),result_01);        \
    vstream(site_out()(0)(2),result_02);        \
                                                \
    vstream(site_out()(1)(0),result_10);        \
    vstream(site_out()(1)(1),result_11);        \
    vstream(site_out()(1)(2),result_12);        \
                                                \
    vstream(site_out()(2)(0),result_20);        \
    vstream(site_out()(2)(1),result_21);        \
    vstream(site_out()(2)(2),result_22);        \
                                                \
    vstream(site_out()(3)(0),result_30);        \
    vstream(site_out()(3)(1),result_31);        \
    vstream(site_out()(3)(2),result_32);        \
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 | 
			
		||||
					  int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_00; // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_01;
 | 
			
		||||
  REGISTER Simd result_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_10;
 | 
			
		||||
  REGISTER Simd result_11;
 | 
			
		||||
  REGISTER Simd result_12;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_20;
 | 
			
		||||
  REGISTER Simd result_21;
 | 
			
		||||
  REGISTER Simd result_22;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_30;
 | 
			
		||||
  REGISTER Simd result_31;
 | 
			
		||||
  REGISTER Simd result_32; // 20 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd Chi_01;
 | 
			
		||||
  REGISTER Simd Chi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_10;
 | 
			
		||||
  REGISTER Simd Chi_11;
 | 
			
		||||
  REGISTER Simd Chi_12;   // 14 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd UChi_01;
 | 
			
		||||
  REGISTER Simd UChi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_10;
 | 
			
		||||
  REGISTER Simd UChi_11;
 | 
			
		||||
  REGISTER Simd UChi_12;  // 8 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
			
		||||
  REGISTER Simd U_10;
 | 
			
		||||
  REGISTER Simd U_20;  
 | 
			
		||||
  REGISTER Simd U_01;
 | 
			
		||||
  REGISTER Simd U_11;
 | 
			
		||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
			
		||||
// Declares every Simd local used by the hand-unrolled kernels:
//   result_<s><c> : twelve accumulators (s = 0..3, c = 0..2)
//   Chi_<s><c>    : six half-spinor inputs (s = 0..1, c = 0..2)
//   UChi_<s><c>   : six half-spinor products (s = 0..1, c = 0..2)
//   U_<r><c>      : six gauge-link scratch values
// The macro argument is ignored; it only exists so the macro can be invoked
// with function-call syntax.
#define HAND_DECLARATIONS(a)                            \
  Simd result_00, result_01, result_02;                 \
  Simd result_10, result_11, result_12;                 \
  Simd result_20, result_21, result_22;                 \
  Simd result_30, result_31, result_32;                 \
  Simd Chi_00, Chi_01, Chi_02;                          \
  Simd Chi_10, Chi_11, Chi_12;                          \
  Simd UChi_00, UChi_01, UChi_02;                       \
  Simd UChi_10, UChi_11, UChi_12;                       \
  Simd U_00, U_10, U_20;                                \
  Simd U_01, U_11, U_21;
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
@@ -370,430 +421,54 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 | 
			
		||||
					  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Xp);
 | 
			
		||||
  }
 | 
			
		||||
  XM_RECON;
 | 
			
		||||
  
 | 
			
		||||
  // Yp
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Yp);
 | 
			
		||||
  }
 | 
			
		||||
  YM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Zp);
 | 
			
		||||
  }
 | 
			
		||||
  ZM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Tp);
 | 
			
		||||
  }
 | 
			
		||||
  TM_RECON_ACCUM;
 | 
			
		||||
  
 | 
			
		||||
  // Xm
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Xm);
 | 
			
		||||
  }
 | 
			
		||||
  XP_RECON_ACCUM;
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  // Ym
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Ym);
 | 
			
		||||
  }
 | 
			
		||||
  YP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Zm);
 | 
			
		||||
  }
 | 
			
		||||
  ZP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Tm);
 | 
			
		||||
  }
 | 
			
		||||
  TP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
    vstream(ref()(0)(0),result_00);
 | 
			
		||||
    vstream(ref()(0)(1),result_01);
 | 
			
		||||
    vstream(ref()(0)(2),result_02);
 | 
			
		||||
    vstream(ref()(1)(0),result_10);
 | 
			
		||||
    vstream(ref()(1)(1),result_11);
 | 
			
		||||
    vstream(ref()(1)(2),result_12);
 | 
			
		||||
    vstream(ref()(2)(0),result_20);
 | 
			
		||||
    vstream(ref()(2)(1),result_21);
 | 
			
		||||
    vstream(ref()(2)(2),result_22);
 | 
			
		||||
    vstream(ref()(3)(0),result_30);
 | 
			
		||||
    vstream(ref()(3)(1),result_31);
 | 
			
		||||
    vstream(ref()(3)(2),result_32);
 | 
			
		||||
  }
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
						  int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior)
 | 
			
		||||
						  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_00; // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_01;
 | 
			
		||||
  REGISTER Simd result_02;
 | 
			
		||||
  
 | 
			
		||||
  REGISTER Simd result_10;
 | 
			
		||||
  REGISTER Simd result_11;
 | 
			
		||||
  REGISTER Simd result_12;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_20;
 | 
			
		||||
  REGISTER Simd result_21;
 | 
			
		||||
  REGISTER Simd result_22;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_30;
 | 
			
		||||
  REGISTER Simd result_31;
 | 
			
		||||
  REGISTER Simd result_32; // 20 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd Chi_01;
 | 
			
		||||
  REGISTER Simd Chi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_10;
 | 
			
		||||
  REGISTER Simd Chi_11;
 | 
			
		||||
  REGISTER Simd Chi_12;   // 14 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd UChi_01;
 | 
			
		||||
  REGISTER Simd UChi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_10;
 | 
			
		||||
  REGISTER Simd UChi_11;
 | 
			
		||||
  REGISTER Simd UChi_12;  // 8 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
			
		||||
  REGISTER Simd U_10;
 | 
			
		||||
  REGISTER Simd U_20;  
 | 
			
		||||
  REGISTER Simd U_01;
 | 
			
		||||
  REGISTER Simd U_11;
 | 
			
		||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  
 | 
			
		||||
  // Xp
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Xp);
 | 
			
		||||
  }
 | 
			
		||||
  XP_RECON;
 | 
			
		||||
 | 
			
		||||
  // Yp
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Yp);
 | 
			
		||||
  }
 | 
			
		||||
  YP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Zp);
 | 
			
		||||
  }
 | 
			
		||||
  ZP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TP_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Tp);
 | 
			
		||||
  }
 | 
			
		||||
  TP_RECON_ACCUM;
 | 
			
		||||
  
 | 
			
		||||
  // Xm
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Xm);
 | 
			
		||||
  }
 | 
			
		||||
  XM_RECON_ACCUM;
 | 
			
		||||
  
 | 
			
		||||
  // Ym
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Ym);
 | 
			
		||||
  }
 | 
			
		||||
  YM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Zm);
 | 
			
		||||
  }
 | 
			
		||||
  ZM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TM_PROJ;
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN(Tm);
 | 
			
		||||
  }
 | 
			
		||||
  TM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
    vstream(ref()(0)(0),result_00);
 | 
			
		||||
    vstream(ref()(0)(1),result_01);
 | 
			
		||||
    vstream(ref()(0)(2),result_02);
 | 
			
		||||
    vstream(ref()(1)(0),result_10);
 | 
			
		||||
    vstream(ref()(1)(1),result_11);
 | 
			
		||||
    vstream(ref()(1)(2),result_12);
 | 
			
		||||
    vstream(ref()(2)(0),result_20);
 | 
			
		||||
    vstream(ref()(2)(1),result_21);
 | 
			
		||||
    vstream(ref()(2)(2),result_22);
 | 
			
		||||
    vstream(ref()(3)(0),result_30);
 | 
			
		||||
    vstream(ref()(3)(1),result_31);
 | 
			
		||||
    vstream(ref()(3)(2),result_32);
 | 
			
		||||
  }
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
@@ -801,74 +476,71 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplF>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
							SiteHalfSpinor *buf,
 | 
			
		||||
							int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						SiteHalfSpinor *buf,
 | 
			
		||||
						int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplF>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
							   SiteHalfSpinor *buf,
 | 
			
		||||
							   int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						   SiteHalfSpinor *buf,
 | 
			
		||||
						   int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplD>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
							int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplD>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
							   int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						   int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplFH>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
							SiteHalfSpinor *buf,
 | 
			
		||||
							int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						 SiteHalfSpinor *buf,
 | 
			
		||||
						 int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplFH>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
							   SiteHalfSpinor *buf,
 | 
			
		||||
							   int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						    SiteHalfSpinor *buf,
 | 
			
		||||
						    int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplDF>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
							int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						 int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<> void 
 | 
			
		||||
WilsonKernels<GparityWilsonImplDF>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
							   int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
 | 
			
		||||
						    int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
////////////// Wilson ; uses this implementation /////////////////////
 | 
			
		||||
// Need Nc=3 though //
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_THEM(A) \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 | 
			
		||||
							int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior);
 | 
			
		||||
					     int ss,int sU,const FermionField &in, FermionField &out); \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
INSTANTIATE_THEM(WilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(WilsonImplD);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user