mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Avx512 changes for assembler kernels
This commit is contained in:
		@@ -30,6 +30,15 @@
 | 
			
		||||
/* GRID_DEFAULT_PRECISION is SINGLE */
 | 
			
		||||
#undef GRID_DEFAULT_PRECISION_SINGLE
 | 
			
		||||
 | 
			
		||||
/* Support Altivec instructions */
 | 
			
		||||
#undef HAVE_ALTIVEC
 | 
			
		||||
 | 
			
		||||
/* Support AVX (Advanced Vector Extensions) instructions */
 | 
			
		||||
#undef HAVE_AVX
 | 
			
		||||
 | 
			
		||||
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
 | 
			
		||||
#undef HAVE_AVX2
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
 | 
			
		||||
   don't. */
 | 
			
		||||
#undef HAVE_DECL_BE64TOH
 | 
			
		||||
@@ -44,6 +53,9 @@
 | 
			
		||||
/* Define to 1 if you have the <execinfo.h> header file. */
 | 
			
		||||
#undef HAVE_EXECINFO_H
 | 
			
		||||
 | 
			
		||||
/* Support FMA3 (Fused Multiply-Add) instructions */
 | 
			
		||||
#undef HAVE_FMA
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the `gettimeofday' function. */
 | 
			
		||||
#undef HAVE_GETTIMEOFDAY
 | 
			
		||||
 | 
			
		||||
@@ -62,9 +74,30 @@
 | 
			
		||||
/* Define to 1 if you have the <memory.h> header file. */
 | 
			
		||||
#undef HAVE_MEMORY_H
 | 
			
		||||
 | 
			
		||||
/* Support mmx instructions */
 | 
			
		||||
#undef HAVE_MMX
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <mm_malloc.h> header file. */
 | 
			
		||||
#undef HAVE_MM_MALLOC_H
 | 
			
		||||
 | 
			
		||||
/* Support SSE (Streaming SIMD Extensions) instructions */
 | 
			
		||||
#undef HAVE_SSE
 | 
			
		||||
 | 
			
		||||
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
 | 
			
		||||
#undef HAVE_SSE2
 | 
			
		||||
 | 
			
		||||
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
 | 
			
		||||
#undef HAVE_SSE3
 | 
			
		||||
 | 
			
		||||
/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
 | 
			
		||||
#undef HAVE_SSE4_1
 | 
			
		||||
 | 
			
		||||
/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
 | 
			
		||||
#undef HAVE_SSE4_2
 | 
			
		||||
 | 
			
		||||
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
 | 
			
		||||
#undef HAVE_SSSE3
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <stdint.h> header file. */
 | 
			
		||||
#undef HAVE_STDINT_H
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -62,6 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <serialisation/Serialisation.h>
 | 
			
		||||
#include <Config.h>
 | 
			
		||||
#include <Timer.h>
 | 
			
		||||
#include <PerfCount.h>
 | 
			
		||||
#include <Log.h>
 | 
			
		||||
#include <AlignedAllocator.h>
 | 
			
		||||
#include <Simd.h>
 | 
			
		||||
 
 | 
			
		||||
@@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <ctime>
 | 
			
		||||
#include <chrono>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
#include <sys/ioctl.h>
 | 
			
		||||
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
@@ -163,8 +163,8 @@ public:
 | 
			
		||||
  {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    if ( fd!= -1) {
 | 
			
		||||
      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
			
		||||
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
			
		||||
      ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
			
		||||
      ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
			
		||||
    }
 | 
			
		||||
    begin  =cyclecount();
 | 
			
		||||
#else
 | 
			
		||||
@@ -176,7 +176,7 @@ public:
 | 
			
		||||
    count=0;
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    if ( fd!= -1) {
 | 
			
		||||
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
			
		||||
      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
			
		||||
      ::read(fd, &count, sizeof(long long));
 | 
			
		||||
    }
 | 
			
		||||
    elapsed = cyclecount() - begin;
 | 
			
		||||
@@ -187,16 +187,16 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
  void Report(void) {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 | 
			
		||||
    std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 | 
			
		||||
#else
 | 
			
		||||
    printf("%llu cycles \n", elapsed );
 | 
			
		||||
    std::printf("%llu cycles \n", elapsed );
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ~PerformanceCounter()
 | 
			
		||||
  {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    close(fd);
 | 
			
		||||
    ::close(fd);
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
 | 
			
		||||
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 | 
			
		||||
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 | 
			
		||||
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 | 
			
		||||
#define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
 | 
			
		||||
 
 | 
			
		||||
@@ -335,69 +335,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag) {
 | 
			
		||||
 | 
			
		||||
    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
 | 
			
		||||
    Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
    auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
 | 
			
		||||
    bool local    = true;
 | 
			
		||||
    bool nonlocal = false;
 | 
			
		||||
    if ( dag == DaggerYes ) {
 | 
			
		||||
      if( HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      if( HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    st.HaloExchangeComplete(handle);
 | 
			
		||||
 | 
			
		||||
    local    = false;
 | 
			
		||||
    nonlocal = true;
 | 
			
		||||
    if ( dag == DaggerYes ) {
 | 
			
		||||
      if( HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      if( HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    assert(0);
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -281,11 +281,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
					 DoubledGaugeField & U,
 | 
			
		||||
					 const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  if ( Impl::overlapCommsCompute () ) { 
 | 
			
		||||
  //    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
 | 
			
		||||
  //  } else { 
 | 
			
		||||
    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
 | 
			
		||||
    //  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -368,7 +364,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
@@ -428,130 +424,6 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
  bool local    = true;
 | 
			
		||||
  bool nonlocal = false;
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
 | 
			
		||||
  local    = false;
 | 
			
		||||
  nonlocal = true;
 | 
			
		||||
  dslash1time -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  dslash1time +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
 
 | 
			
		||||
@@ -38,216 +38,177 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor  tmp;    
 | 
			
		||||
  SiteHalfSpinor  chi;    
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  result=zero;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if (SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
 | 
			
		||||
    accumReconXp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
 | 
			
		||||
  spReconXp(result,Uchi);
 | 
			
		||||
    
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
 | 
			
		||||
  accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
 | 
			
		||||
  accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
 | 
			
		||||
  accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
 | 
			
		||||
  accumReconXm(result,Uchi);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
 | 
			
		||||
  accumReconYm(result,Uchi);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
 | 
			
		||||
  accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else { 
 | 
			
		||||
      spProjTm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
 | 
			
		||||
  accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    vstream(out._odata[sF],result);
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(out._odata[sF],out._odata[sF]+result);
 | 
			
		||||
  }
 | 
			
		||||
  vstream(out._odata[sF],result);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -255,216 +216,177 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor  tmp;    
 | 
			
		||||
  SiteHalfSpinor  chi;    
 | 
			
		||||
  SiteHalfSpinor *chi_p;    
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  result=zero;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
			
		||||
    accumReconXp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
 | 
			
		||||
  spReconXp(result,Uchi);
 | 
			
		||||
    
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
 | 
			
		||||
  accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
 | 
			
		||||
  accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
 | 
			
		||||
  accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
 | 
			
		||||
  accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
 | 
			
		||||
  accumReconYm(result,Uchi);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
 | 
			
		||||
  accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else { 
 | 
			
		||||
      spProjTm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
 | 
			
		||||
  accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    vstream(out._odata[sF],result);
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(out._odata[sF],out._odata[sF]+result);
 | 
			
		||||
  }
 | 
			
		||||
  vstream(out._odata[sF],result);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
@@ -596,7 +518,7 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
  vstream(out._odata[sF],result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if ( ! defined(IMCI) )
 | 
			
		||||
#if ( ! defined(IMCI) && ! defined(AVX512) )
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
 
 | 
			
		||||
@@ -48,11 +48,11 @@ namespace Grid {
 | 
			
		||||
    public:
 | 
			
		||||
     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			   int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			   int sF,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			      int sF,int sU,const FermionField &in,FermionField &out);
 | 
			
		||||
 | 
			
		||||
     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
@@ -60,15 +60,15 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
     
 | 
			
		||||
     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
				 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
				 int sF,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
     WilsonKernels(const ImplParams &p= ImplParams());
 | 
			
		||||
     
 | 
			
		||||
 
 | 
			
		||||
@@ -27,8 +27,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
//#if defined(AVX512) || defined (IMCI)
 | 
			
		||||
#if defined (IMCI)
 | 
			
		||||
#if defined(AVX512) || defined (IMCI)
 | 
			
		||||
//#if defined (IMCI)
 | 
			
		||||
 | 
			
		||||
#include <simd/Avx512Asm.h>
 | 
			
		||||
 | 
			
		||||
@@ -106,7 +106,7 @@ namespace QCD {
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t  now;
 | 
			
		||||
  uint64_t first ;
 | 
			
		||||
@@ -341,6 +341,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
 | 
			
		||||
  template class WilsonKernels<WilsonImplF>;		
 | 
			
		||||
  template class WilsonKernels<WilsonImplD>; 
 | 
			
		||||
 | 
			
		||||
  template class WilsonKernels<GparityWilsonImplF>;
 | 
			
		||||
  template class WilsonKernels<GparityWilsonImplD>;
 | 
			
		||||
}}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -308,548 +308,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
template<class Impl>
 | 
			
		||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
			
		||||
  
 | 
			
		||||
  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd Chi_01;
 | 
			
		||||
  REGISTER Simd Chi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_10;
 | 
			
		||||
  REGISTER Simd Chi_11;
 | 
			
		||||
  REGISTER Simd Chi_12;   // 14 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd UChi_01;
 | 
			
		||||
  REGISTER Simd UChi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_10;
 | 
			
		||||
  REGISTER Simd UChi_11;
 | 
			
		||||
  REGISTER Simd UChi_12;  // 8 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
			
		||||
  REGISTER Simd U_10;
 | 
			
		||||
  REGISTER Simd U_20;  
 | 
			
		||||
  REGISTER Simd U_01;
 | 
			
		||||
  REGISTER Simd U_11;
 | 
			
		||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset, ptype;
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xp);
 | 
			
		||||
    XP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Yp
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Yp);
 | 
			
		||||
    YP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zp);
 | 
			
		||||
    ZP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tp);
 | 
			
		||||
    TP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Xm
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xm);
 | 
			
		||||
    XM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Ym
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Ym);
 | 
			
		||||
    YM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zm);
 | 
			
		||||
    ZM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tm);
 | 
			
		||||
    TM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
  if ( Local ) {
 | 
			
		||||
    vstream(ref()(0)(0),result_00);
 | 
			
		||||
    vstream(ref()(0)(1),result_01);
 | 
			
		||||
    vstream(ref()(0)(2),result_02);
 | 
			
		||||
    vstream(ref()(1)(0),result_10);
 | 
			
		||||
    vstream(ref()(1)(1),result_11);
 | 
			
		||||
    vstream(ref()(1)(2),result_12);
 | 
			
		||||
    vstream(ref()(2)(0),result_20);
 | 
			
		||||
    vstream(ref()(2)(1),result_21);
 | 
			
		||||
    vstream(ref()(2)(2),result_22);
 | 
			
		||||
    vstream(ref()(3)(0),result_30);
 | 
			
		||||
    vstream(ref()(3)(1),result_31);
 | 
			
		||||
    vstream(ref()(3)(2),result_32);
 | 
			
		||||
    return 1;
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00);
 | 
			
		||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01);
 | 
			
		||||
    vstream(ref()(0)(2),ref()(0)(2)+result_02);
 | 
			
		||||
    vstream(ref()(1)(0),ref()(1)(0)+result_10);
 | 
			
		||||
    vstream(ref()(1)(1),ref()(1)(1)+result_11);
 | 
			
		||||
    vstream(ref()(1)(2),ref()(1)(2)+result_12);
 | 
			
		||||
    vstream(ref()(2)(0),ref()(2)(0)+result_20);
 | 
			
		||||
    vstream(ref()(2)(1),ref()(2)(1)+result_21);
 | 
			
		||||
    vstream(ref()(2)(2),ref()(2)(2)+result_22);
 | 
			
		||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30);
 | 
			
		||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31);
 | 
			
		||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32);
 | 
			
		||||
    return 1;
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
			
		||||
  
 | 
			
		||||
  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd Chi_01;
 | 
			
		||||
  REGISTER Simd Chi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_10;
 | 
			
		||||
  REGISTER Simd Chi_11;
 | 
			
		||||
  REGISTER Simd Chi_12;   // 14 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd UChi_01;
 | 
			
		||||
  REGISTER Simd UChi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_10;
 | 
			
		||||
  REGISTER Simd UChi_11;
 | 
			
		||||
  REGISTER Simd UChi_12;  // 8 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
			
		||||
  REGISTER Simd U_10;
 | 
			
		||||
  REGISTER Simd U_20;  
 | 
			
		||||
  REGISTER Simd U_01;
 | 
			
		||||
  REGISTER Simd U_11;
 | 
			
		||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset, ptype;
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xp);
 | 
			
		||||
    XM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Yp
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Yp);
 | 
			
		||||
    YM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zp);
 | 
			
		||||
    ZM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tp);
 | 
			
		||||
    TM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Xm
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xm);
 | 
			
		||||
    XP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Ym
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Ym);
 | 
			
		||||
    YP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zm);
 | 
			
		||||
    ZP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tm);
 | 
			
		||||
    TP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
  if ( Local ) {
 | 
			
		||||
    vstream(ref()(0)(0),result_00);
 | 
			
		||||
    vstream(ref()(0)(1),result_01);
 | 
			
		||||
    vstream(ref()(0)(2),result_02);
 | 
			
		||||
    vstream(ref()(1)(0),result_10);
 | 
			
		||||
    vstream(ref()(1)(1),result_11);
 | 
			
		||||
    vstream(ref()(1)(2),result_12);
 | 
			
		||||
    vstream(ref()(2)(0),result_20);
 | 
			
		||||
    vstream(ref()(2)(1),result_21);
 | 
			
		||||
    vstream(ref()(2)(2),result_22);
 | 
			
		||||
    vstream(ref()(3)(0),result_30);
 | 
			
		||||
    vstream(ref()(3)(1),result_31);
 | 
			
		||||
    vstream(ref()(3)(2),result_32);
 | 
			
		||||
    return 1;
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00);
 | 
			
		||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01);
 | 
			
		||||
    vstream(ref()(0)(2),ref()(0)(2)+result_02);
 | 
			
		||||
    vstream(ref()(1)(0),ref()(1)(0)+result_10);
 | 
			
		||||
    vstream(ref()(1)(1),ref()(1)(1)+result_11);
 | 
			
		||||
    vstream(ref()(1)(2),ref()(1)(2)+result_12);
 | 
			
		||||
    vstream(ref()(2)(0),ref()(2)(0)+result_20);
 | 
			
		||||
    vstream(ref()(2)(1),ref()(2)(1)+result_21);
 | 
			
		||||
    vstream(ref()(2)(2),ref()(2)(2)+result_22);
 | 
			
		||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30);
 | 
			
		||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31);
 | 
			
		||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32);
 | 
			
		||||
    return 1;
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#else 
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
@@ -1094,7 +557,7 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
template<class Impl>
 | 
			
		||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out,bool l, bool nl)
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
@@ -1337,14 +800,13 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // Specialise Gparity to simple implementation
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
 | 
			
		||||
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
 | 
			
		||||
@@ -1355,7 +817,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
@@ -1364,7 +826,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,D
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
@@ -1373,7 +835,7 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Doub
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
@@ -1383,29 +845,29 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
 | 
			
		||||
 | 
			
		||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
 | 
			
		||||
								  int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
 | 
			
		||||
								  int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -97,16 +97,26 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
// CONFIG IMCI/AVX512
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef IMCI
 | 
			
		||||
#define ASM_IMCI
 | 
			
		||||
#undef  ASM_AVX512
 | 
			
		||||
#define MASK_REGS \
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax \n"\
 | 
			
		||||
           "kmov    %%eax, %%k6 \n"\
 | 
			
		||||
           "knot     %%k6, %%k7 \n" : : : "%eax");
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
#define  ASM_AVX512
 | 
			
		||||
#define MASK_REGS \
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax \n"\
 | 
			
		||||
           "kmovw    %%eax, %%k6 \n"\
 | 
			
		||||
           "mov     $0x5555, %%eax \n"\
 | 
			
		||||
           "kmovw    %%eax, %%k7 \n" : : : "%eax");
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Opcodes common to AVX512 and IMCI
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#define MASK_REGS \
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax \n"\
 | 
			
		||||
	   "kmov    %%eax, %%k6 \n"\
 | 
			
		||||
	   "knot     %%k6, %%k7 \n" : : : "%eax");
 | 
			
		||||
 | 
			
		||||
#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
			
		||||
#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
			
		||||
@@ -137,8 +147,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 VACCTIMESI2f(A,ACC,tmp)			
 | 
			
		||||
 | 
			
		||||
#define  VACCTIMESI1MEMf(A,ACC,O,P)  "vaddps  " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#ifdef ASM_IMCI
 | 
			
		||||
#define  VACCTIMESI2MEMf(A,ACC,O,P)  "vsubrps  " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
 | 
			
		||||
#define  VACCTIMESMINUSI1MEMf(A,ACC,O,P)  "vsubrps  " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef ASM_AVX512
 | 
			
		||||
#define  VACCTIMESI2MEMf(A,ACC,O,P)  "vsubps  " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"  // FIXME KNOWN BUG INTRODUCED TO FORCE COMPILE CLEAN
 | 
			
		||||
#define  VACCTIMESMINUSI1MEMf(A,ACC,O,P)  "vsubps  " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#endif
 | 
			
		||||
#define  VACCTIMESMINUSI2MEMf(A,ACC,O,P)  "vaddps  " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESId(A,ACC,tmp)			\
 | 
			
		||||
@@ -163,8 +179,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
// Field prefetch
 | 
			
		||||
#ifdef ASM_IMCI
 | 
			
		||||
#define VPREFETCHNTA(O,A) "vprefetchnta "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
 | 
			
		||||
#define VPREFETCH(O,A)    "vprefetch0 "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef ASM_AVX512 
 | 
			
		||||
#define VPREFETCHNTA(O,A) 
 | 
			
		||||
#define VPREFETCH(O,A)    
 | 
			
		||||
#endif
 | 
			
		||||
#define VPREFETCHG(O,A) 
 | 
			
		||||
#define VPREFETCHW(O,A) 
 | 
			
		||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 | 
			
		||||
@@ -251,11 +273,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
// Swaps Re/Im
 | 
			
		||||
#define VSHUFd(A,DEST)         "vshufpd  $0x5, " #A "," #A "," #DEST  ";\n"
 | 
			
		||||
#define VSHUFf(A,DEST)         "vshufps  $0x55," #A "," #A "," #DEST  ";\n"
 | 
			
		||||
#define VSHUFd(A,DEST)         "vshufpd  $0x55," #A "," #A "," #DEST  ";\n"
 | 
			
		||||
#define VSHUFf(A,DEST)         "vshufps  $0x4e," #A "," #A "," #DEST  ";\n"
 | 
			
		||||
// Memops are useful for optimisation
 | 
			
		||||
#define VSHUFMEMd(OFF,A,DEST)       "vpshufpd  $0x4e, " #OFF"("#A ")," #DEST  ";\n"
 | 
			
		||||
#define VSHUFMEMf(OFF,A,DEST)       "vpshufps  $0xb1, " #OFF"("#A ")," #DEST  ";\n"
 | 
			
		||||
#define VSHUFMEMd(OFF,A,DEST)       "vpshufd  $0x4e, " #OFF"("#A ")," #DEST  ";\n"
 | 
			
		||||
#define VSHUFMEMf(OFF,A,DEST)       "vpshufd  $0xb1, " #OFF"("#A ")," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Merges accumulation for complex dot chain
 | 
			
		||||
@@ -271,7 +293,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#define ZEND2f(Criir,Ciirr, tmp) "vsubps  " #Ciirr "," #tmp "," #Criir"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define ZEND2d(Criir,Ciirr, tmp)	\
 | 
			
		||||
#define ZEND1d(Criir,Ciirr, tmp)	\
 | 
			
		||||
  "vshufpd $0x33," #Ciirr "," #Criir "," #tmp  ";\n"\
 | 
			
		||||
  "vaddpd  " #Criir "," #tmp "," #Criir"{%k6}" ";\n"
 | 
			
		||||
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd  " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n"
 | 
			
		||||
@@ -311,14 +333,41 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define VPERM0f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
 | 
			
		||||
#define VPERM1f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
 | 
			
		||||
#define VPERM2f(A,B) "vshufps    " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
 | 
			
		||||
#define VPERM3f(A,B) "vshufps    " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
 | 
			
		||||
    static inline __m512 Permute0(__m512 in){
 | 
			
		||||
      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m512 Permute1(__m512 in){
 | 
			
		||||
      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m512 Permute2(__m512 in){
 | 
			
		||||
      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m512 Permute3(__m512 in){
 | 
			
		||||
      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
#define VPERM0d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
 | 
			
		||||
#define VPERM1d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
 | 
			
		||||
#define VPERM2d(A,B) "vshufpd    " #A "," #B "," "#B" ", " 0x55 ";\n"
 | 
			
		||||
    static inline __m512d Permute0(__m512d in){
 | 
			
		||||
      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m512d Permute1(__m512d in){
 | 
			
		||||
      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m512d Permute2(__m512d in){
 | 
			
		||||
      return _mm512_shuffle_pd(in,in,0x55);
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m512d Permute3(__m512d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define VPERM0f(A,B) "vshuff32x4  $0x4e," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM1f(A,B) "vshuff32x4  $0xb1," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM2f(A,B) "vshufps     $0x4e," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM3f(A,B) "vshufps     $0xb1," #A "," #B "," #B ";\n"
 | 
			
		||||
 | 
			
		||||
#define VPERM0d(A,B) "vshuff64x2  $0x4e," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM1d(A,B) "vshuff64x2  $0xb1," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM2d(A,B) "vshufpd     $0x55," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM3d(A,B) VMOVd(A,B)
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user