diff --git a/lib/Grid.h b/lib/Grid.h index 48a59893..eb2be1d1 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -62,6 +62,7 @@ Author: paboyle #include #include #include +#include #include #include #include diff --git a/lib/PerfCount.h b/lib/PerfCount.h index a45b1e23..c4ee8eea 100644 --- a/lib/PerfCount.h +++ b/lib/PerfCount.h @@ -34,7 +34,7 @@ Author: paboyle #include #include #include - +#include #include #ifdef __linux__ @@ -163,8 +163,8 @@ public: { #ifdef __linux__ if ( fd!= -1) { - ioctl(fd, PERF_EVENT_IOC_RESET, 0); - ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + ::ioctl(fd, PERF_EVENT_IOC_RESET, 0); + ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); } begin =cyclecount(); #else @@ -176,7 +176,7 @@ public: count=0; #ifdef __linux__ if ( fd!= -1) { - ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ::read(fd, &count, sizeof(long long)); } elapsed = cyclecount() - begin; @@ -187,16 +187,16 @@ public: } void Report(void) { #ifdef __linux__ - printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count); + std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count); #else - printf("%llu cycles \n", elapsed ); + std::printf("%llu cycles \n", elapsed ); #endif } ~PerformanceCounter() { #ifdef __linux__ - close(fd); + ::close(fd); #endif } diff --git a/lib/Simd.h b/lib/Simd.h index ac3a5f88..27a5ec46 100644 --- a/lib/Simd.h +++ b/lib/Simd.h @@ -42,6 +42,7 @@ Author: paboyle #define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D)) +#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))" #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H)) #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D) #define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B) diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 502a28bb..d874e0ac 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -335,69 +335,7 @@ PARALLEL_FOR_LOOP void WilsonFermion::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - assert((dag==DaggerNo) ||(dag==DaggerYes)); - - Compressor compressor(dag); - - auto handle = st.HaloExchangeBegin(in,compressor); - - bool local = true; - bool nonlocal = false; - if ( dag == DaggerYes ) { - if( HandOptDslash ) { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } else { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } - } else { - if( HandOptDslash ) { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } else { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } - } - - st.HaloExchangeComplete(handle); - - local = false; - nonlocal = true; - if ( dag == DaggerYes ) { - if( HandOptDslash ) { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } else { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } - } else { - if( HandOptDslash ) { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } else { -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); - } - } - } + assert(0); }; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d56c994c..b78f030e 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -281,11 +281,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - // if ( Impl::overlapCommsCompute () ) { - // DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag); - // } else { DhopInternalCommsThenCompute(st,lo,U,in,out,dag); - // } } template @@ -368,7 +364,7 @@ PARALLEL_FOR_LOOP sU = lo.Reorder(sU); } sF = s+Ls*sU; - Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0] + Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out); } } } @@ -428,130 +424,6 @@ void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, Le const FermionField &in, FermionField &out,int dag) { assert(0); - // assert((dag==DaggerNo) ||(dag==DaggerYes)); - alltime-=usecond(); - - Compressor compressor(dag); - - // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); - - commtime -=usecond(); - auto handle = st.HaloExchangeBegin(in,compressor); - commtime +=usecond(); - - // Dhop takes the 4d grid from U, and makes a 5d index for fermion - // Not loop ordering and data layout. - // Designed to create - // - per thread reuse in L1 cache for U - // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. - bool local = true; - bool nonlocal = false; - dslashtime -=usecond(); - if ( dag == DaggerYes ) { - if( this->HandOptDslash ) { -PARALLEL_FOR_LOOP - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdHandOptDslash ) { -PARALLEL_FOR_LOOP - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - int sU=ss; - for(int s=0;sHandOptDslash ) { -PARALLEL_FOR_LOOP - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdHandOptDslash ) { -PARALLEL_FOR_LOOP - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - int sU=ss; - for(int s=0;s diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 7410024f..b94284f7 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -38,216 +38,177 @@ WilsonKernels::WilsonKernels(const ImplParams &p): Base(p) {}; template void WilsonKernels::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; SiteHalfSpinor Uchi; SiteSpinor result; StencilEntry *SE; int ptype; - int num = 0; - - result=zero; - /////////////////////////// // Xp /////////////////////////// SE=st.GetEntry(ptype,Xp,sF); - if (local && SE->_is_local ) { + if (SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjXp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjXp(chi,in._odata[SE->_offset]); } - } - - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; + } else { + chi_p=&buf[SE->_offset]; } - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st); - accumReconXp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st); + spReconXp(result,Uchi); /////////////////////////// // Yp /////////////////////////// SE=st.GetEntry(ptype,Yp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjYp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjYp(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st); - accumReconYp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st); + accumReconYp(result,Uchi); /////////////////////////// // Zp /////////////////////////// SE=st.GetEntry(ptype,Zp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjZp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjZp(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st); - accumReconZp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st); + accumReconZp(result,Uchi); /////////////////////////// // Tp /////////////////////////// SE=st.GetEntry(ptype,Tp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjTp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjTp(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st); - accumReconTp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st); + accumReconTp(result,Uchi); /////////////////////////// // Xm /////////////////////////// SE=st.GetEntry(ptype,Xm,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjXm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjXm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st); - accumReconXm(result,Uchi); - num++; - } - + Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st); + accumReconXm(result,Uchi); + /////////////////////////// // Ym /////////////////////////// SE=st.GetEntry(ptype,Ym,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjYm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjYm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st); - accumReconYm(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st); + accumReconYm(result,Uchi); /////////////////////////// // Zm /////////////////////////// SE=st.GetEntry(ptype,Zm,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjZm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjZm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st); - accumReconZm(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st); + accumReconZm(result,Uchi); /////////////////////////// // Tm /////////////////////////// SE=st.GetEntry(ptype,Tm,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjTm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjTm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st); + accumReconTm(result,Uchi); - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st); - accumReconTm(result,Uchi); - num++; - } - - if ( local ) { - vstream(out._odata[sF],result); - } else if ( num ) { - vstream(out._odata[sF],out._odata[sF]+result); - } + vstream(out._odata[sF],result); }; @@ -255,216 +216,177 @@ void WilsonKernels::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField template void WilsonKernels::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; SiteHalfSpinor Uchi; SiteSpinor result; StencilEntry *SE; int ptype; - int num = 0; - - result=zero; - /////////////////////////// // Xp /////////////////////////// SE=st.GetEntry(ptype,Xm,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjXp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjXp(chi,in._odata[SE->_offset]); } - } - - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; + } else { + chi_p=&buf[SE->_offset]; } - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st); - accumReconXp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st); + spReconXp(result,Uchi); /////////////////////////// // Yp /////////////////////////// SE=st.GetEntry(ptype,Ym,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjYp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjYp(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st); - accumReconYp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st); + accumReconYp(result,Uchi); /////////////////////////// // Zp /////////////////////////// SE=st.GetEntry(ptype,Zm,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjZp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjZp(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st); - accumReconZp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st); + accumReconZp(result,Uchi); /////////////////////////// // Tp /////////////////////////// SE=st.GetEntry(ptype,Tm,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjTp(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjTp(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st); - accumReconTp(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st); + accumReconTp(result,Uchi); /////////////////////////// // Xm /////////////////////////// SE=st.GetEntry(ptype,Xp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjXm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjXm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st); - accumReconXm(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st); + accumReconXm(result,Uchi); /////////////////////////// // Ym /////////////////////////// SE=st.GetEntry(ptype,Yp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjYm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjYm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st); - accumReconYm(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st); + accumReconYm(result,Uchi); /////////////////////////// // Zm /////////////////////////// SE=st.GetEntry(ptype,Zp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjZm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjZm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } - - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st); - accumReconZm(result,Uchi); - num++; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st); + accumReconZm(result,Uchi); /////////////////////////// // Tm /////////////////////////// SE=st.GetEntry(ptype,Tp,sF); - if (local && SE->_is_local ) { + if ( SE->_is_local ) { + chi_p = χ if ( SE->_permute ) { spProjTm(tmp,in._odata[SE->_offset]); permute(chi,tmp,ptype); } else { spProjTm(chi,in._odata[SE->_offset]); } + } else { + chi_p=&buf[SE->_offset]; } - if ( nonlocal && (!SE->_is_local) ) { - chi=buf[SE->_offset]; - } + Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st); + accumReconTm(result,Uchi); - if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) { - Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st); - accumReconTm(result,Uchi); - num++; - } - - if ( local ) { - vstream(out._odata[sF],result); - } else if ( num ) { - vstream(out._odata[sF],out._odata[sF]+result); - } + vstream(out._odata[sF],result); }; template @@ -596,11 +518,11 @@ void WilsonKernels::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U, vstream(out._odata[sF],result); } -#if ( ! defined(AVX512) ) && ( ! defined(IMCI) ) +#if ( ! defined(IMCI) && ! defined(AVX512) ) template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index b7698730..68ae4c9e 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -48,11 +48,11 @@ namespace Grid { public: void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); + int sF,int sU,const FermionField &in, FermionField &out); void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true); + int sF,int sU,const FermionField &in,FermionField &out); void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, @@ -60,15 +60,15 @@ namespace Grid { void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); + int sF,int sU,const FermionField &in, FermionField &out); int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); + int sF,int sU,const FermionField &in, FermionField &out); int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); + int sF,int sU,const FermionField &in, FermionField &out); WilsonKernels(const ImplParams &p= ImplParams()); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index efabb610..3dab979b 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -28,6 +28,7 @@ Author: paboyle /* END LEGAL */ #include #if defined(AVX512) || defined (IMCI) +//#if defined (IMCI) #include @@ -105,7 +106,7 @@ namespace QCD { template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers) + int ss,int sU,const FermionField &in, FermionField &out) { uint64_t now; uint64_t first ; @@ -158,7 +159,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - XM_PROJMEM(&plocal[offset]); + XP_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -168,7 +169,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFXM(Xm,pf); } - XM_RECON; + XP_RECON; // Ym offset = SE->_offset; @@ -181,7 +182,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - YM_PROJMEM(&plocal[offset]); + YP_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -191,7 +192,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFYM(Ym,pf); } - YM_RECON_ACCUM; + YP_RECON_ACCUM; // Zm offset = SE->_offset; @@ -204,7 +205,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - ZM_PROJMEM(&plocal[offset]); + ZP_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -214,7 +215,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFZM(Zm,pf); } - ZM_RECON_ACCUM; + ZP_RECON_ACCUM; // Tm offset = SE->_offset; @@ -227,7 +228,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField if ( local ) { - TM_PROJMEM(&plocal[offset]); + TP_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -237,7 +238,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFTM(Tm,pf); } - TM_RECON_ACCUM; + TP_RECON_ACCUM; // Tp offset = SE->_offset; @@ -250,7 +251,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - TP_PROJMEM(&plocal[offset]); + TM_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -260,7 +261,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFTP(Tp,pf); } - TP_RECON_ACCUM; + TM_RECON_ACCUM; // Zp offset = SE->_offset; @@ -273,7 +274,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - ZP_PROJMEM(&plocal[offset]); + ZM_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -283,7 +284,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFZP(Zp,pf); } - ZP_RECON_ACCUM; + ZM_RECON_ACCUM; offset = SE->_offset; @@ -296,7 +297,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - YP_PROJMEM(&plocal[offset]); + YM_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -306,7 +307,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFYP(Yp,pf); } - YP_RECON_ACCUM; + YM_RECON_ACCUM; // Xp perm = SE->_permute; @@ -321,7 +322,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField else pf=(void *)&pbuf[SE->_offset]; if ( local ) { - XP_PROJMEM(&plocal[offset]); + XM_PROJMEM(&plocal[offset]); if ( perm) { PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } @@ -331,7 +332,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField { MULT_2SPIN_DIR_PFXP(Xp,pf); } - XP_RECON_ACCUM; + XM_RECON_ACCUM; debug: SAVE_RESULT(&out._odata[ss]); @@ -340,6 +341,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField template class WilsonKernels; template class WilsonKernels; - + template class WilsonKernels; + template class WilsonKernels; }} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 5c6eee00..74440f16 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -308,548 +308,11 @@ Author: paboyle namespace Grid { namespace QCD { -#if 0 -template -int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) -{ - // std::cout << "Hand op Dhop "<_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - XP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - - } - - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Xp); - XP_RECON_ACCUM; - num++; - } - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - YP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Yp); - YP_RECON_ACCUM; - num++; - } - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Zp); - ZP_RECON_ACCUM; - num++; - } - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - TP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Tp); - TP_RECON_ACCUM; - num++; - } - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - XM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Xm); - XM_RECON_ACCUM; - num++; - } - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - YM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Ym); - YM_RECON_ACCUM; - num++; - } - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Zm); - ZM_RECON_ACCUM; - num++; - } - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - TM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Tm); - TM_RECON_ACCUM; - num++; - } - - SiteSpinor & ref (out._odata[ss]); - if ( Local ) { - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - return 1; - } else if ( num ) { - vstream(ref()(0)(0),ref()(0)(0)+result_00); - vstream(ref()(0)(1),ref()(0)(1)+result_01); - vstream(ref()(0)(2),ref()(0)(2)+result_02); - vstream(ref()(1)(0),ref()(1)(0)+result_10); - vstream(ref()(1)(1),ref()(1)(1)+result_11); - vstream(ref()(1)(2),ref()(1)(2)+result_12); - vstream(ref()(2)(0),ref()(2)(0)+result_20); - vstream(ref()(2)(1),ref()(2)(1)+result_21); - vstream(ref()(2)(2),ref()(2)(2)+result_22); - vstream(ref()(3)(0),ref()(3)(0)+result_30); - vstream(ref()(3)(1),ref()(3)(1)+result_31); - vstream(ref()(3)(2),ref()(3)(2)+result_32); - return 1; - } - return 0; -} - - - - -template -int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) -{ - // std::cout << "Hand op Dhop "<_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - XM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Xp); - XM_RECON_ACCUM; - num++; - } - - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - YM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Yp); - YM_RECON_ACCUM; - num++; - } - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Zp); - ZM_RECON_ACCUM; - num++; - } - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - TM_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Tp); - TM_RECON_ACCUM; - num++; - } - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - XP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Xm); - XP_RECON_ACCUM; - num++; - } - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - YP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Ym); - YP_RECON_ACCUM; - num++; - } - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Zm); - ZP_RECON_ACCUM; - num++; - } - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - - if (Local && SE->_is_local ) { - LOAD_CHIMU; - TP_PROJ; - if ( SE->_permute ) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } - if ( Nonlocal && (!SE->_is_local) ) { - LOAD_CHI; - } - if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) { - MULT_2SPIN(Tm); - TP_RECON_ACCUM; - num++; - } - - SiteSpinor & ref (out._odata[ss]); - if ( Local ) { - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - return 1; - } else if ( num ) { - vstream(ref()(0)(0),ref()(0)(0)+result_00); - vstream(ref()(0)(1),ref()(0)(1)+result_01); - vstream(ref()(0)(2),ref()(0)(2)+result_02); - vstream(ref()(1)(0),ref()(1)(0)+result_10); - vstream(ref()(1)(1),ref()(1)(1)+result_11); - vstream(ref()(1)(2),ref()(1)(2)+result_12); - vstream(ref()(2)(0),ref()(2)(0)+result_20); - vstream(ref()(2)(1),ref()(2)(1)+result_21); - vstream(ref()(2)(2),ref()(2)(2)+result_22); - vstream(ref()(3)(0),ref()(3)(0)+result_30); - vstream(ref()(3)(1),ref()(3)(1)+result_31); - vstream(ref()(3)(2),ref()(3)(2)+result_32); - return 1; - } - return 0; -} - -#else template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) + int ss,int sU,const FermionField &in, FermionField &out) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -1094,7 +557,7 @@ int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out,bool l, bool nl) + int ss,int sU,const FermionField &in, FermionField &out) { // std::cout << "Hand op Dhop "<::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi } -#endif //////////////////////////////////////////////// // Specialise Gparity to simple implementation //////////////////////////////////////////////// template<> int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3 //check consistency of return types between these functions and the ones in WilsonKernels.cc @@ -1355,7 +817,7 @@ int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Doub template<> int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 return 0; @@ -1364,7 +826,7 @@ int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,D template<> int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 return 0; @@ -1373,7 +835,7 @@ int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Doub template<> int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) + int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 return 0; @@ -1383,29 +845,29 @@ int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,D template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl); + int ss,int sU,const FermionField &in, FermionField &out); template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl); + int ss,int sU,const FermionField &in, FermionField &out); }} diff --git a/lib/simd/Avx512Asm.h b/lib/simd/Avx512Asm.h index c0569c1f..8363c2ab 100644 --- a/lib/simd/Avx512Asm.h +++ b/lib/simd/Avx512Asm.h @@ -69,6 +69,7 @@ Author: paboyle #define UChi_12 %zmm23 #define Uir %zmm24 +//#define ONE %zmm24 #define Uri %zmm25 #define Z0 %zmm26 @@ -97,16 +98,17 @@ Author: paboyle // CONFIG IMCI/AVX512 ////////////////////////////////////////////////////////////////////////////////////////// +#ifdef IMCI #define ASM_IMCI -#undef ASM_AVX512 +#endif + +#ifdef AVX512 +#define ASM_AVX512 +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// -// Opcodes common to AVX512 and IMCI +// Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// -#define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ - "kmov %%eax, %%k6 \n"\ - "knot %%k6, %%k7 \n" : : : "%eax"); #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" @@ -136,11 +138,6 @@ Author: paboyle VACCTIMESI1f(A,ACC,tmp) \ VACCTIMESI2f(A,ACC,tmp) -#define VACCTIMESI1MEMf(A,ACC,O,P) "vaddps " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n" -#define VACCTIMESI2MEMf(A,ACC,O,P) "vsubrps " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n" -#define VACCTIMESMINUSI1MEMf(A,ACC,O,P) "vsubrps " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n" -#define VACCTIMESMINUSI2MEMf(A,ACC,O,P) "vaddps " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n" - #define VACCTIMESId(A,ACC,tmp) \ VACCTIMESI0d(A,ACC,tmp) \ VACCTIMESI1d(A,ACC,tmp) \ @@ -157,14 +154,12 @@ Author: paboyle VACCTIMESMINUSI2d(A,ACC,tmp) #define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); -#define LOAD64(A,ptr) LOAD64i(A,ptr) +#define LOAD64(A,ptr) LOAD64i(A,ptr) #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" -// Field prefetch -#define VPREFETCHNTA(O,A) "vprefetchnta "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n" -#define VPREFETCH(O,A) "vprefetch0 "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n" + #define VPREFETCHG(O,A) #define VPREFETCHW(O,A) //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" @@ -210,8 +205,6 @@ Author: paboyle #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) -// Need VSHUFMULMEMf,d for KNC -// AVX512 friendly #define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ VSHUFMEMf(O,P,tmp) \ VMULMEMf(O,P,B,Biirr) \ @@ -243,99 +236,107 @@ Author: paboyle VMADDd(tmp,C,Criir) //////////////////////////////////////////////////////////////////////////////////////////////////// -// Lane swizzling changed between AVX512 and IMCI and requires arch dependent complex support +// ISA changed between AVX512 and IMCI and requires arch dependent complex support //////////////////////////////////////////////////////////////////////////////////////////////////// -// AVX512 special (Knights Landing) +#define VPREFETCHNTA(O,A) +#define VPREFETCH(O,A) + +#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" + +// Swaps Re/Im ; could unify this with IMCI +#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n" +#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n" +#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2 +#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1 + + +//////////////////////////////////////////////////////////// +// Knights Landing specials +//////////////////////////////////////////////////////////// #ifdef ASM_AVX512 -#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" -// Swaps Re/Im -#define VSHUFd(A,DEST) "vshufpd $0x5, " #A "," #A "," #DEST ";\n" -#define VSHUFf(A,DEST) "vshufps $0x55," #A "," #A "," #DEST ";\n" -// Memops are useful for optimisation -#define VSHUFMEMd(OFF,A,DEST) "vpshufpd $0x4e, " #OFF"("#A ")," #DEST ";\n" -#define VSHUFMEMf(OFF,A,DEST) "vpshufps $0xb1, " #OFF"("#A ")," #DEST ";\n" +#define MASK_REGS \ + __asm__ ("mov $0xAAAA, %%eax \n"\ + "kmovw %%eax, %%k6 \n"\ + "mov $0x5555, %%eax \n"\ + "kmovw %%eax, %%k7 \n" : : : "%eax"); -// Merges accumulation for complex dot chain -// TODO: 12 operation saving: -// # could SWIZ op 18{cdab} and eliminate temporary // 12cycles -// # no use KNL though. Fingour something else there. -// # All swizzles become perms ops, but gain addsub; subadd must use this -// # uint32_t (0x7F << 23 ) -// # uint64_t (0x3FF<< 52 ) ; vpbroadcast -#define ZEND1f(Criir,Ciirr, tmp) \ - "vshufps $0xb1," #Ciirr "," #Criir "," #tmp ";\n"\ - "vaddps " #Criir "," #tmp "," #Criir"{%k6}" ";\n" +// Merges accumulation for complex dot chain; less efficient under avx512 +//ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" +//ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" +//ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" +//ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" +#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\ + "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n" +#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\ + "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" -#define ZEND2d(Criir,Ciirr, tmp) \ - "vshufpd $0x33," #Ciirr "," #Criir "," #tmp ";\n"\ - "vaddpd " #Criir "," #tmp "," #Criir"{%k6}" ";\n" -#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n" +#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ + "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" +#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ + "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii + // Further opt possible: KNC -- use swizzle operand ; no addsub. // KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub // no swizzle on KNL. -#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST) -#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST) +#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n" #define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST) -#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n" #define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST) -#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n" #define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST) -#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n" +#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n" #define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp) -#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp) -#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp) -#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp) +#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp) -#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp) +#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n" +#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n" -#define VPERM0f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n" -#define VPERM1f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n" -#define VPERM2f(A,B) "vshufps " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n" -#define VPERM3f(A,B) "vshufps " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n" +#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n" +#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n" +#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n" +#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n" -#define VPERM0d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n" -#define VPERM1d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n" -#define VPERM2d(A,B) "vshufpd " #A "," #B "," "#B" ", " 0x55 ";\n" +#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n" +#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n" +#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n" #define VPERM3d(A,B) VMOVd(A,B) #endif +//////////////////////////////////////////////////////////// // Knights Corner specials +//////////////////////////////////////////////////////////// + #ifdef ASM_IMCI -#define VSTOREf(OFF,PTR,SRC) "vmovnrngoaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovnrngoapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" - //#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" - //#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSHUFf(A,DEST) "vmovaps " #A "{cdab} , " #DEST ";\n" -#define VSHUFd(A,DEST) "vmovapd " #A "{cdab} , " #DEST ";\n" - -// Memops are useful for optimisation -#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" -#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" +#define MASK_REGS \ + __asm__ ("mov $0xAAAA, %%eax \n"\ + "kmov %%eax, %%k6 \n"\ + "knot %%k6, %%k7 \n" : : : "%eax"); #define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" #define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" @@ -374,12 +375,11 @@ Author: paboyle #define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" + // Acc = Acc - i A #define VACCTIMESMINUSI0d(A,ACC,tmp) #define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" #define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" -//#define ZENDf(Criir,Ciirr, tmp) - //((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e //((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1 @@ -394,18 +394,15 @@ Author: paboyle #define VPERM3d(A,B) VMOVd(A,B) #endif - // const SiteSpinor * ptr = & in._odata[offset]; #define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR) -#define LOAD_CHI(PTR) LOAD_CHIi(PTR) +#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi ); #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR) #define SAVE_CHI(PTR) SAVE_CHIi(PTR) #define SAVE_RESULT(PTR) SAVE_RESULTi(PTR) -#define LOAD_CHIMUi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ (\ +#define LOAD_CHIMUi \ LOAD_CHIMU01i \ LOAD_CHIMU23i ); @@ -437,16 +434,14 @@ Author: paboyle // const SiteHalfSpinor *ptr = &buf[offset]; -#define LOAD_CHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ +#define LOAD_CHIi \ VLOAD(0,%r8,Chi_00) \ VLOAD(1,%r8,Chi_01) \ VLOAD(2,%r8,Chi_02) \ VLOAD(3,%r8,Chi_10) \ VLOAD(4,%r8,Chi_11) \ - VLOAD(5,%r8,Chi_12) \ - ); + VLOAD(5,%r8,Chi_12) + #define SAVE_UCHIi(PTR) \ LOAD64(%r8,PTR) \ @@ -585,7 +580,6 @@ Author: paboyle ZEND2(UChi_12,Z5,Chi_12) ); #define MULT_2SPIN(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); - #define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) #define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) #define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) @@ -667,56 +661,23 @@ Author: paboyle // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); -//define VTIMESIf(A,DEST, Z) -// These don't work if DEST==Z. FIXME. -#define XP_PROJ __asm__ ( \ - VACCTIMESI(Chimu_30,Chi_00,Z0) \ - VACCTIMESI(Chimu_31,Chi_01,Z1) \ - VACCTIMESI(Chimu_32,Chi_02,Z2) \ - VACCTIMESI(Chimu_20,Chi_10,Z3) \ - VACCTIMESI(Chimu_21,Chi_11,Z4) \ - VACCTIMESI(Chimu_22,Chi_12,Z5) ); - #define XP_PROJMEM(PTR) \ LOAD64(%r8,PTR) \ __asm__ ( \ + LOAD_CHIi \ SHUF_CHIMU23i \ - VACCTIMESI1MEM(Chimu_30,Chi_00,0,%r8) \ - VACCTIMESI1MEM(Chimu_31,Chi_01,1,%r8) \ - VACCTIMESI1MEM(Chimu_32,Chi_02,2,%r8) \ - VACCTIMESI1MEM(Chimu_20,Chi_10,3,%r8) \ - VACCTIMESI1MEM(Chimu_21,Chi_11,4,%r8) \ - VACCTIMESI1MEM(Chimu_22,Chi_12,5,%r8) \ - VACCTIMESI2MEM(Chimu_30,Chi_00,0,%r8) \ - VACCTIMESI2MEM(Chimu_31,Chi_01,1,%r8) \ - VACCTIMESI2MEM(Chimu_32,Chi_02,2,%r8) \ - VACCTIMESI2MEM(Chimu_20,Chi_10,3,%r8) \ - VACCTIMESI2MEM(Chimu_21,Chi_11,4,%r8) \ - VACCTIMESI2MEM(Chimu_22,Chi_12,5,%r8) ); - - -#define YP_PROJ __asm__ ( \ - VSUB(Chimu_30,Chimu_00,Chi_00)\ - VSUB(Chimu_31,Chimu_01,Chi_01)\ - VSUB(Chimu_32,Chimu_02,Chi_02)\ - VADD(Chimu_10,Chimu_20,Chi_10)\ - VADD(Chimu_11,Chimu_21,Chi_11)\ - VADD(Chimu_12,Chimu_22,Chi_12) ); - -#define EVICT_SPINOR(reg) \ - VEVICT(0,reg) \ - VEVICT(1,reg) \ - VEVICT(2,reg) \ - VEVICT(3,reg) \ - VEVICT(4,reg) \ - VEVICT(5,reg) \ - VEVICT(6,reg) \ - VEVICT(7,reg) \ - VEVICT(8,reg) \ - VEVICT(9,reg) \ - VEVICT(9,reg) \ - VEVICT(10,reg) \ - VEVICT(11,reg) + VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \ + VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \ + VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \ + VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \ + VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \ + VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \ + VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \ + VACCTIMESI2(Chi_12,Chi_12,Chimu_22) ); #define YP_PROJMEM(ptr) \ @@ -729,43 +690,24 @@ Author: paboyle VADDMEM(6,%r8,Chimu_10,Chi_10) \ VADDMEM(7,%r8,Chimu_11,Chi_11) \ VADDMEM(8,%r8,Chimu_12,Chi_12) ); - // EVICT_SPINOR(%r8) ); - -#define ZP_PROJ __asm__ ( \ - VACCTIMESI(Chimu_20,Chi_00,Z0) \ - VACCTIMESI(Chimu_21,Chi_01,Z1) \ - VACCTIMESI(Chimu_22,Chi_02,Z2) \ - VACCTIMESMINUSI(Chimu_30,Chi_10,Z3) \ - VACCTIMESMINUSI(Chimu_31,Chi_11,Z4) \ - VACCTIMESMINUSI(Chimu_32,Chi_12,Z5) ); #define ZP_PROJMEM(PTR) \ LOAD64(%r8,PTR) \ __asm__ ( \ + LOAD_CHIi \ SHUF_CHIMU23i \ - VACCTIMESI1MEM(Chimu_20,Chi_00,0,%r8) \ - VACCTIMESI1MEM(Chimu_21,Chi_01,1,%r8) \ - VACCTIMESI1MEM(Chimu_22,Chi_02,2,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_30,Chi_10,3,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_31,Chi_11,4,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_32,Chi_12,5,%r8) \ - VACCTIMESI2MEM(Chimu_20,Chi_00,0,%r8) \ - VACCTIMESI2MEM(Chimu_21,Chi_01,1,%r8) \ - VACCTIMESI2MEM(Chimu_22,Chi_02,2,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_30,Chi_10,3,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_31,Chi_11,4,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_32,Chi_12,5,%r8) \ - EVICT_SPINOR(%r8) ); - - - -#define TP_PROJ __asm__ ( \ - VADD(Chimu_00,Chimu_20,Chi_00) \ - VADD(Chimu_01,Chimu_21,Chi_01) \ - VADD(Chimu_02,Chimu_22,Chi_02) \ - VADD(Chimu_10,Chimu_30,Chi_10) \ - VADD(Chimu_11,Chimu_31,Chi_11) \ - VADD(Chimu_12,Chimu_32,Chi_12) ); + VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \ + VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \ + VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \ + VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \ + VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \ + VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \ + VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \ + VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) ); #define TP_PROJMEM(ptr) \ @@ -777,44 +719,28 @@ Author: paboyle VADDMEM(8,%r8,Chimu_02,Chi_02) \ VADDMEM(9,%r8,Chimu_10,Chi_10) \ VADDMEM(10,%r8,Chimu_11,Chi_11) \ - VADDMEM(11,%r8,Chimu_12,Chi_12) \ - EVICT_SPINOR(%r8) ); - + VADDMEM(11,%r8,Chimu_12,Chi_12) ); // hspin(0)=fspin(0)-timesI(fspin(3)) // hspin(1)=fspin(1)-timesI(fspin(2)) -#define XM_PROJ __asm__ ( \ - VACCTIMESMINUSI(Chimu_30,Chi_00,Z0) \ - VACCTIMESMINUSI(Chimu_31,Chi_01,Z1) \ - VACCTIMESMINUSI(Chimu_32,Chi_02,Z2) \ - VACCTIMESMINUSI(Chimu_20,Chi_10,Z3) \ - VACCTIMESMINUSI(Chimu_21,Chi_11,Z4) \ - VACCTIMESMINUSI(Chimu_22,Chi_12,Z5) ); #define XM_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ + LOAD64(%r8,PTR)\ __asm__ ( \ SHUF_CHIMU23i \ - VACCTIMESMINUSI1MEM(Chimu_30,Chi_00,0,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_31,Chi_01,1,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_32,Chi_02,2,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_20,Chi_10,3,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_21,Chi_11,4,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_22,Chi_12,5,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_30,Chi_00,0,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_31,Chi_01,1,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_32,Chi_02,2,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_20,Chi_10,3,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_21,Chi_11,4,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_22,Chi_12,5,%r8) ); - -#define YM_PROJ __asm__ ( \ - VADD(Chimu_00,Chimu_30,Chi_00)\ - VADD(Chimu_01,Chimu_31,Chi_01)\ - VADD(Chimu_02,Chimu_32,Chi_02)\ - VSUB(Chimu_20,Chimu_10,Chi_10)\ - VSUB(Chimu_21,Chimu_11,Chi_11)\ - VSUB(Chimu_22,Chimu_12,Chi_12) ); + LOAD_CHIi \ + VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ + VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ + VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ + VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\ + VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\ + VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\ + VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\ + VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\ + VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\ + VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\ + VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\ + VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) ); #define YM_PROJMEM(ptr) \ LOAD64(%r8,ptr) \ @@ -825,45 +751,25 @@ Author: paboyle VADDMEM(11,%r8,Chimu_02,Chi_02) \ VSUBMEM(6,%r8,Chimu_10,Chi_10) \ VSUBMEM(7,%r8,Chimu_11,Chi_11) \ - VSUBMEM(8,%r8,Chimu_12,Chi_12) \ - EVICT_SPINOR(%r8) ); - - -#define ZM_PROJ __asm__ ( \ - VACCTIMESMINUSI(Chimu_20,Chi_00,Z0)\ - VACCTIMESMINUSI(Chimu_21,Chi_01,Z1)\ - VACCTIMESMINUSI(Chimu_22,Chi_02,Z2)\ - VACCTIMESI(Chimu_30,Chi_10,Z3)\ - VACCTIMESI(Chimu_31,Chi_11,Z4)\ - VACCTIMESI(Chimu_32,Chi_12,Z5)); + VSUBMEM(8,%r8,Chimu_12,Chi_12) ); #define ZM_PROJMEM(PTR) \ LOAD64(%r8,PTR) \ __asm__ ( \ SHUF_CHIMU23i \ - VACCTIMESMINUSI1MEM(Chimu_20,Chi_00,0,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_21,Chi_01,1,%r8) \ - VACCTIMESMINUSI1MEM(Chimu_22,Chi_02,2,%r8) \ - VACCTIMESI1MEM(Chimu_30,Chi_10,3,%r8) \ - VACCTIMESI1MEM(Chimu_31,Chi_11,4,%r8) \ - VACCTIMESI1MEM(Chimu_32,Chi_12,5,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_20,Chi_00,0,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_21,Chi_01,1,%r8) \ - VACCTIMESMINUSI2MEM(Chimu_22,Chi_02,2,%r8) \ - VACCTIMESI2MEM(Chimu_30,Chi_10,3,%r8) \ - VACCTIMESI2MEM(Chimu_31,Chi_11,4,%r8) \ - VACCTIMESI2MEM(Chimu_32,Chi_12,5,%r8) \ - EVICT_SPINOR(%r8) ); - - -#define TM_PROJ __asm__ ( \ - VSUB(Chimu_20,Chimu_00,Chi_00)\ - VSUB(Chimu_21,Chimu_01,Chi_01)\ - VSUB(Chimu_22,Chimu_02,Chi_02)\ - VSUB(Chimu_30,Chimu_10,Chi_10)\ - VSUB(Chimu_31,Chimu_11,Chi_11)\ - VSUB(Chimu_32,Chimu_12,Chi_12) ); - + LOAD_CHIi \ + VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ + VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ + VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ + VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\ + VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\ + VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\ + VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\ + VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\ + VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\ + VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\ + VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\ + VACCTIMESI2(Chi_12,Chi_12,Chimu_32) ); #define TM_PROJMEM(ptr) \ LOAD64(%r8,ptr) \ @@ -874,8 +780,7 @@ Author: paboyle VSUBMEM(8,%r8,Chimu_02,Chi_02) \ VSUBMEM(9,%r8,Chimu_10,Chi_10) \ VSUBMEM(10,%r8,Chimu_11,Chi_11) \ - VSUBMEM(11,%r8,Chimu_12,Chi_12) \ - EVICT_SPINOR(%r8) ); + VSUBMEM(11,%r8,Chimu_12,Chi_12) ); // fspin(0)=hspin(0) // fspin(1)=hspin(1) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 12a168ef..5d014137 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -39,7 +39,7 @@ Author: paboyle #include - +namespace Grid{ namespace Optimization { struct Vsplat{ @@ -246,26 +246,30 @@ namespace Optimization { struct TimesMinusI{ //Complex single inline __m512 operator()(__m512 in, __m512 ret){ - __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag - return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E?? + //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag + //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E?? + __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); } //Complex double inline __m512d operator()(__m512d in, __m512d ret){ - __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag - return _mm512_shuffle_pd(tmp,tmp,0x55); + //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag + //return _mm512_shuffle_pd(tmp,tmp,0x55); + __m512d tmp = _mm512_shuffle_pd(in,in,0x55); + return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); } }; struct TimesI{ //Complex single inline __m512 operator()(__m512 in, __m512 ret){ - __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); - return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); + __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); + return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); } //Complex double inline __m512d operator()(__m512d in, __m512d ret){ - __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55); - return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); + __m512d tmp = _mm512_shuffle_pd(in,in,0x55); + return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); } @@ -345,7 +349,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types -namespace Grid { + typedef __m512 SIMD_Ftype; // Single precision type typedef __m512d SIMD_Dtype; // Double precision type typedef __m512i SIMD_Itype; // Integer type diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 2d74ba9b..d5025554 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -145,7 +145,7 @@ void Tester(const functor &func) int ok=0; for(int i=0;i0){ + if ( abs(reference[i]-result[i])>1.0e-7){ std::cout< using namespace Grid; using namespace Grid::QCD; + +void ZmulF(void *ptr1,void *ptr2,void *ptr3); +void Zmul(void *ptr1,void *ptr2,void *ptr3); void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3); void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3); void TimesIAvx512F(void *ptr1,void *ptr3); void TimesIAvx512(void *ptr1,void *ptr3); +void TimesMinusIAvx512F(void *ptr1,void *ptr3); +void TimesMinusIAvx512(void *ptr1,void *ptr3); @@ -63,50 +68,106 @@ int main(int argc,char **argv) vColourMatrixD mat; vHalfSpinColourVectorD vec; + vHalfSpinColourVectorD vec1; + vHalfSpinColourVectorD vec2; + vHalfSpinColourVectorD vec3; + vHalfSpinColourVectorD matvec; vHalfSpinColourVectorD ref; vComplexD err; + random(sRNG,vec1); + vec1 = std::complex(0.1,3.0); + random(sRNG,vec2); + vec2=2.0; + random(sRNG,vec3); + + //std::cout << "Zmul vec1"< U(4,UGrid); for(int mu=0;mu(Umu,mu); @@ -157,7 +219,7 @@ int main(int argc,char **argv) } t1=usecond(); - +#if 1 for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ Dw.DhopOE(srce,resulta,0); PerformanceCounter Counter(i); @@ -166,14 +228,28 @@ int main(int argc,char **argv) Counter.Stop(); Counter.Report(); } - resulta = (-0.5) * resulta; +#endif + //resulta = (-0.5) * resulta; std::cout<