diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index dfaea627..1e51c9d2 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -113,6 +113,36 @@ int main (int argc, char ** argv) std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + zDw.CayleyReport(); \ + std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout< gamma(Ls,std::complex(1.0,0.0)); + ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c); + std::cout<Barrier(); @@ -173,10 +209,13 @@ int main (int argc, char ** argv) BENCH_DW_MEO(Dhop ,src,result); BENCH_DW_MEO(DhopEO ,src_o,r_e); - BENCH_DW(Meooe ,src_o,r_e); + BENCH_DW_SSC(Meooe ,src_o,r_e); BENCH_DW(Mooee ,src_o,r_o); BENCH_DW(MooeeInv,src_o,r_o); + BENCH_ZDW(Mooee ,src_o,r_o); + BENCH_ZDW(MooeeInv,src_o,r_o); + } Grid_finalize(); diff --git a/lib/AlignedAllocator.cc b/lib/AlignedAllocator.cc new file mode 100644 index 00000000..9df4ec1c --- /dev/null +++ b/lib/AlignedAllocator.cc @@ -0,0 +1,65 @@ + + + +#include + +namespace Grid { + +int PointerCache::victim; + + PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; + +void *PointerCache::Insert(void *ptr,size_t bytes) { + + if (bytes < 4096 ) return NULL; + +#ifdef _OPENMP + assert(omp_in_parallel()==0); +#endif + void * ret = NULL; + int v = -1; + + for(int e=0;e namespace Grid { + class PointerCache { + private: + + static const int Ncache=8; + static int victim; + + typedef struct { + void *address; + size_t bytes; + int valid; + } PointerCacheEntry; + + static PointerCacheEntry Entries[Ncache]; + + public: + + + static void *Insert(void *ptr,size_t bytes) ; + static void *Lookup(size_t bytes) ; + + }; + //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. //////////////////////////////////////////////////////////////////// + template class alignedAllocator { public: @@ -66,27 +89,27 @@ public: pointer allocate(size_type __n, const void* _p= 0) { + size_type bytes = __n*sizeof(_Tp); + + _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); + #ifdef HAVE_MM_MALLOC_H - _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); #else - _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); #endif - _Tp tmp; -#ifdef GRID_NUMA -#pragma omp parallel for schedule(static) - for(int i=0;i<__n;i++){ - ptr[i]=tmp; - } -#endif return ptr; } - void deallocate(pointer __p, size_type) { + void deallocate(pointer __p, size_type __n) { + size_type bytes = __n * sizeof(_Tp); + pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); + #ifdef HAVE_MM_MALLOC_H - _mm_free((void *)__p); + if ( __freeme ) _mm_free((void *)__freeme); #else - free((void *)__p); + if ( __freeme ) free((void *)__freeme); #endif } void construct(pointer __p, const _Tp& __val) { }; diff --git a/lib/PerfCount.h b/lib/PerfCount.h index 5ab07c02..749441c5 100644 --- a/lib/PerfCount.h +++ b/lib/PerfCount.h @@ -205,12 +205,13 @@ public: void Stop(void) { count=0; cycles=0; + size_t ign; #ifdef __linux__ if ( fd!= -1) { ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); - ::read(fd, &count, sizeof(long long)); - ::read(cyclefd, &cycles, sizeof(long long)); + ign=::read(fd, &count, sizeof(long long)); + ign=::read(cyclefd, &cycles, sizeof(long long)); } elapsed = cyclecount() - begin; #else diff --git a/lib/Stencil.h b/lib/Stencil.h index 5c3a5ef9..89533b82 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -113,7 +113,7 @@ Gather_plane_simple_table (std::vector >& table,const Lattice { PARALLEL_FOR_LOOP for(int i=0;i *************************************************************************************/ /* END LEGAL */ +#include #include @@ -48,18 +49,18 @@ namespace QCD { FourDimGrid, FourDimRedBlackGrid,_M5,p), mass(_mass) - { } + { + } template void CayleyFermion5D::Dminus(const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - this->DW(psi,tmp,DaggerNo); + this->DW(psi,this->tmp(),DaggerNo); for(int s=0;stmp(),s,s);// chi = (1-c[s] D_W) psi } } @@ -87,8 +88,8 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; - // Flops = 9*12*Ls*vol/2 - RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex + RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } @@ -110,12 +111,11 @@ template void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - this->DW(psi,tmp,DaggerYes); + this->DW(psi,this->tmp(),DaggerYes); for(int s=0;stmp(),s,s);// chi = (1-c[s] D_W) psi } } template @@ -138,6 +138,7 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D lower[0] =-mass*lower[0]; M5D(psi,psi,Din,lower,diag,upper); } +// FIXME Redunant with the above routine; check this and eliminate template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; @@ -259,36 +260,33 @@ template void CayleyFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - Meooe5D(psi,tmp); + Meooe5D(psi,this->tmp()); if ( psi.checkerboard == Odd ) { - this->DhopEO(tmp,chi,DaggerNo); + this->DhopEO(this->tmp(),chi,DaggerNo); } else { - this->DhopOE(tmp,chi,DaggerNo); + this->DhopOE(this->tmp(),chi,DaggerNo); } } template void CayleyFermion5D::MeooeDag (const FermionField &psi, FermionField &chi) { - FermionField tmp(psi._grid); // Apply 4d dslash if ( psi.checkerboard == Odd ) { - this->DhopEO(psi,tmp,DaggerYes); + this->DhopEO(psi,this->tmp(),DaggerYes); } else { - this->DhopOE(psi,tmp,DaggerYes); + this->DhopOE(psi,this->tmp(),DaggerYes); } - MeooeDag5D(tmp,chi); + MeooeDag5D(this->tmp(),chi); } template void CayleyFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ - FermionField tmp(psi._grid); - Meo5D(psi,tmp); + Meo5D(psi,this->tmp()); // Apply 4d dslash fragment - this->DhopDir(tmp,chi,dir,disp); + this->DhopDir(this->tmp(),chi,dir,disp); } // force terms; five routines; default to Dhop on diagonal template @@ -459,9 +457,91 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorMooeeInternalCompute(0,inv,MatpInv,MatmInv); + this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); + } +template +void CayleyFermion5D::MooeeInternalCompute(int dag, int inv, + Vector > & Matp, + Vector > & Matm) +{ + int Ls=this->Ls; + + GridBase *grid = this->FermionRedBlackGrid(); + int LLs = grid->_rdimensions[0]; + + if ( LLs == Ls ) return; // Not vectorised in 5th direction + + Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); + Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); + + for(int s=0;s::iscomplex() ) { + sp[l] = PplusMat (l*istride+s1*ostride,s2); + sm[l] = PminusMat(l*istride+s1*ostride,s2); + } else { + // if real + scalar_type tmp; + tmp = PplusMat (l*istride+s1*ostride,s2); + sp[l] = scalar_type(tmp.real(),tmp.real()); + tmp = PminusMat(l*istride+s1*ostride,s2); + sm[l] = scalar_type(tmp.real(),tmp.real()); + } + } + Matp[LLs*s2+s1] = Vp; + Matm[LLs*s2+s1] = Vm; + }} +} + FermOpTemplateInstantiate(CayleyFermion5D); GparityFermOpTemplateInstantiate(CayleyFermion5D); diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 6fb58234..86255be6 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -33,6 +33,31 @@ namespace Grid { namespace QCD { + template struct switcheroo { + static inline int iscomplex() { return 0; } + + template + static inline vec mult(vec a, vec b) { + return real_mult(a,b); + } + }; + template<> struct switcheroo { + static inline int iscomplex() { return 1; } + + template + static inline vec mult(vec a, vec b) { + return a*b; + } + }; + template<> struct switcheroo { + static inline int iscomplex() { return 1; } + template + static inline vec mult(vec a, vec b) { + return a*b; + } + }; + + template class CayleyFermion5D : public WilsonFermion5D { @@ -75,7 +100,19 @@ namespace Grid { std::vector &lower, std::vector &diag, std::vector &upper); + void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); + void MooeeInternalCompute(int dag, int inv, Vector > & Matp, Vector > & Matm); + + void MooeeInternalAsm(const FermionField &in, FermionField &out, + int LLs, int site, + Vector > &Matp, + Vector > &Matm); + void MooeeInternalZAsm(const FermionField &in, FermionField &out, + int LLs, int site, + Vector > &Matp, + Vector > &Matm); + virtual void Instantiatable(void)=0; @@ -112,6 +149,12 @@ namespace Grid { std::vector ueem; std::vector dee; + // Matrices of 5d ee inverse params + Vector > MatpInv; + Vector > MatmInv; + Vector > MatpInvDag; + Vector > MatmInvDag; + // Constructors CayleyFermion5D(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 35a10de2..ed742ea3 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -29,13 +29,12 @@ Author: paboyle *************************************************************************************/ /* END LEGAL */ -#include + #include namespace Grid { -namespace QCD { - /* +namespace QCD { /* * Dense matrix versions of routines */ template @@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP for(int v=0;v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); @@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } - /* - if ( ss==0) std::cout << " dphi_00 " <::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(),hm_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(),hm_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(),hm_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(),hm_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(),hm_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(),hm_12); + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(),hp_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(),hp_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(),hp_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(),hp_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(),hp_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(),hp_12); - - // if ( ss==0){ - /* - std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<::M5Ddag(const FermionField &psi, M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs - +#if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; @@ -287,9 +260,504 @@ PARALLEL_FOR_LOOP chi[ss+v] = chi[ss+v] +l[v]*fm; } +#else + for(int v=0;v(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + if ( vm>=v ) { + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } + + Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(),hp_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(),hp_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(),hp_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(),hp_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(),hp_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(),hp_12); + + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(),hm_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(),hm_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(),hm_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(),hm_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(),hm_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(),hm_12); + + vstream(chi[ss+v]()(0)(0),p_00); + vstream(chi[ss+v]()(0)(1),p_01); + vstream(chi[ss+v]()(0)(2),p_02); + vstream(chi[ss+v]()(1)(0),p_10); + vstream(chi[ss+v]()(1)(1),p_11); + vstream(chi[ss+v]()(1)(2),p_12); + vstream(chi[ss+v]()(2)(0),p_20); + vstream(chi[ss+v]()(2)(1),p_21); + vstream(chi[ss+v]()(2)(2),p_22); + vstream(chi[ss+v]()(3)(0),p_30); + vstream(chi[ss+v]()(3)(1),p_31); + vstream(chi[ss+v]()(3)(2),p_32); + } +#endif } M5Dtime+=usecond(); } + + +#ifdef AVX512 +#include +#include +#include +#endif + +template +void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionField &chi, + int LLs, int site, + Vector > &Matp, + Vector > &Matm) +{ +#ifndef AVX512 + { + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; + + // Ls*Ls * 2 * 12 * vol flops + for(int s1=0;s1); + for(int s1=0;s1 +void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionField &chi, + int LLs, int site, Vector > &Matp, Vector > &Matm) +{ +#ifndef AVX512 + { + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; + + // Ls*Ls * 2 * 12 * vol flops + for(int s1=0;s1); + for(int s1=0;s1 void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { @@ -299,108 +767,41 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField chi.checkerboard=psi.checkerboard; - Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); - Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); + Vector > Matp; + Vector > Matm; + Vector > *_Matp; + Vector > *_Matm; - for(int s=0;s > Matp(Ls*LLs); - Vector > Matm(Ls*LLs); + assert(_Matp->size()==Ls*LLs); - for(int s2=0;s2 SitePplus(LLs); - Vector SitePminus(LLs); - Vector SiteChiP(LLs); - Vector SiteChiM(LLs); - Vector SiteChi(LLs); - - SiteHalfSpinor BcastP; - SiteHalfSpinor BcastM; - -#pragma omp for - for(auto site=0;site::iscomplex() ) { + PARALLEL_FOR_LOOP + for(auto site=0;site::MooeeInternal(const Fermion template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + }} diff --git a/lib/qcd/action/fermion/FermionOperator.h b/lib/qcd/action/fermion/FermionOperator.h index 742c6e08..676a0e83 100644 --- a/lib/qcd/action/fermion/FermionOperator.h +++ b/lib/qcd/action/fermion/FermionOperator.h @@ -48,6 +48,8 @@ namespace Grid { FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {}; + virtual FermionField &tmp(void) = 0; + GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 99baa8a0..04c3671f 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -61,7 +61,9 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), - UmuOdd(&Hgrid) { + UmuOdd(&Hgrid), + _tmp(&Hgrid) +{ // Allocate the required comms buffer ImportGauge(_Umu); } diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 40fbd1bf..933be732 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { GridBase *FermionGrid(void) { return _grid; } GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d2ac96e3..d70c98c3 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -60,7 +60,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, UmuEven(_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid), Lebesgue(_FourDimGrid), - LebesgueEvenOdd(_FourDimRedBlackGrid) + LebesgueEvenOdd(_FourDimRedBlackGrid), + _tmp(&FiveDimRedBlackGrid) { if (Impl::LsVectorised) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index ffb5c58e..fb4fa925 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -74,6 +74,9 @@ namespace QCD { typedef WilsonKernels Kernels; PmuStat stat; + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + void Report(void); void ZeroCounters(void); double DhopCalls; diff --git a/lib/serialisation/BaseIO.h b/lib/serialisation/BaseIO.h index 7761a8e6..1095baf1 100644 --- a/lib/serialisation/BaseIO.h +++ b/lib/serialisation/BaseIO.h @@ -83,12 +83,7 @@ namespace Grid { typename std::enable_if::value, void>::type write(const std::string& s, const U &output); template - typename std::enable_if::value, void>::type - write(const std::string& s, const U &output); - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type write(const std::string& s, const U &output); private: T *upcast; @@ -107,12 +102,7 @@ namespace Grid { typename std::enable_if::value, void>::type read(const std::string& s, U &output); template - typename std::enable_if::value, void>::type - read(const std::string& s, U &output); - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type read(const std::string& s, U &output); protected: template @@ -221,17 +211,7 @@ namespace Grid { template template - typename std::enable_if::value, void>::type - Writer::write(const std::string &s, const U &output) - { - EnumIO::write(*this, s, output); - } - - template - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type Writer::write(const std::string &s, const U &output) { upcast->writeDefault(s, output); @@ -266,17 +246,7 @@ namespace Grid { template template - typename std::enable_if::value, void>::type - Reader::read(const std::string &s, U &output) - { - EnumIO::read(*this, s, output); - } - - template - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type Reader::read(const std::string &s, U &output) { upcast->readDefault(s, output); @@ -300,7 +270,6 @@ namespace Grid { abort(); } } - } #endif diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index c78bba0c..c9137dfe 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -114,35 +114,33 @@ THE SOFTWARE. #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B); #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...) \ - \ - \ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \ - \ - \ - template \ - static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ - push(WR,s);\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ - pop(WR);\ - } \ - \ - \ - template \ - static inline void read(Reader &RD,const std::string &s, cname &obj){ \ - push(RD,s);\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ - pop(RD);\ - } \ - \ - \ - friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ - os<<"class "<<#cname<<" {"<\ +static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ + push(WR,s);\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ + pop(WR);\ +} \ +\ +\ +template \ +static inline void read(Reader &RD,const std::string &s, cname &obj){ \ + push(RD,s);\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ + pop(RD);\ +} \ +\ +\ +friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ + os<<"class "<<#cname<<" {"<::type #define GRID_MACRO_ENUMVAL(A,B) A = B, @@ -150,44 +148,52 @@ THE SOFTWARE. #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;} #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break; -namespace Grid { - template - class EnumIO {}; -} - #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\ - enum class name {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ - undefname = -1\ +class name: public Serializable\ +{\ +public:\ + enum EnumType\ + {\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ + undefname = -1\ };\ +public:\ + name(void): value_(undefname) {};\ + name(EnumType value): value_(value) {};\ + template \ + static inline void write(Writer &WR,const std::string &s, const name &obj)\ + {\ + switch (obj.value_)\ + {\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ + default: Grid::write(WR,s,#undefname); break;\ + }\ + }\ \ - template<>\ - class EnumIO {\ - public:\ - template \ - static inline void write(Writer &WR,const std::string &s, const name &obj){ \ - switch (obj) {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ - default: Grid::write(WR,s,#undefname); break;\ - }\ - }\ - \ - template \ - static inline void read(Reader &RD,const std::string &s, name &obj){ \ - std::string buf;\ - Grid::read(RD, s, buf);\ - if (buf == #undefname) {obj = name::undefname;}\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\ - else {obj = name::undefname;}\ - }\ - };\ - \ - inline std::ostream & operator << (std::ostream &os, const name &obj ) { \ + template \ + static inline void read(Reader &RD,const std::string &s, name &obj)\ + {\ + std::string buf;\ + Grid::read(RD, s, buf);\ + if (buf == #undefname) {obj = name::undefname;}\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\ + else {obj = name::undefname;}\ + }\ + inline operator EnumType(void) const\ + {\ + return value_;\ + }\ + inline friend std::ostream & operator<<(std::ostream &os, const name &obj)\ + {\ switch (obj) {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ - default: os << #undefname; break;\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ + default: os << #undefname; break;\ }\ return os;\ - }; + }\ +private:\ + EnumType value_;\ +}; + #endif diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 36360102..724f52bb 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -213,6 +213,29 @@ namespace Optimization { } }; + struct MultRealPart{ + inline __m256 operator()(__m256 a, __m256 b){ + __m256 ymm0; + ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m256d operator()(__m256d a, __m256d b){ + __m256d ymm0; + ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m256 operator()(__m256 a, __m256 b, __m256 c){ + __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar, + return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); + } + inline __m256d operator()(__m256d a, __m256d b, __m256d c){ + __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 ); + return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c); + } + }; + struct MultComplex{ // Complex float inline __m256 operator()(__m256 a, __m256 b){ @@ -627,7 +650,9 @@ namespace Optimization { typedef Optimization::Sub SubSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index d6531d57..ebf99e16 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -189,6 +189,29 @@ namespace Optimization { // 2mul,4 mac +add+sub = 8 flop type insns // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. + struct MultRealPart{ + inline __m512 operator()(__m512 a, __m512 b){ + __m512 ymm0; + ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m512d operator()(__m512d a, __m512d b){ + __m512d ymm0; + ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m512 operator()(__m512 a, __m512 b, __m512 c){ + __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_fmadd_ps( ymm0, b, c); + } + inline __m512d operator()(__m512d a, __m512d b, __m512d c){ + __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 ); + return _mm512_fmadd_pd( ymm0, b, c); + } + }; + struct MultComplex{ // Complex float inline __m512 operator()(__m512 a, __m512 b){ @@ -501,6 +524,8 @@ namespace Optimization { typedef Optimization::Mult MultSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 62c78afb..91e9cda2 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -224,6 +224,21 @@ namespace Optimization { #define cmul(a, b, c, i)\ c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; + + struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + vec out; + + VECTOR_FOR(i, W::c, 1) + { + out.v[2*i] = a[2*i]*b[2*i]; + out.v[2*i+1] = a[2*i]*b[2*i+1]; + } + return out; + }; + }; + struct MultComplex{ // Complex @@ -456,6 +471,7 @@ namespace Optimization { typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index bc86291d..99a9ea68 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -220,6 +220,14 @@ namespace Optimization { } }; + struct MultRealPart{ + // Complex double + inline vector4double operator()(vector4double a, vector4double b){ + // return vec_xmul(b, a); + return vec_xmul(a, b); + } + FLOAT_WRAP_2(operator(), inline) + }; struct MultComplex{ // Complex double inline vector4double operator()(vector4double a, vector4double b){ @@ -430,6 +438,7 @@ typedef Optimization::Sub SubSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 560eda11..943756b2 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -177,6 +177,29 @@ namespace Optimization { } }; + struct MultRealPart{ + inline __m128 operator()(__m128 a, __m128 b){ + __m128 ymm0; + ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m128d operator()(__m128d a, __m128d b){ + __m128d ymm0; + ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m128 operator()(__m128 a, __m128 b, __m128 c){ + __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_add_ps(_mm_mul_ps( ymm0, b),c); + } + inline __m128d operator()(__m128d a, __m128d b, __m128d c){ + __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); + return _mm_add_pd(_mm_mul_pd( ymm0, b),c); + } + }; + struct MultComplex{ // Complex float inline __m128 operator()(__m128 a, __m128 b){ @@ -325,9 +348,11 @@ namespace Optimization { } } +#ifndef _mm_alignr_epi64 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) - +#endif + template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; @@ -415,6 +440,8 @@ namespace Optimization { typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 42f28b34..8a6ab2e7 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -101,6 +101,11 @@ template using IfNotInteger = Invoke +Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { + return op(src_1, src_2, src_3); +} + template Out binary(Input1 src_1, Input2 src_2, Operation op) { return op(src_1, src_2); @@ -178,6 +183,7 @@ class Grid_simd { const Grid_simd *__restrict__ r) { *y = (*l) * (*r); } + friend inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -188,7 +194,6 @@ class Grid_simd { const Grid_simd *__restrict__ r) { *y = (*l) + (*r); } - friend inline void mac(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ a, const Grid_simd *__restrict__ x) { @@ -260,7 +265,7 @@ class Grid_simd { } //////////////////////////// - // opreator scalar * simd + // operator scalar * simd //////////////////////////// friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { Grid_simd va; @@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); } +template =0> +inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + ret.v = unary(real(typepun[lane]), VsplatSIMD()); +} /////////////////////// // Splat @@ -449,6 +459,10 @@ template inline void vsplat(Grid_simd &ret, EnableIf, S> c) { vsplat(ret, real(c), imag(c)); } +template +inline void rsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} // if real fill with a, if complex fill with a in the real part (first function // above) @@ -550,6 +564,21 @@ inline Grid_simd operator-(Grid_simd a, Grid_simd b) { return ret; }; +// Distinguish between complex types and others +template = 0> +inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; +template = 0> +inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); + return ret; +}; + + // Distinguish between complex types and others template = 0> inline Grid_simd operator*(Grid_simd a, Grid_simd b) { diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index 19157db4..7b5964ad 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -95,10 +95,14 @@ Author: paboyle #define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2 #define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n" -#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" -#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" -#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" -#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" +#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" +#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" +#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) +#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" @@ -106,11 +110,15 @@ Author: paboyle #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n" +#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" +#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index cfa20c26..e69e541c 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -87,7 +87,8 @@ Author: paboyle VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI2d(A,ACC,tmp) -#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); +#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A +#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr)); #define LOAD64(A,ptr) LOAD64i(A,ptr) #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" @@ -108,8 +109,8 @@ Author: paboyle //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" // "clevict0 "#O"*64("#A");\n" -#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n" -#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n" #define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n" #define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n" @@ -143,8 +144,8 @@ Author: paboyle #define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #else -#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #endif // Swaps Re/Im ; could unify this with IMCI diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index 224c593d..632b5639 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum) #define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum) diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index 3fa47668..ed135651 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum) diff --git a/tests/debug/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc similarity index 100% rename from tests/debug/Test_cayley_even_odd_vec.cc rename to tests/Test_cayley_even_odd_vec.cc