1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Merge branch 'develop' into feature/qed-fvol

This commit is contained in:
Antonin Portelli 2016-12-20 12:33:02 +01:00
commit 6f1ea96293
26 changed files with 1067 additions and 284 deletions

View File

@ -113,6 +113,36 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\ std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl; std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_ZDW(A,in,out) \
zDw.CayleyZeroCounters(); \
zDw. A (in,out); \
FGrid->Barrier(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
zDw. A (in,out); \
} \
t1=usecond(); \
FGrid->Barrier(); \
zDw.CayleyReport(); \
std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_DW_SSC(A,in,out) \
Dw.CayleyZeroCounters(); \
Dw. A (in,out); \
FGrid->Barrier(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
__SSC_START ; \
Dw. A (in,out); \
__SSC_STOP ; \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_DW_MEO(A,in,out) \ #define BENCH_DW_MEO(A,in,out) \
Dw.CayleyZeroCounters(); \ Dw.CayleyZeroCounters(); \
Dw. A (in,out,0); \ Dw. A (in,out,0); \
@ -148,9 +178,15 @@ int main (int argc, char ** argv)
LatticeFermion sref(sFGrid); LatticeFermion sref(sFGrid);
LatticeFermion result(sFGrid); LatticeFermion result(sFGrid);
std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl; std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5); DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
RealD b=1.5;// Scale factor b+c=2, b-c=1
RealD c=0.5;
std::vector<ComplexD> gamma(Ls,std::complex<double>(1.0,0.0));
ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c);
std::cout<<GridLogMessage << "Calling Dhop "<<std::endl; std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
FGrid->Barrier(); FGrid->Barrier();
@ -173,10 +209,13 @@ int main (int argc, char ** argv)
BENCH_DW_MEO(Dhop ,src,result); BENCH_DW_MEO(Dhop ,src,result);
BENCH_DW_MEO(DhopEO ,src_o,r_e); BENCH_DW_MEO(DhopEO ,src_o,r_e);
BENCH_DW(Meooe ,src_o,r_e); BENCH_DW_SSC(Meooe ,src_o,r_e);
BENCH_DW(Mooee ,src_o,r_o); BENCH_DW(Mooee ,src_o,r_o);
BENCH_DW(MooeeInv,src_o,r_o); BENCH_DW(MooeeInv,src_o,r_o);
BENCH_ZDW(Mooee ,src_o,r_o);
BENCH_ZDW(MooeeInv,src_o,r_o);
} }
Grid_finalize(); Grid_finalize();

65
lib/AlignedAllocator.cc Normal file
View File

@ -0,0 +1,65 @@
#include <Grid/Grid.h>
namespace Grid {
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef _OPENMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef _OPENMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
}

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -42,9 +42,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
class PointerCache {
private:
static const int Ncache=8;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized. // A lattice of something, but assume the something is SIMDized.
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template<typename _Tp> template<typename _Tp>
class alignedAllocator { class alignedAllocator {
public: public:
@ -66,27 +89,27 @@ public:
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {
size_type bytes = __n*sizeof(_Tp);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
#else #else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
#endif #endif
_Tp tmp;
#ifdef GRID_NUMA
#pragma omp parallel for schedule(static)
for(int i=0;i<__n;i++){
ptr[i]=tmp;
}
#endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type) { void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p); if ( __freeme ) _mm_free((void *)__freeme);
#else #else
free((void *)__p); if ( __freeme ) free((void *)__freeme);
#endif #endif
} }
void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p, const _Tp& __val) { };

View File

@ -205,12 +205,13 @@ public:
void Stop(void) { void Stop(void) {
count=0; count=0;
cycles=0; cycles=0;
size_t ign;
#ifdef __linux__ #ifdef __linux__
if ( fd!= -1) { if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long)); ign=::read(fd, &count, sizeof(long long));
::read(cyclefd, &cycles, sizeof(long long)); ign=::read(cyclefd, &cycles, sizeof(long long));
} }
elapsed = cyclecount() - begin; elapsed = cyclecount() - begin;
#else #else

View File

@ -113,7 +113,7 @@ Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice
{ {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){ for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]); vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
} }
} }

View File

@ -29,6 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/Eigen/Dense>
#include <Grid.h> #include <Grid.h>
@ -48,18 +49,18 @@ namespace QCD {
FourDimGrid, FourDimGrid,
FourDimRedBlackGrid,_M5,p), FourDimRedBlackGrid,_M5,p),
mass(_mass) mass(_mass)
{ } {
}
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
FermionField tmp(psi._grid);
this->DW(psi,tmp,DaggerNo); this->DW(psi,this->tmp(),DaggerNo);
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
} }
} }
@ -87,8 +88,8 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
// Flops = 9*12*Ls*vol/2 // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
} }
@ -110,12 +111,11 @@ template<class Impl>
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
FermionField tmp(psi._grid);
this->DW(psi,tmp,DaggerYes); this->DW(psi,this->tmp(),DaggerYes);
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
} }
} }
template<class Impl> template<class Impl>
@ -138,6 +138,7 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
lower[0] =-mass*lower[0]; lower[0] =-mass*lower[0];
M5D(psi,psi,Din,lower,diag,upper); M5D(psi,psi,Din,lower,diag,upper);
} }
// FIXME Redunant with the above routine; check this and eliminate
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi) template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
@ -259,36 +260,33 @@ template<class Impl>
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
FermionField tmp(psi._grid);
Meooe5D(psi,tmp); Meooe5D(psi,this->tmp());
if ( psi.checkerboard == Odd ) { if ( psi.checkerboard == Odd ) {
this->DhopEO(tmp,chi,DaggerNo); this->DhopEO(this->tmp(),chi,DaggerNo);
} else { } else {
this->DhopOE(tmp,chi,DaggerNo); this->DhopOE(this->tmp(),chi,DaggerNo);
} }
} }
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
{ {
FermionField tmp(psi._grid);
// Apply 4d dslash // Apply 4d dslash
if ( psi.checkerboard == Odd ) { if ( psi.checkerboard == Odd ) {
this->DhopEO(psi,tmp,DaggerYes); this->DhopEO(psi,this->tmp(),DaggerYes);
} else { } else {
this->DhopOE(psi,tmp,DaggerYes); this->DhopOE(psi,this->tmp(),DaggerYes);
} }
MeooeDag5D(tmp,chi); MeooeDag5D(this->tmp(),chi);
} }
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
FermionField tmp(psi._grid); Meo5D(psi,this->tmp());
Meo5D(psi,tmp);
// Apply 4d dslash fragment // Apply 4d dslash fragment
this->DhopDir(tmp,chi,dir,disp); this->DhopDir(this->tmp(),chi,dir,disp);
} }
// force terms; five routines; default to Dhop on diagonal // force terms; five routines; default to Dhop on diagonal
template<class Impl> template<class Impl>
@ -459,9 +457,91 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j]; for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
dee[Ls-1] += delta_d; dee[Ls-1] += delta_d;
} }
int inv=1;
this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
} }
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> > & Matp,
Vector<iSinglet<Simd> > & Matm)
{
int Ls=this->Ls;
GridBase *grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if ( LLs == Ls ) return; // Not vectorised in 5th direction
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = bee[s];
Pminus(s,s)= bee[s];
}
for(int s=0;s<Ls-1;s++){
Pminus(s,s+1) = -cee[s];
}
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd=Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0;s2<Ls;s2++){
for(int s1=0;s1<LLs;s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type *)&Vp;
scalar_type *sm = (scalar_type *)&Vm;
for(int l=0;l<Nsimd;l++){
if ( switcheroo<Coeff_t>::iscomplex() ) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
FermOpTemplateInstantiate(CayleyFermion5D); FermOpTemplateInstantiate(CayleyFermion5D);
GparityFermOpTemplateInstantiate(CayleyFermion5D); GparityFermOpTemplateInstantiate(CayleyFermion5D);

View File

@ -33,6 +33,31 @@ namespace Grid {
namespace QCD { namespace QCD {
template<typename T> struct switcheroo {
static inline int iscomplex() { return 0; }
template<class vec>
static inline vec mult(vec a, vec b) {
return real_mult(a,b);
}
};
template<> struct switcheroo<ComplexD> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<> struct switcheroo<ComplexF> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<class Impl> template<class Impl>
class CayleyFermion5D : public WilsonFermion5D<Impl> class CayleyFermion5D : public WilsonFermion5D<Impl>
{ {
@ -75,7 +100,19 @@ namespace Grid {
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper); std::vector<Coeff_t> &upper);
void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
void MooeeInternalAsm(const FermionField &in, FermionField &out,
int LLs, int site,
Vector<iSinglet<Simd> > &Matp,
Vector<iSinglet<Simd> > &Matm);
void MooeeInternalZAsm(const FermionField &in, FermionField &out,
int LLs, int site,
Vector<iSinglet<Simd> > &Matp,
Vector<iSinglet<Simd> > &Matm);
virtual void Instantiatable(void)=0; virtual void Instantiatable(void)=0;
@ -112,6 +149,12 @@ namespace Grid {
std::vector<Coeff_t> ueem; std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee; std::vector<Coeff_t> dee;
// Matrices of 5d ee inverse params
Vector<iSinglet<Simd> > MatpInv;
Vector<iSinglet<Simd> > MatmInv;
Vector<iSinglet<Simd> > MatpInvDag;
Vector<iSinglet<Simd> > MatmInvDag;
// Constructors // Constructors
CayleyFermion5D(GaugeField &_Umu, CayleyFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -29,13 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/Eigen/Dense>
#include <Grid.h> #include <Grid.h>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD { /*
/*
* Dense matrix versions of routines * Dense matrix versions of routines
*/ */
template<class Impl> template<class Impl>
@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
for(int v=0;v<LLs;v++){ for(int v=0;v<LLs;v++){
vprefetch(psi[ss+v+LLs]); vprefetch(psi[ss+v+LLs]);
// vprefetch(phi[ss+v+LLs]);
int vp= (v==LLs-1) ? 0 : v+1; int vp= (v==LLs-1) ? 0 : v+1;
int vm= (v==0 ) ? LLs-1 : v-1; int vm= (v==0 ) ? LLs-1 : v-1;
@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
Simd hm_11 = psi[ss+vm]()(1)(1); Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2); Simd hm_12 = psi[ss+vm]()(1)(2);
// if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
// if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
if ( vp<=v ) { if ( vp<=v ) {
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
} }
/* // Can force these to real arithmetic and save 2x.
if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl; Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl; Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl; Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl; Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
*/ Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
Simd p_00 = d[v]()()() * phi[ss+v]()(0)(0) + l[v]()()()*hm_00; Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
Simd p_01 = d[v]()()() * phi[ss+v]()(0)(1) + l[v]()()()*hm_01; Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
Simd p_02 = d[v]()()() * phi[ss+v]()(0)(2) + l[v]()()()*hm_02; Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
Simd p_10 = d[v]()()() * phi[ss+v]()(1)(0) + l[v]()()()*hm_10; Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
Simd p_11 = d[v]()()() * phi[ss+v]()(1)(1) + l[v]()()()*hm_11; Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
Simd p_12 = d[v]()()() * phi[ss+v]()(1)(2) + l[v]()()()*hm_12; Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
Simd p_20 = d[v]()()() * phi[ss+v]()(2)(0) + u[v]()()()*hp_00; Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
Simd p_21 = d[v]()()() * phi[ss+v]()(2)(1) + u[v]()()()*hp_01;
Simd p_22 = d[v]()()() * phi[ss+v]()(2)(2) + u[v]()()()*hp_02;
Simd p_30 = d[v]()()() * phi[ss+v]()(3)(0) + u[v]()()()*hp_10;
Simd p_31 = d[v]()()() * phi[ss+v]()(3)(1) + u[v]()()()*hp_11;
Simd p_32 = d[v]()()() * phi[ss+v]()(3)(2) + u[v]()()()*hp_12;
// if ( ss==0){
/*
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
}
*/
vstream(chi[ss+v]()(0)(0),p_00); vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01); vstream(chi[ss+v]()(0)(1),p_01);
vstream(chi[ss+v]()(0)(2),p_02); vstream(chi[ss+v]()(0)(2),p_02);
@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
M5Dtime-=usecond(); M5Dtime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm; alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp; alignas(64) SiteSpinor fp;
@ -287,9 +260,504 @@ PARALLEL_FOR_LOOP
chi[ss+v] = chi[ss+v] +l[v]*fm; chi[ss+v] = chi[ss+v] +l[v]*fm;
} }
#else
for(int v=0;v<LLs;v++){
vprefetch(psi[ss+v+LLs]);
int vp= (v==LLs-1) ? 0 : v+1;
int vm= (v==0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if ( vp<=v ) {
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if ( vm>=v ) {
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01);
vstream(chi[ss+v]()(0)(2),p_02);
vstream(chi[ss+v]()(1)(0),p_10);
vstream(chi[ss+v]()(1)(1),p_11);
vstream(chi[ss+v]()(1)(2),p_12);
vstream(chi[ss+v]()(2)(0),p_20);
vstream(chi[ss+v]()(2)(1),p_21);
vstream(chi[ss+v]()(2)(2),p_22);
vstream(chi[ss+v]()(3)(0),p_30);
vstream(chi[ss+v]()(3)(1),p_31);
vstream(chi[ss+v]()(3)(2),p_32);
}
#endif
} }
M5Dtime+=usecond(); M5Dtime+=usecond();
} }
#ifdef AVX512
#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>
#include <simd/Intel512single.h>
#endif
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
int LLs, int site,
Vector<iSinglet<Simd> > &Matp,
Vector<iSinglet<Simd> > &Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
int s=s2+l*LLs;
int lex=s2+LLs*site;
if ( s2==0 && l==0) {
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1+LLs*site;
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
int lex=s2+LLs*site;
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t)&psi[lex];
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
if ( (s2+l)==0 ) {
asm (
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM (0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM (0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM (0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM (0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM (0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM (0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
VMULMEM (0,%1,BCAST8,Chi_22)
VMULMEM (0,%1,BCAST9,Chi_30)
VMULMEM (0,%1,BCAST10,Chi_31)
VMULMEM (0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm (
VBCASTCDUP(0,%2,BCAST0) VMADDMEM (0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM (0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM (0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM (0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM (0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM (0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM (0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM (0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM (0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM (0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
int s=s2+l*LLs;
int lex=s2+LLs*site;
if ( s2==0 && l==0) {
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
}}
}}
{
int lex = s1+LLs*site;
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %zmm0
#define Chi_01 %zmm1
#define Chi_02 %zmm2
#define Chi_10 %zmm3
#define Chi_11 %zmm4
#define Chi_12 %zmm5
#define Chi_20 %zmm6
#define Chi_21 %zmm7
#define Chi_22 %zmm8
#define Chi_30 %zmm9
#define Chi_31 %zmm10
#define Chi_32 %zmm11
#define pChi_00 %%zmm0
#define pChi_01 %%zmm1
#define pChi_02 %%zmm2
#define pChi_10 %%zmm3
#define pChi_11 %%zmm4
#define pChi_12 %%zmm5
#define pChi_20 %%zmm6
#define pChi_21 %%zmm7
#define pChi_22 %%zmm8
#define pChi_30 %%zmm9
#define pChi_31 %%zmm10
#define pChi_32 %%zmm11
#define BCAST_00 %zmm12
#define SHUF_00 %zmm13
#define BCAST_01 %zmm14
#define SHUF_01 %zmm15
#define BCAST_02 %zmm16
#define SHUF_02 %zmm17
#define BCAST_10 %zmm18
#define SHUF_10 %zmm19
#define BCAST_11 %zmm20
#define SHUF_11 %zmm21
#define BCAST_12 %zmm22
#define SHUF_12 %zmm23
#define Mp %zmm24
#define Mps %zmm25
#define Mm %zmm26
#define Mms %zmm27
#define N 8
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
int lex=s2+LLs*site;
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t)&psi[lex];
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
if ( (s2+l)==0 ) {
LOAD64(%r8,a0);
LOAD64(%r9,a1);
LOAD64(%r10,a2);
asm (
VLOAD(0,%r8,Mp)// i r
VLOAD(0,%r9,Mm)
VSHUF(Mp,Mps) // r i
VSHUF(Mm,Mms)
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
VMULIDUP(0*N,%r10,Mps,Chi_00)
VMULIDUP(1*N,%r10,Mps,Chi_01)
VMULIDUP(2*N,%r10,Mps,Chi_02)
VMULIDUP(3*N,%r10,Mps,Chi_10)
VMULIDUP(4*N,%r10,Mps,Chi_11)
VMULIDUP(5*N,%r10,Mps,Chi_12)
VMULIDUP(6*N ,%r10,Mms,Chi_20)
VMULIDUP(7*N ,%r10,Mms,Chi_21)
VMULIDUP(8*N ,%r10,Mms,Chi_22)
VMULIDUP(9*N ,%r10,Mms,Chi_30)
VMULIDUP(10*N,%r10,Mms,Chi_31)
VMULIDUP(11*N,%r10,Mms,Chi_32)
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
);
} else {
LOAD64(%r8,a0);
LOAD64(%r9,a1);
LOAD64(%r10,a2);
asm (
VLOAD(0,%r8,Mp)
VSHUF(Mp,Mps)
VLOAD(0,%r9,Mm)
VSHUF(Mm,Mms)
VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir
VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir)
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr)
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
);
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;
/*
SiteSpinor tmp;
asm (
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
: : "r" ((uint64_t)&tmp) : "memory" );
*/
asm (
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
// if ( 1 || (site==0) ) {
// std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
// }
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{ {
@ -299,108 +767,41 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
chi.checkerboard=psi.checkerboard; chi.checkerboard=psi.checkerboard;
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); Vector<iSinglet<Simd> > Matp;
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); Vector<iSinglet<Simd> > Matm;
Vector<iSinglet<Simd> > *_Matp;
Vector<iSinglet<Simd> > *_Matm;
for(int s=0;s<Ls;s++){ // MooeeInternalCompute(dag,inv,Matp,Matm);
Pplus(s,s) = bee[s]; if ( inv && dag ) {
Pminus(s,s)= bee[s]; _Matp = &MatpInvDag;
_Matm = &MatmInvDag;
} }
if ( inv && (!dag) ) {
for(int s=0;s<Ls-1;s++){ _Matp = &MatpInv;
Pminus(s,s+1) = -cee[s]; _Matm = &MatmInv;
}
if ( !inv ) {
MooeeInternalCompute(dag,inv,Matp,Matm);
_Matp = &Matp;
_Matm = &Matm;
} }
assert(_Matp->size()==Ls*LLs);
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd=Simd::Nsimd();
Vector<iSinglet<Simd> > Matp(Ls*LLs);
Vector<iSinglet<Simd> > Matm(Ls*LLs);
for(int s2=0;s2<Ls;s2++){
for(int s1=0;s1<LLs;s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type *)&Vp;
scalar_type *sm = (scalar_type *)&Vm;
for(int l=0;l<Nsimd;l++){
sp[l] = PplusMat (l*istride+s1*ostride ,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}
}
MooeeInvCalls++; MooeeInvCalls++;
MooeeInvTime-=usecond(); MooeeInvTime-=usecond();
// Dynamic allocate on stack to get per thread without serialised heap acces
#pragma omp parallel
{
Vector<SiteHalfSpinor> SitePplus(LLs); if ( switcheroo<Coeff_t>::iscomplex() ) {
Vector<SiteHalfSpinor> SitePminus(LLs); PARALLEL_FOR_LOOP
Vector<SiteHalfSpinor> SiteChiP(LLs); for(auto site=0;site<vol;site++){
Vector<SiteHalfSpinor> SiteChiM(LLs); MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
Vector<SiteSpinor> SiteChi(LLs);
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
#pragma omp for
for(auto site=0;site<vol;site++){
for(int s=0;s<LLs;s++){
int lex = s+LLs*site;
spProj5p(SitePplus[s] ,psi[lex]);
spProj5m(SitePminus[s],psi[lex]);
SiteChiP[s]=zero;
SiteChiM[s]=zero;
} }
} else {
int s=0; PARALLEL_FOR_LOOP
for(int l=0; l<Simd::Nsimd();l++){ // simd lane for(auto site=0;site<vol;site++){
for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
vbroadcast(BcastP,SitePplus [s2],l);
vbroadcast(BcastM,SitePminus[s2],l);
for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
}
s++;
}}
for(int s=0;s<LLs;s++){
int lex = s+LLs*site;
spRecon5p(SiteChi[s],SiteChiP[s]);
accumRecon5m(SiteChi[s],SiteChiM[s]);
chi[lex] = SiteChi[s]*0.5;
} }
} }
}
MooeeInvTime+=usecond(); MooeeInvTime+=usecond();
} }
@ -414,4 +815,5 @@ template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const Fermion
template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
}} }}

View File

@ -48,6 +48,8 @@ namespace Grid {
FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {}; FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
virtual FermionField &tmp(void) = 0;
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

View File

@ -61,7 +61,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
LebesgueEvenOdd(_cbgrid), LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd(&Hgrid) { UmuOdd(&Hgrid),
_tmp(&Hgrid)
{
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
} }

View File

@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
GridBase *FermionGrid(void) { return _grid; } GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; } GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent

View File

@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
UmuEven(_FourDimRedBlackGrid), UmuEven(_FourDimRedBlackGrid),
UmuOdd (_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid), Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid) LebesgueEvenOdd(_FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid)
{ {
if (Impl::LsVectorised) { if (Impl::LsVectorised) {

View File

@ -74,6 +74,9 @@ namespace QCD {
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
PmuStat stat; PmuStat stat;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
void Report(void); void Report(void);
void ZeroCounters(void); void ZeroCounters(void);
double DhopCalls; double DhopCalls;

View File

@ -83,12 +83,7 @@ namespace Grid {
typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
write(const std::string& s, const U &output); write(const std::string& s, const U &output);
template <typename U> template <typename U>
typename std::enable_if<std::is_enum<U>::value, void>::type typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
write(const std::string& s, const U &output);
template <typename U>
typename std::enable_if<
!(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
void>::type
write(const std::string& s, const U &output); write(const std::string& s, const U &output);
private: private:
T *upcast; T *upcast;
@ -107,12 +102,7 @@ namespace Grid {
typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
read(const std::string& s, U &output); read(const std::string& s, U &output);
template <typename U> template <typename U>
typename std::enable_if<std::is_enum<U>::value, void>::type typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
read(const std::string& s, U &output);
template <typename U>
typename std::enable_if<
!(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
void>::type
read(const std::string& s, U &output); read(const std::string& s, U &output);
protected: protected:
template <typename U> template <typename U>
@ -221,17 +211,7 @@ namespace Grid {
template <typename T> template <typename T>
template <typename U> template <typename U>
typename std::enable_if<std::is_enum<U>::value, void>::type typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
Writer<T>::write(const std::string &s, const U &output)
{
EnumIO<U>::write(*this, s, output);
}
template <typename T>
template <typename U>
typename std::enable_if<
!(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
void>::type
Writer<T>::write(const std::string &s, const U &output) Writer<T>::write(const std::string &s, const U &output)
{ {
upcast->writeDefault(s, output); upcast->writeDefault(s, output);
@ -266,17 +246,7 @@ namespace Grid {
template <typename T> template <typename T>
template <typename U> template <typename U>
typename std::enable_if<std::is_enum<U>::value, void>::type typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
Reader<T>::read(const std::string &s, U &output)
{
EnumIO<U>::read(*this, s, output);
}
template <typename T>
template <typename U>
typename std::enable_if<
!(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
void>::type
Reader<T>::read(const std::string &s, U &output) Reader<T>::read(const std::string &s, U &output)
{ {
upcast->readDefault(s, output); upcast->readDefault(s, output);
@ -300,7 +270,6 @@ namespace Grid {
abort(); abort();
} }
} }
} }
#endif #endif

View File

@ -114,35 +114,33 @@ THE SOFTWARE.
#define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B); #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);
#define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...) \ #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...) \
\ \
\ \
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \
\ \
\ \
template <typename T>\ template <typename T>\
static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \ static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
push(WR,s);\ push(WR,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
pop(WR);\ pop(WR);\
} \ } \
\ \
\ \
template <typename T>\ template <typename T>\
static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \ static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
push(RD,s);\ push(RD,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
pop(RD);\ pop(RD);\
} \ } \
\ \
\ \
friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
os<<"class "<<#cname<<" {"<<std::endl;\ os<<"class "<<#cname<<" {"<<std::endl;\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__)) \
os<<"}"; \ os<<"}"; \
return os;\ return os;\
}; };
#define GRID_ENUM_TYPE(obj) std::remove_reference<decltype(obj)>::type #define GRID_ENUM_TYPE(obj) std::remove_reference<decltype(obj)>::type
#define GRID_MACRO_ENUMVAL(A,B) A = B, #define GRID_MACRO_ENUMVAL(A,B) A = B,
@ -150,44 +148,52 @@ THE SOFTWARE.
#define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;} #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;}
#define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break; #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break;
namespace Grid {
template <typename U>
class EnumIO {};
}
#define GRID_SERIALIZABLE_ENUM(name,undefname,...)\ #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\
enum class name {\ class name: public Serializable\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ {\
undefname = -1\ public:\
enum EnumType\
{\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\
undefname = -1\
};\ };\
public:\
name(void): value_(undefname) {};\
name(EnumType value): value_(value) {};\
template <typename T>\
static inline void write(Writer<T> &WR,const std::string &s, const name &obj)\
{\
switch (obj.value_)\
{\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
default: Grid::write(WR,s,#undefname); break;\
}\
}\
\ \
template<>\ template <typename T>\
class EnumIO<name> {\ static inline void read(Reader<T> &RD,const std::string &s, name &obj)\
public:\ {\
template <typename T>\ std::string buf;\
static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \ Grid::read(RD, s, buf);\
switch (obj) {\ if (buf == #undefname) {obj = name::undefname;}\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
default: Grid::write(WR,s,#undefname); break;\ else {obj = name::undefname;}\
}\ }\
}\ inline operator EnumType(void) const\
\ {\
template <typename T>\ return value_;\
static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \ }\
std::string buf;\ inline friend std::ostream & operator<<(std::ostream &os, const name &obj)\
Grid::read(RD, s, buf);\ {\
if (buf == #undefname) {obj = name::undefname;}\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
else {obj = name::undefname;}\
}\
};\
\
inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
switch (obj) {\ switch (obj) {\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
default: os << #undefname; break;\ default: os << #undefname; break;\
}\ }\
return os;\ return os;\
}; }\
private:\
EnumType value_;\
};
#endif #endif

View File

@ -213,6 +213,29 @@ namespace Optimization {
} }
}; };
struct MultRealPart{
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m256d operator()(__m256d a, __m256d b){
__m256d ymm0;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m256 operator()(__m256 a, __m256 b, __m256 c){
__m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar,
return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);
}
inline __m256d operator()(__m256d a, __m256d b, __m256d c){
__m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);
}
};
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline __m256 operator()(__m256 a, __m256 b){ inline __m256 operator()(__m256 a, __m256 b){
@ -627,7 +650,9 @@ namespace Optimization {
typedef Optimization::Sub SubSIMD; typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD; typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;

View File

@ -189,6 +189,29 @@ namespace Optimization {
// 2mul,4 mac +add+sub = 8 flop type insns // 2mul,4 mac +add+sub = 8 flop type insns
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
struct MultRealPart{
inline __m512 operator()(__m512 a, __m512 b){
__m512 ymm0;
ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m512d operator()(__m512d a, __m512d b){
__m512d ymm0;
ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m512 operator()(__m512 a, __m512 b, __m512 c){
__m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_fmadd_ps( ymm0, b, c);
}
inline __m512d operator()(__m512d a, __m512d b, __m512d c){
__m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
return _mm512_fmadd_pd( ymm0, b, c);
}
};
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline __m512 operator()(__m512 a, __m512 b){ inline __m512 operator()(__m512 a, __m512 b){
@ -501,6 +524,8 @@ namespace Optimization {
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD; typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;

View File

@ -224,6 +224,21 @@ namespace Optimization {
#define cmul(a, b, c, i)\ #define cmul(a, b, c, i)\
c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ c[i] = a[i]*b[i] - a[i+1]*b[i+1];\
c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
struct MultRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a[2*i]*b[2*i];
out.v[2*i+1] = a[2*i]*b[2*i+1];
}
return out;
};
};
struct MultComplex{ struct MultComplex{
// Complex // Complex
@ -456,6 +471,7 @@ namespace Optimization {
typedef Optimization::Div DivSIMD; typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;

View File

@ -220,6 +220,14 @@ namespace Optimization {
} }
}; };
struct MultRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
// return vec_xmul(b, a);
return vec_xmul(a, b);
}
FLOAT_WRAP_2(operator(), inline)
};
struct MultComplex{ struct MultComplex{
// Complex double // Complex double
inline vector4double operator()(vector4double a, vector4double b){ inline vector4double operator()(vector4double a, vector4double b){
@ -430,6 +438,7 @@ typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD; typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;

View File

@ -177,6 +177,29 @@ namespace Optimization {
} }
}; };
struct MultRealPart{
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0;
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m128 operator()(__m128 a, __m128 b, __m128 c){
__m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
}
inline __m128d operator()(__m128d a, __m128d b, __m128d c){
__m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
}
};
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline __m128 operator()(__m128 a, __m128 b){ inline __m128 operator()(__m128 a, __m128 b){
@ -325,9 +348,11 @@ namespace Optimization {
} }
} }
#ifndef _mm_alignr_epi64
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
#endif
template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
@ -415,6 +440,8 @@ namespace Optimization {
typedef Optimization::Div DivSIMD; typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;

View File

@ -101,6 +101,11 @@ template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integr
// general forms to allow for vsplat syntax // general forms to allow for vsplat syntax
// need explicit declaration of types when used since // need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes // clang cannot automatically determine the output type sometimes
template <class Out, class Input1, class Input2, class Input3, class Operation>
Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
return op(src_1, src_2, src_3);
}
template <class Out, class Input1, class Input2, class Operation> template <class Out, class Input1, class Input2, class Operation>
Out binary(Input1 src_1, Input2 src_2, Operation op) { Out binary(Input1 src_1, Input2 src_2, Operation op) {
return op(src_1, src_2); return op(src_1, src_2);
@ -178,6 +183,7 @@ class Grid_simd {
const Grid_simd *__restrict__ r) { const Grid_simd *__restrict__ r) {
*y = (*l) * (*r); *y = (*l) * (*r);
} }
friend inline void sub(Grid_simd *__restrict__ y, friend inline void sub(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) { const Grid_simd *__restrict__ r) {
@ -188,7 +194,6 @@ class Grid_simd {
const Grid_simd *__restrict__ r) { const Grid_simd *__restrict__ r) {
*y = (*l) + (*r); *y = (*l) + (*r);
} }
friend inline void mac(Grid_simd *__restrict__ y, friend inline void mac(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ a, const Scalar_type *__restrict__ a,
const Grid_simd *__restrict__ x) { const Grid_simd *__restrict__ x) {
@ -260,7 +265,7 @@ class Grid_simd {
} }
//////////////////////////// ////////////////////////////
// opreator scalar * simd // operator scalar * simd
//////////////////////////// ////////////////////////////
friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
Grid_simd va; Grid_simd va;
@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src; S* typepun =(S*) &src;
vsplat(ret,typepun[lane]); vsplat(ret,typepun[lane]);
} }
template <class S, class V, IfComplex<S> =0>
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}
/////////////////////// ///////////////////////
// Splat // Splat
@ -449,6 +459,10 @@ template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) { inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), imag(c)); vsplat(ret, real(c), imag(c));
} }
template <class S, class V>
inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), real(c));
}
// if real fill with a, if complex fill with a in the real part (first function // if real fill with a, if complex fill with a in the real part (first function
// above) // above)
@ -550,6 +564,21 @@ inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
return ret; return ret;
}; };
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
return ret;
};
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
Grid_simd<S, V> ret;
ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
return ret;
};
// Distinguish between complex types and others // Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0> template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) { inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {

View File

@ -95,10 +95,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2 #define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n" #define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" #define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" #define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" #define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" #define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST)
#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST)
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
@ -106,11 +110,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n" #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"

View File

@ -87,7 +87,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI1d(A,ACC,tmp) \
VACCTIMESMINUSI2d(A,ACC,tmp) VACCTIMESMINUSI2d(A,ACC,tmp)
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); #define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr) LOAD64i(A,ptr) #define LOAD64(A,ptr) LOAD64i(A,ptr)
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
@ -108,8 +109,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n" // "clevict0 "#O"*64("#A");\n"
#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n" #define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n" #define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n" #define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n" #define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
@ -143,8 +144,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else #else
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif #endif
// Swaps Re/Im ; could unify this with IMCI // Swaps Re/Im ; could unify this with IMCI

View File

@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum) #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum) #define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP #undef VMADDSUBRDUP
#undef VMADDSUBIDUP #undef VMADDSUBIDUP
#undef VMULRDUP #undef VMULRDUP
#undef VMULIDUP #undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)

View File

@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum) #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP #undef VMADDSUBRDUP
#undef VMADDSUBIDUP #undef VMADDSUBIDUP
#undef VMULRDUP #undef VMULRDUP
#undef VMULIDUP #undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)