mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-13 04:37:05 +01:00
Merge remote-tracking branch 'upstream/master'
Conflicts: lib/simd/Grid_vector_types.h tests/Makefile.am
This commit is contained in:
@ -86,7 +86,7 @@ namespace Grid {
|
||||
// Common parsing chores
|
||||
std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
|
||||
bool GridCmdOptionExists(char** begin, char** end, const std::string& option);
|
||||
void GridParseIntVector(std::string &str,std::vector<int> & vec);
|
||||
std::string GridCmdVectorIntToString(const std::vector<int> & vec);
|
||||
|
||||
void GridParseLayout(char **argv,int argc,
|
||||
std::vector<int> &latt,
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <iostream>
|
||||
#include <Grid.h>
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
|
||||
#undef __X86_64
|
||||
#define MAC
|
||||
@ -111,6 +112,11 @@ void GridParseLayout(char **argv,int argc,
|
||||
|
||||
}
|
||||
|
||||
std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
||||
std::ostringstream oss;
|
||||
std::copy(vec.begin(), vec.end(),std::ostream_iterator<int>(oss, " "));
|
||||
return oss.str();
|
||||
}
|
||||
/////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////
|
||||
void Grid_init(int *argc,char ***argv)
|
||||
@ -120,6 +126,15 @@ void Grid_init(int *argc,char ***argv)
|
||||
#endif
|
||||
// Parse command line args.
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
|
||||
std::cout<<"--help : this message"<<std::endl;
|
||||
std::cout<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
|
||||
std::cout<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
|
||||
std::cout<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||
std::cout<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
|
||||
std::cout<<"--omp n : default number of OMP threads"<<std::endl;
|
||||
std::cout<<"--grid n.n.n.n : default Grid size"<<std::endl;
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
||||
Grid_debug_handler_init();
|
||||
}
|
||||
@ -129,6 +144,15 @@ void Grid_init(int *argc,char ***argv)
|
||||
GridParseLayout(*argv,*argc,
|
||||
Grid_default_latt,
|
||||
Grid_default_mpi);
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||
std::cout<<"Grid Decomposition\n";
|
||||
std::cout<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||
std::cout<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
||||
std::cout<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
||||
std::cout<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
|
||||
std::cout<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
|
||||
std::cout<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -50,15 +50,20 @@ namespace Grid {
|
||||
typedef std::complex<Real> Complex;
|
||||
|
||||
inline RealF adj(const RealF & r){ return r; }
|
||||
inline RealF conj(const RealF & r){ return r; }
|
||||
inline RealF conjugate(const RealF & r){ return r; }
|
||||
inline RealF real(const RealF & r){ return r; }
|
||||
|
||||
inline RealD adj(const RealD & r){ return r; }
|
||||
inline RealD conj(const RealD & r){ return r; }
|
||||
inline RealD conjugate(const RealD & r){ return r; }
|
||||
inline RealD real(const RealD & r){ return r; }
|
||||
|
||||
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
|
||||
inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
|
||||
inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
|
||||
inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
|
||||
inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
|
||||
|
||||
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
|
||||
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
|
||||
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
|
||||
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
|
||||
|
||||
@ -70,15 +75,14 @@ namespace Grid {
|
||||
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
|
||||
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
|
||||
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
|
||||
inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
|
||||
// conj already supported for complex
|
||||
// conjugate already supported for complex
|
||||
|
||||
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
|
||||
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
|
||||
//conj already supported for complex
|
||||
|
||||
//conjugate already supported for complex
|
||||
|
||||
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
|
||||
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
|
||||
|
@ -7,8 +7,8 @@ namespace Grid {
|
||||
// LinearOperators Take a something and return a something.
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
|
||||
//
|
||||
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
|
||||
//SBase
|
||||
// i) F(a x + b y) = aF(x) + b F(y).
|
||||
// ii) <x|Op|y> = <y|AdjOp|x>^\ast
|
||||
//
|
||||
@ -25,12 +25,13 @@ namespace Grid {
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
|
||||
public:
|
||||
virtual RealD OpAndNorm(const Field &in, Field &out);
|
||||
virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
|
||||
void AdjOp(const Field &in, Field &out) {
|
||||
Op(in,out);
|
||||
};
|
||||
void Op(const Field &in, Field &out) {
|
||||
OpAndNorm(in,out);
|
||||
double n1,n2;
|
||||
OpAndNorm(in,out,n1,n2);
|
||||
};
|
||||
};
|
||||
|
||||
@ -80,8 +81,8 @@ namespace Grid {
|
||||
Matrix &_Mat;
|
||||
public:
|
||||
HermitianOperator(Matrix &Mat): _Mat(Mat) {};
|
||||
RealD OpAndNorm(const Field &in, Field &out){
|
||||
return _Mat.MdagM(in,out);
|
||||
void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){
|
||||
return _Mat.MdagM(in,out,n1,n2);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -18,6 +18,7 @@ public:
|
||||
std::cout << Tolerance<<std::endl;
|
||||
};
|
||||
|
||||
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
|
||||
void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){
|
||||
|
||||
RealD cp,c,a,d,b,ssq,qq,b_pred;
|
||||
@ -61,21 +62,27 @@ public:
|
||||
|
||||
Linop.OpAndNorm(p,mmp,d,qq);
|
||||
|
||||
// std::cout <<std::setprecision(4)<< "ConjugateGradient: d,qq "<<d<< " "<<qq <<std::endl;
|
||||
|
||||
a = c/d;
|
||||
b_pred = a*(a*qq-d)/c;
|
||||
|
||||
cp = axpy_norm(r,mmp,r,-a);
|
||||
// std::cout <<std::setprecision(4)<< "ConjugateGradient: a,bp "<<a<< " "<<b_pred <<std::endl;
|
||||
|
||||
cp = axpy_norm(r,-a,mmp,r);
|
||||
b = cp/c;
|
||||
// std::cout <<std::setprecision(4)<< "ConjugateGradient: cp,b "<<cp<< " "<<b <<std::endl;
|
||||
|
||||
// Fuse these loops ; should be really easy
|
||||
psi= a*p+psi;
|
||||
p = p*b+r;
|
||||
|
||||
std::cout << "Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
|
||||
std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
|
||||
|
||||
// Stopping condition
|
||||
if ( cp <= rsq ) {
|
||||
|
||||
Linop.Op(p,mmp);
|
||||
Linop.Op(psi,mmp);
|
||||
p=mmp-src;
|
||||
|
||||
RealD mmpnorm = sqrt(norm2(mmp));
|
||||
@ -83,8 +90,11 @@ public:
|
||||
RealD srcnorm = sqrt(norm2(src));
|
||||
RealD resnorm = sqrt(norm2(p));
|
||||
RealD true_residual = resnorm/srcnorm;
|
||||
|
||||
std::cout<<"ConjugateGradient: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
|
||||
std::cout<<"ConjugateGradient: true residual is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl;
|
||||
std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
|
||||
|
@ -102,7 +102,7 @@ template <class arg> struct name\
|
||||
|
||||
GridUnopClass(UnarySub,-a);
|
||||
GridUnopClass(UnaryAdj,adj(a));
|
||||
GridUnopClass(UnaryConj,conj(a));
|
||||
GridUnopClass(UnaryConj,conjugate(a));
|
||||
GridUnopClass(UnaryTrace,trace(a));
|
||||
GridUnopClass(UnaryTranspose,transpose(a));
|
||||
|
||||
@ -178,7 +178,7 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con
|
||||
|
||||
GRID_DEF_UNOP(operator -,UnarySub);
|
||||
GRID_DEF_UNOP(adj,UnaryAdj);
|
||||
GRID_DEF_UNOP(conj,UnaryConj);
|
||||
GRID_DEF_UNOP(conjugate,UnaryConj);
|
||||
GRID_DEF_UNOP(trace,UnaryTrace);
|
||||
GRID_DEF_UNOP(transpose,UnaryTranspose);
|
||||
|
||||
|
@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
|
||||
template<class sobj,class vobj> strong_inline
|
||||
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
|
||||
conformable(lhs,rhs);
|
||||
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||
conformable(x,y);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||
vobj tmp = a*lhs._odata[ss];
|
||||
vstream(ret._odata[ss],tmp+rhs._odata[ss]);
|
||||
for(int ss=0;ss<x._grid->oSites();ss++){
|
||||
vobj tmp = a*x._odata[ss]+y._odata[ss];
|
||||
vstream(ret._odata[ss],tmp);
|
||||
}
|
||||
}
|
||||
template<class sobj,class vobj> strong_inline
|
||||
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||
conformable(x,y);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<x._grid->oSites();ss++){
|
||||
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
|
||||
vstream(ret._odata[ss],tmp);
|
||||
}
|
||||
}
|
||||
|
||||
template<class sobj,class vobj> strong_inline
|
||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||
conformable(x,y);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<x._grid->oSites();ss++){
|
||||
vobj tmp = a*x._odata[ss]+y._odata[ss];
|
||||
vstream(ret._odata[ss],tmp);
|
||||
}
|
||||
return norm2(ret);
|
||||
}
|
||||
template<class sobj,class vobj> strong_inline
|
||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||
conformable(x,y);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<x._grid->oSites();ss++){
|
||||
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
|
||||
vstream(ret._odata[ss],tmp);
|
||||
}
|
||||
return norm2(ret); // FIXME implement parallel norm in ss loop
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -9,7 +9,7 @@ namespace Grid {
|
||||
// Functionality:
|
||||
// -=,+=,*=,()
|
||||
// add,+,sub,-,mult,mac,*
|
||||
// adj,conj
|
||||
// adj,conjugate
|
||||
// real,imag
|
||||
// transpose,transposeIndex
|
||||
// trace,traceIndex
|
||||
|
@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP
|
||||
return ret;
|
||||
};
|
||||
|
||||
template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){
|
||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||
Lattice<vobj> ret(lhs._grid);
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||
ret._odata[ss] = conj(lhs._odata[ss]);
|
||||
ret._odata[ss] = conjugate(lhs._odata[ss]);
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
@ -98,26 +98,26 @@ template<class vtype,int N> inline void timesMinusI(iMatrix<vtype,N> &ret,const
|
||||
///////////////////////////////////////////////
|
||||
// Conj function for scalar, vector, matrix
|
||||
///////////////////////////////////////////////
|
||||
template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
|
||||
template<class vtype> inline iScalar<vtype> conjugate(const iScalar<vtype>&r)
|
||||
{
|
||||
iScalar<vtype> ret;
|
||||
ret._internal = conj(r._internal);
|
||||
ret._internal = conjugate(r._internal);
|
||||
return ret;
|
||||
}
|
||||
template<class vtype,int N> inline iVector<vtype,N> conj(const iVector<vtype,N>&r)
|
||||
template<class vtype,int N> inline iVector<vtype,N> conjugate(const iVector<vtype,N>&r)
|
||||
{
|
||||
iVector<vtype,N> ret;
|
||||
for(int i=0;i<N;i++){
|
||||
ret._internal[i] = conj(r._internal[i]);
|
||||
ret._internal[i] = conjugate(r._internal[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&r)
|
||||
template<class vtype,int N> inline iMatrix<vtype,N> conjugate(const iMatrix<vtype,N>&r)
|
||||
{
|
||||
iMatrix<vtype,N> ret;
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
ret._internal[i][j] = conj(r._internal[i][j]);
|
||||
ret._internal[i][j] = conjugate(r._internal[i][j]);
|
||||
}}
|
||||
return ret;
|
||||
}
|
||||
|
@ -21,7 +21,13 @@ const int WilsonMatrix::Tm = 7;
|
||||
class WilsonCompressor {
|
||||
public:
|
||||
int mu;
|
||||
|
||||
int dag;
|
||||
|
||||
WilsonCompressor(int _dag){
|
||||
mu=0;
|
||||
dag=_dag;
|
||||
assert((dag==0)||(dag==1));
|
||||
}
|
||||
void Point(int p) {
|
||||
mu=p;
|
||||
};
|
||||
@ -29,7 +35,11 @@ const int WilsonMatrix::Tm = 7;
|
||||
vHalfSpinColourVector operator () (const vSpinColourVector &in)
|
||||
{
|
||||
vHalfSpinColourVector ret;
|
||||
switch(mu) {
|
||||
int mudag=mu;
|
||||
if (dag) {
|
||||
mudag=(mu+Nd)%(2*Nd);
|
||||
}
|
||||
switch(mudag) {
|
||||
case WilsonMatrix::Xp:
|
||||
spProjXp(ret,in);
|
||||
break;
|
||||
@ -87,48 +97,49 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
|
||||
|
||||
RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
Dhop(in,out);
|
||||
return 0.0;
|
||||
Dhop(in,out,0);
|
||||
out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun
|
||||
return norm2(out);
|
||||
}
|
||||
RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
Dhop(in,out);
|
||||
return 0.0;
|
||||
Dhop(in,out,1);
|
||||
out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
Dhop(in,out);
|
||||
Dhop(in,out,0);
|
||||
out = 0.5*out; // FIXME : scale factor in Dhop
|
||||
}
|
||||
void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
Dhop(in,out);
|
||||
Dhop(in,out,1);
|
||||
}
|
||||
void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
out = (4.0+mass)*in;
|
||||
return ;
|
||||
}
|
||||
void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
out = (1.0/(4.0+mass))*in;
|
||||
return ;
|
||||
}
|
||||
void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
out = (1.0/(4.0+mass))*in;
|
||||
return ;
|
||||
}
|
||||
void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
out = (1.0/(4.0+mass))*in;
|
||||
return ;
|
||||
}
|
||||
|
||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
WilsonCompressor compressor;
|
||||
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<grid->oSites();sss++){
|
||||
|
||||
vHalfSpinColourVector tmp;
|
||||
vHalfSpinColourVector chi;
|
||||
vSpinColourVector result;
|
||||
@ -136,17 +147,15 @@ PARALLEL_FOR_LOOP
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
// int ss = Stencil._LebesgueReorder[sss];
|
||||
int ss = sss;
|
||||
int ssu= ss;
|
||||
|
||||
// Xp
|
||||
offset = Stencil._offsets [Xp][ss];
|
||||
local = Stencil._is_local[Xp][ss];
|
||||
perm = Stencil._permute[Xp][ss];
|
||||
ptype = Stencil._permute_type[Xp];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
ptype = Stencil._permute_type[Xp];
|
||||
if ( local && perm ) {
|
||||
spProjXp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -155,7 +164,6 @@ PARALLEL_FOR_LOOP
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
|
||||
//prefetch(Umu._odata[ssu](Yp));
|
||||
spReconXp(result,Uchi);
|
||||
|
||||
// Yp
|
||||
@ -163,9 +171,7 @@ PARALLEL_FOR_LOOP
|
||||
local = Stencil._is_local[Yp][ss];
|
||||
perm = Stencil._permute[Yp][ss];
|
||||
ptype = Stencil._permute_type[Yp];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
if ( local && perm ) {
|
||||
spProjYp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -174,7 +180,6 @@ PARALLEL_FOR_LOOP
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
|
||||
// prefetch(Umu._odata[ssu](Zp));
|
||||
accumReconYp(result,Uchi);
|
||||
|
||||
// Zp
|
||||
@ -182,9 +187,7 @@ PARALLEL_FOR_LOOP
|
||||
local = Stencil._is_local[Zp][ss];
|
||||
perm = Stencil._permute[Zp][ss];
|
||||
ptype = Stencil._permute_type[Zp];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
if ( local && perm ) {
|
||||
spProjZp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -193,7 +196,6 @@ PARALLEL_FOR_LOOP
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
|
||||
// prefetch(Umu._odata[ssu](Tp));
|
||||
accumReconZp(result,Uchi);
|
||||
|
||||
// Tp
|
||||
@ -201,9 +203,7 @@ PARALLEL_FOR_LOOP
|
||||
local = Stencil._is_local[Tp][ss];
|
||||
perm = Stencil._permute[Tp][ss];
|
||||
ptype = Stencil._permute_type[Tp];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
if ( local && perm ) {
|
||||
spProjTp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -212,7 +212,6 @@ PARALLEL_FOR_LOOP
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
|
||||
// prefetch(Umu._odata[ssu](Xm));
|
||||
accumReconTp(result,Uchi);
|
||||
|
||||
// Xm
|
||||
@ -240,8 +239,7 @@ PARALLEL_FOR_LOOP
|
||||
perm = Stencil._permute[Ym][ss];
|
||||
ptype = Stencil._permute_type[Ym];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
if ( local && perm ) {
|
||||
spProjYm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -257,9 +255,7 @@ PARALLEL_FOR_LOOP
|
||||
local = Stencil._is_local[Zm][ss];
|
||||
perm = Stencil._permute[Zm][ss];
|
||||
ptype = Stencil._permute_type[Zm];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
if ( local && perm ) {
|
||||
spProjZm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -275,9 +271,7 @@ PARALLEL_FOR_LOOP
|
||||
local = Stencil._is_local[Tm][ss];
|
||||
perm = Stencil._permute[Tm][ss];
|
||||
ptype = Stencil._permute_type[Tm];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
if ( local && perm ) {
|
||||
spProjTm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
@ -289,16 +283,175 @@ PARALLEL_FOR_LOOP
|
||||
accumReconTm(result,Uchi);
|
||||
|
||||
vstream(out._odata[ss],result);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
|
||||
void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
|
||||
{
|
||||
return;
|
||||
vHalfSpinColourVector tmp;
|
||||
vHalfSpinColourVector chi;
|
||||
vSpinColourVector result;
|
||||
vHalfSpinColourVector Uchi;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
int ssu= ss;
|
||||
|
||||
// Xp
|
||||
offset = Stencil._offsets [Xm][ss];
|
||||
local = Stencil._is_local[Xm][ss];
|
||||
perm = Stencil._permute[Xm][ss];
|
||||
|
||||
ptype = Stencil._permute_type[Xm];
|
||||
if ( local && perm ) {
|
||||
spProjXp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjXp(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
|
||||
spReconXp(result,Uchi);
|
||||
|
||||
// Yp
|
||||
offset = Stencil._offsets [Ym][ss];
|
||||
local = Stencil._is_local[Ym][ss];
|
||||
perm = Stencil._permute[Ym][ss];
|
||||
ptype = Stencil._permute_type[Ym];
|
||||
if ( local && perm ) {
|
||||
spProjYp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjYp(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
|
||||
accumReconYp(result,Uchi);
|
||||
|
||||
// Zp
|
||||
offset = Stencil._offsets [Zm][ss];
|
||||
local = Stencil._is_local[Zm][ss];
|
||||
perm = Stencil._permute[Zm][ss];
|
||||
ptype = Stencil._permute_type[Zm];
|
||||
if ( local && perm ) {
|
||||
spProjZp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjZp(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
|
||||
accumReconZp(result,Uchi);
|
||||
|
||||
// Tp
|
||||
offset = Stencil._offsets [Tm][ss];
|
||||
local = Stencil._is_local[Tm][ss];
|
||||
perm = Stencil._permute[Tm][ss];
|
||||
ptype = Stencil._permute_type[Tm];
|
||||
if ( local && perm ) {
|
||||
spProjTp(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjTp(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
|
||||
accumReconTp(result,Uchi);
|
||||
|
||||
// Xm
|
||||
offset = Stencil._offsets [Xp][ss];
|
||||
local = Stencil._is_local[Xp][ss];
|
||||
perm = Stencil._permute[Xp][ss];
|
||||
ptype = Stencil._permute_type[Xp];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
spProjXm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjXm(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
|
||||
accumReconXm(result,Uchi);
|
||||
|
||||
|
||||
// Ym
|
||||
offset = Stencil._offsets [Yp][ss];
|
||||
local = Stencil._is_local[Yp][ss];
|
||||
perm = Stencil._permute[Yp][ss];
|
||||
ptype = Stencil._permute_type[Yp];
|
||||
|
||||
if ( local && perm ) {
|
||||
spProjYm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjYm(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
|
||||
accumReconYm(result,Uchi);
|
||||
|
||||
// Zm
|
||||
offset = Stencil._offsets [Zp][ss];
|
||||
local = Stencil._is_local[Zp][ss];
|
||||
perm = Stencil._permute[Zp][ss];
|
||||
ptype = Stencil._permute_type[Zp];
|
||||
if ( local && perm ) {
|
||||
spProjZm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjZm(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
|
||||
accumReconZm(result,Uchi);
|
||||
|
||||
// Tm
|
||||
offset = Stencil._offsets [Tp][ss];
|
||||
local = Stencil._is_local[Tp][ss];
|
||||
perm = Stencil._permute[Tp][ss];
|
||||
ptype = Stencil._permute_type[Tp];
|
||||
if ( local && perm ) {
|
||||
spProjTm(tmp,in._odata[offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else if ( local ) {
|
||||
spProjTm(chi,in._odata[offset]);
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
|
||||
accumReconTm(result,Uchi);
|
||||
|
||||
vstream(out._odata[ss],result);
|
||||
}
|
||||
|
||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
|
||||
{
|
||||
assert((dag==0) ||(dag==1));
|
||||
|
||||
WilsonCompressor compressor(dag);
|
||||
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
|
||||
|
||||
if ( dag ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<grid->oSites();sss++){
|
||||
DhopSiteDag(sss,in,out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<grid->oSites();sss++){
|
||||
DhopSite(sss,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}}
|
||||
|
@ -45,10 +45,9 @@ namespace Grid {
|
||||
virtual void MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
|
||||
|
||||
// non-hermitian hopping term; half cb or both
|
||||
void Dhop(const LatticeFermion &in, LatticeFermion &out);
|
||||
|
||||
// m+4r -1/2 Dhop; both cb's
|
||||
void Dw(const LatticeFermion &in, LatticeFermion &out);
|
||||
void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
|
||||
void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out);
|
||||
void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
|
||||
|
||||
typedef iScalar<iMatrix<vComplex, Nc> > matrix;
|
||||
|
||||
|
@ -32,7 +32,7 @@ namespace Grid {
|
||||
friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
|
||||
friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
|
||||
friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
|
||||
friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
|
||||
friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }
|
||||
|
||||
//////////////////////////////////
|
||||
// Initialise to 1,0,i
|
||||
@ -166,11 +166,11 @@ namespace Grid {
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
friend inline void permute(vComplexD &y,vComplexD b,int perm)
|
||||
{
|
||||
Gpermute<vComplexD>(y,b,perm);
|
||||
}
|
||||
/*
|
||||
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||
{
|
||||
Gmerge<vComplexD,ComplexD >(y,extracted);
|
||||
@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
////////////////////////
|
||||
// Conjugate
|
||||
////////////////////////
|
||||
friend inline vComplexD conj(const vComplexD &in){
|
||||
friend inline vComplexD conjugate(const vComplexD &in){
|
||||
vComplexD ret ; vzero(ret);
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
|
||||
@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
// REDUCE FIXME must be a cleaner implementation
|
||||
friend inline ComplexD Reduce(const vComplexD & in)
|
||||
{
|
||||
#if defined SSE4
|
||||
return ComplexD(in.v[0], in.v[1]); // inefficient
|
||||
#ifdef SSE4
|
||||
return ComplexD(in.v[0],in.v[1]);
|
||||
#endif
|
||||
#if defined(AVX1) || defined (AVX2)
|
||||
vComplexD v1;
|
||||
permute(v1,in,0); // sse 128; paired complex single
|
||||
v1=v1+in;
|
||||
return ComplexD(v1.v[0],v1.v[1]);
|
||||
#endif
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
|
||||
__attribute__ ((aligned(32))) double c_[4];
|
||||
_mm256_store_pd(c_,in.v);
|
||||
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
|
||||
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
};
|
||||
|
||||
|
||||
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
|
||||
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }
|
||||
|
||||
|
||||
typedef vComplexD vDComplex;
|
||||
|
@ -47,7 +47,7 @@ namespace Grid {
|
||||
friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
|
||||
friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
|
||||
|
||||
//////////////////////////////////
|
||||
// Initialise to 1,0,i
|
||||
@ -228,42 +228,25 @@ namespace Grid {
|
||||
ret.v = {a,b,a,b};
|
||||
#endif
|
||||
}
|
||||
friend inline ComplexF Reduce(const vComplexF & in)
|
||||
{
|
||||
friend inline void permute(vComplexF &y,vComplexF b,int perm)
|
||||
{
|
||||
Gpermute<vComplexF>(y,b,perm);
|
||||
}
|
||||
friend inline ComplexF Reduce(const vComplexF & in)
|
||||
{
|
||||
#ifdef SSE4
|
||||
union {
|
||||
cvec v1; // SSE 4 x float vector
|
||||
float f[4]; // scalar array of 4 floats
|
||||
} u128;
|
||||
u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
|
||||
return ComplexF(u128.f[0], u128.f[1]);
|
||||
vComplexF v1;
|
||||
permute(v1,in,0); // sse 128; paired complex single
|
||||
v1=v1+in;
|
||||
return ComplexF(v1.v[0],v1.v[1]);
|
||||
#endif
|
||||
#ifdef AVX1
|
||||
//it would be better passing 2 arguments to saturate the vector lanes
|
||||
union {
|
||||
__m256 v1;
|
||||
float f[8];
|
||||
} u256;
|
||||
//SWAP lanes
|
||||
// FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large
|
||||
__m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1);
|
||||
__m256 t1 = _mm256_permute_ps(in.v , 0b11011000);//real (0,2,1,3)
|
||||
__m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2)
|
||||
t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1)
|
||||
t1 = _mm256_hadd_ps(t0,t0);
|
||||
u256.v1 = _mm256_hadd_ps(t1, t1);
|
||||
return ComplexF(u256.f[0], u256.f[4]);
|
||||
#endif
|
||||
#ifdef AVX2
|
||||
union {
|
||||
__m256 v1;
|
||||
float f[8];
|
||||
} u256;
|
||||
const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0);
|
||||
__m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask);
|
||||
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
|
||||
u256.v1 = _mm256_hadd_ps(tmp2, tmp2);
|
||||
return ComplexF(u256.f[0], u256.f[4]);
|
||||
#if defined(AVX1) || defined (AVX2)
|
||||
vComplexF v1,v2;
|
||||
permute(v1,in,0); // sse 128; paired complex single
|
||||
v1=v1+in;
|
||||
permute(v2,v1,1); // avx 256; quad complex single
|
||||
v1=v1+v2;
|
||||
return ComplexF(v1.v[0],v1.v[1]);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
|
||||
@ -345,13 +328,10 @@ namespace Grid {
|
||||
// Conjugate
|
||||
///////////////////////
|
||||
|
||||
friend inline vComplexF conj(const vComplexF &in){
|
||||
friend inline vComplexF conjugate(const vComplexF &in){
|
||||
vComplexF ret ; vzero(ret);
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
cvec tmp;
|
||||
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
|
||||
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
|
||||
|
||||
ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
|
||||
@ -433,10 +413,6 @@ namespace Grid {
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend inline void permute(vComplexF &y,vComplexF b,int perm)
|
||||
{
|
||||
Gpermute<vComplexF>(y,b,perm);
|
||||
}
|
||||
/*
|
||||
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||
{
|
||||
@ -460,7 +436,7 @@ namespace Grid {
|
||||
|
||||
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
|
||||
{
|
||||
return conj(l)*r;
|
||||
return conjugate(l)*r;
|
||||
}
|
||||
|
||||
inline void zeroit(vComplexF &z){ vzero(z);}
|
||||
|
@ -117,7 +117,7 @@ namespace Grid {
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// mult, sub, add, adj,conj, mac functions
|
||||
// mult, sub, add, adj,conjugate, mac functions
|
||||
///////////////////////////////////////////////
|
||||
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
|
||||
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
|
||||
|
@ -26,7 +26,7 @@ namespace Grid {
|
||||
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
|
||||
friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
|
||||
friend inline vRealD adj(const vRealD &in) { return in; }
|
||||
friend inline vRealD conj(const vRealD &in){ return in; }
|
||||
friend inline vRealD conjugate(const vRealD &in){ return in; }
|
||||
|
||||
friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
|
||||
#if defined (AVX1) || defined (SSE4)
|
||||
@ -112,11 +112,12 @@ namespace Grid {
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
|
||||
friend inline void permute(vRealD &y,vRealD b,int perm)
|
||||
{
|
||||
Gpermute<vRealD>(y,b,perm);
|
||||
}
|
||||
/*
|
||||
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
|
||||
{
|
||||
Gmerge<vRealD,RealD >(y,extracted);
|
||||
@ -209,48 +210,26 @@ namespace Grid {
|
||||
|
||||
friend inline RealD Reduce(const vRealD & in)
|
||||
{
|
||||
#if defined (SSE4)
|
||||
// FIXME Hack
|
||||
const RealD * ptr =(const RealD *) ∈
|
||||
RealD ret = 0;
|
||||
for(int i=0;i<vRealD::Nsimd();i++){
|
||||
ret = ret+ptr[i];
|
||||
}
|
||||
return ret;
|
||||
#ifdef SSE4
|
||||
vRealD v1;
|
||||
permute(v1,in,0); // sse 128; paired real double
|
||||
v1=v1+in;
|
||||
return RealD(v1.v[0]);
|
||||
#endif
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
typedef union {
|
||||
uint64_t l;
|
||||
double d;
|
||||
} my_conv_t;
|
||||
my_conv_t converter;
|
||||
// more reduce_add
|
||||
/*
|
||||
__attribute__ ((aligned(32))) double c_[16];
|
||||
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
|
||||
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
|
||||
tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp = 3+2,1+0,1+0,3+2
|
||||
hadd = _mm256_hadd_pd(tmp,tmp); // tmp = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
|
||||
_mm256_store_pd(c_,hadd);<3B>
|
||||
return c[0]
|
||||
*/
|
||||
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
|
||||
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
|
||||
hadd = _mm256_hadd_pd(hadd,hadd); // hadd = 1+0+3+2...
|
||||
converter.l = _mm256_extract_epi64((ivec)hadd,0);
|
||||
return converter.d;
|
||||
#if defined(AVX1) || defined (AVX2)
|
||||
vRealD v1,v2;
|
||||
permute(v1,in,0); // avx 256; quad double
|
||||
v1=v1+in;
|
||||
permute(v2,v1,1);
|
||||
v1=v1+v2;
|
||||
return v1.v[0];
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return _mm512_reduce_add_pd(in.v);
|
||||
/*
|
||||
__attribute__ ((aligned(32))) double c_[8];
|
||||
_mm512_store_pd(c_,in.v);
|
||||
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
|
||||
*/
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// *=,+=,-= operators
|
||||
inline vRealD &operator *=(const vRealD &r) {
|
||||
@ -270,7 +249,7 @@ namespace Grid {
|
||||
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
|
||||
};
|
||||
|
||||
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
|
||||
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
|
||||
inline void zeroit(vRealD &z){ vzero(z);}
|
||||
|
||||
inline vRealD outerProduct(const vRealD &l, const vRealD& r)
|
||||
|
@ -92,13 +92,13 @@ namespace Grid {
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// mult, sub, add, adj,conj, mac functions
|
||||
// mult, sub, add, adj,conjugate, mac functions
|
||||
///////////////////////////////////////////////
|
||||
friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
|
||||
friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
|
||||
friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
|
||||
friend inline vRealF adj(const vRealF &in) { return in; }
|
||||
friend inline vRealF conj(const vRealF &in){ return in; }
|
||||
friend inline vRealF conjugate(const vRealF &in){ return in; }
|
||||
|
||||
friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
|
||||
#if defined (AVX1) || defined (SSE4)
|
||||
@ -133,11 +133,12 @@ namespace Grid {
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
|
||||
friend inline void permute(vRealF &y,vRealF b,int perm)
|
||||
{
|
||||
Gpermute<vRealF>(y,b,perm);
|
||||
}
|
||||
/*
|
||||
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
|
||||
{
|
||||
Gmerge<vRealF,RealF >(y,extracted);
|
||||
@ -155,7 +156,6 @@ namespace Grid {
|
||||
Gextract<vRealF,RealF>(y,extracted);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Broadcast a value across Nsimd copies.
|
||||
@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){
|
||||
}
|
||||
friend inline RealF Reduce(const vRealF & in)
|
||||
{
|
||||
#if defined (SSE4)
|
||||
// FIXME Hack
|
||||
const RealF * ptr = (const RealF *) ∈
|
||||
RealF ret = 0;
|
||||
for(int i=0;i<vRealF::Nsimd();i++){
|
||||
ret = ret+ptr[i];
|
||||
}
|
||||
return ret;
|
||||
#ifdef SSE4
|
||||
vRealF v1,v2;
|
||||
permute(v1,in,0); // sse 128; quad single
|
||||
v1=v1+in;
|
||||
permute(v2,v1,1);
|
||||
v1=v1+v2;
|
||||
return v1.v[0];
|
||||
#endif
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
__attribute__ ((aligned(32))) float c_[16];
|
||||
__m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
|
||||
__m256 hadd = _mm256_hadd_ps(in.v,tmp);
|
||||
tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
|
||||
hadd = _mm256_hadd_ps(tmp,tmp);
|
||||
_mm256_store_ps(c_,hadd);
|
||||
return (float)c_[0];
|
||||
|
||||
#if defined(AVX1) || defined (AVX2)
|
||||
vRealF v1,v2;
|
||||
permute(v1,in,0); // avx 256; octo-double
|
||||
v1=v1+in;
|
||||
permute(v2,v1,1);
|
||||
v1=v1+v2;
|
||||
permute(v2,v1,2);
|
||||
v1=v1+v2;
|
||||
return v1.v[0];
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return _mm512_reduce_add_ps(in.v);
|
||||
/*
|
||||
__attribute__ ((aligned(64))) float c_[16];
|
||||
_mm512_store_ps(c_,in.v);
|
||||
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
|
||||
+c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
|
||||
*/
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
@ -291,7 +284,7 @@ friend inline void vstore(const vRealF &ret, float *a){
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
||||
};
|
||||
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
|
||||
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
|
||||
inline void zeroit(vRealF &z){ vzero(z);}
|
||||
|
||||
inline vRealF outerProduct(const vRealF &l, const vRealF& r)
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*! @file Grid_vector_types.h
|
||||
@brief Defines templated class Grid_simd to deal with inner vector types
|
||||
*/
|
||||
// Time-stamp: <2015-05-20 17:21:52 neo>
|
||||
// Time-stamp: <2015-05-20 17:31:55 neo>
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef GRID_VECTOR_TYPES
|
||||
#define GRID_VECTOR_TYPES
|
||||
@ -96,8 +96,9 @@ namespace Grid {
|
||||
friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
//not for integer types... FIXME
|
||||
friend inline Grid_simd adj(const Grid_simd &in){ return conj(in); }
|
||||
friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Initialise to 1,0,i for the correct types
|
||||
@ -247,13 +248,13 @@ namespace Grid {
|
||||
// Conjugate
|
||||
///////////////////////
|
||||
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
|
||||
friend inline Grid_simd conj(const Grid_simd &in){
|
||||
friend inline Grid_simd conjugate(const Grid_simd &in){
|
||||
Grid_simd ret ;
|
||||
ret.v = unary<Vector_type>(in.v, ConjSIMD());
|
||||
return ret;
|
||||
}
|
||||
template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 >
|
||||
friend inline Grid_simd conj(const Grid_simd &in){
|
||||
friend inline Grid_simd conjugate(const Grid_simd &in){
|
||||
return in; // for real objects
|
||||
}
|
||||
|
||||
@ -345,7 +346,7 @@ namespace Grid {
|
||||
template<class scalar_type, class vector_type >
|
||||
inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r)
|
||||
{
|
||||
return conj(l)*r;
|
||||
return conjugate(l)*r;
|
||||
}
|
||||
|
||||
template<class scalar_type, class vector_type >
|
||||
|
Reference in New Issue
Block a user