1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

Changes to remove warnings under icc; disambiguate AVX512 from IMCI correctly

and drop swizzles in AVX512. Don't know why these compiled.
This commit is contained in:
Peter Boyle 2015-09-23 05:23:45 -07:00
parent 2f38ebc446
commit 5ef42add2d
22 changed files with 997 additions and 129 deletions

15
configure vendored
View File

@ -1384,9 +1384,9 @@ Optional Features:
--disable-dependency-tracking --disable-dependency-tracking
speeds up one-time build speeds up one-time build
--disable-openmp do not use OpenMP --disable-openmp do not use OpenMP
--enable-simd=SSE4|AVX|AVX2|AVX512|MIC --enable-simd=SSE4|AVX|AVX2|AVX512|IMCI
Select instructions to be SSE4.0, AVX 1.0, AVX Select instructions to be SSE4.0, AVX 1.0, AVX
2.0+FMA, AVX 512, MIC 2.0+FMA, AVX 512, IMCI
--enable-precision=single|double --enable-precision=single|double
Select default word size of Real Select default word size of Real
--enable-comms=none|mpi Select communications --enable-comms=none|mpi Select communications
@ -6414,13 +6414,20 @@ $as_echo "#define AVX2 1" >>confdefs.h
$as_echo "$as_me: WARNING: Your processor does not support AVX2 instructions" >&2;} $as_echo "$as_me: WARNING: Your processor does not support AVX2 instructions" >&2;}
fi fi
;; ;;
AVX512|MIC) AVX512)
echo Configuring for AVX512 and MIC echo Configuring for AVX512
$as_echo "#define AVX512 1" >>confdefs.h $as_echo "#define AVX512 1" >>confdefs.h
supported="cross compilation" supported="cross compilation"
;; ;;
IMCI)
echo Configuring for IMCI
$as_echo "#define IMCI 1" >>confdefs.h
supported="cross compilation"
;;
NEONv8) NEONv8)
echo Configuring for experimental ARMv8a support echo Configuring for experimental ARMv8a support

View File

@ -65,8 +65,8 @@ AC_CHECK_FUNCS([gettimeofday])
#Please install or provide the correct path to your installation #Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)]) #Info at: http://www.mpfr.org/)])
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\ AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\ [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2]) [ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
supported=no supported=no
@ -99,9 +99,14 @@ case ${ac_SIMD} in
AC_MSG_WARN([Your processor does not support AVX2 instructions]) AC_MSG_WARN([Your processor does not support AVX2 instructions])
fi fi
;; ;;
AVX512|MIC) AVX512)
echo Configuring for AVX512 and MIC echo Configuring for AVX512
AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] ) AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
supported="cross compilation"
;;
IMCI)
echo Configuring for IMCI
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
supported="cross compilation" supported="cross compilation"
;; ;;
NEONv8) NEONv8)

View File

@ -17,6 +17,9 @@
#include <algorithms/iterative/ConjugateGradientMultiShift.h> #include <algorithms/iterative/ConjugateGradientMultiShift.h>
// Lanczos support
#include <algorithms/iterative/MatrixUtils.h>
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/CoarsenedMatrix.h> #include <algorithms/CoarsenedMatrix.h>

View File

@ -72,7 +72,6 @@ operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return t
template<typename _Tp> inline bool template<typename _Tp> inline bool
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
}; // namespace Grid }; // namespace Grid
#endif #endif

View File

@ -6,7 +6,7 @@
/* AVX2 Intrinsics */ /* AVX2 Intrinsics */
#undef AVX2 #undef AVX2
/* AVX512 Intrinsics for Knights Corner */ /* AVX512 Intrinsics for Knights Landing */
#undef AVX512 #undef AVX512
/* EMPTY_SIMD only for DEBUGGING */ /* EMPTY_SIMD only for DEBUGGING */
@ -110,6 +110,9 @@
/* Define to 1 if you have the <unistd.h> header file. */ /* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H #undef HAVE_UNISTD_H
/* IMCI Intrinsics for Knights Corner */
#undef IMCI
/* NEON ARMv8 Experimental support */ /* NEON ARMv8 Experimental support */
#undef NEONv8 #undef NEONv8

View File

@ -0,0 +1,388 @@
#ifndef GRID_IRL_H
#define GRID_IRL_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ImplicitlyRestartedLanczos {
public:
int Niter;
int Nk;
int Np;
RealD enorm;
RealD vthr;
LinearOperatorBase<Field> &_Linop;
OperatorFunction<Field> &_poly;
// Constructs the solver and stores its configuration.
//
//   Linop   operator whose spectrum is sought; kept by reference, so it must
//           outlive this object.
//   poly    operator function kept by reference alongside Linop; it must also
//           outlive this object.
//   _Nk     stored in Nk  (presumably the number of wanted eigenpairs -- confirm).
//   _Np     stored in Np  (presumably the number of extra Lanczos vectors -- confirm).
//   _enorm  stored in enorm (presumably the residual tolerance -- confirm).
//   _vthrs  stored in vthr.
//   _Niter  stored in Niter (presumably the restart limit -- confirm).
//
// The initialisers are listed in member declaration order (Niter, Nk, Np,
// enorm, vthr, _Linop, _poly). Members are always initialised in declaration
// order regardless of how the list is written, so matching that order avoids
// the compiler's reorder warning. Every member is now set exactly once.
ImplicitlyRestartedLanczos(
    LinearOperatorBase<Field> &Linop,
    OperatorFunction<Field> & poly,
    int _Nk,
    int _Np,
    RealD _enorm,
    RealD _vthrs,
    int _Niter) :
  Niter(_Niter),
  Nk(_Nk),
  Np(_Np),
  enorm(_enorm),
  vthr(_vthrs),
  _Linop(Linop),
  _poly(poly)
{
}
// Performs Lanczos step number k (asserted to be < Nm).
//
// The operator is applied to evec[k], the result is made orthogonal to the
// previous basis vectors, the projection onto evec[k] is recorded in lmd[k]
// and the norm of what remains in lme[k]. The normalised remainder becomes
// the next basis vector evec[k+1].
//
// NOTE(review): the body reads `w`, `opr_`, `lmd` and `lme`, none of which is
// a parameter of this function or a member declared in this class, while the
// parameters `lmda`, `lmdb` and `f` are never used. Presumably lmd/lme are
// meant to be lmda/lmdb and w is meant to be f -- confirm and reconcile.
// NOTE(review): `a * b` on two Fields is assigned to a RealD, so it is
// presumably an inner product -- confirm against the Field type in use.
void step(Vector<RealD>& lmda,
Vector<RealD>& lmdb,
Vector<Field>& evec,
Field& f,int Nm,int k)
{
assert( k< Nm );
// Apply the operator to the current basis vector.
w = opr_->mult(evec[k]);
if(k==0){ // Initial step
RealD wnorm= w*w;
std::cout<<"wnorm="<<wnorm<<std::endl;
// Remove the component along evec[0]; alph is that component.
RealD alph = evec[k] * w;
w -= alph * evec[k];
lmd[k] = alph;
// beta is the norm of the remainder; the normalised remainder is evec[1].
RealD beta = w * w;
beta = sqrt(beta);
RealD betar = 1.0/beta;
// NOTE(review): evec[k+1] is written without the k < Nm-1 check used in the
// other branch.
evec[k+1] = betar * w;
lme[k] = beta;
} else { // Iteration step
// Remove the component along the previous basis vector first.
w -= lme[k-1] * evec[k-1];
RealD alph = evec[k] * w;
w -= alph * evec[k];
RealD beta = w * w;
beta = sqrt(beta);
RealD betar = 1.0/beta;
w *= betar;
lmd[k] = alph;
lme[k] = beta;
// Re-orthogonalise against evec[0..k-1] before storing the new vector.
orthogonalize(w,evec,k);
// The last step (k == Nm-1) has no slot for a further basis vector.
if(k < Nm-1) evec[k+1] = w;
}
}
// Applies one shifted QR sweep, built from Givens rotations, to the block
// kmin-1 .. kmax-1 of a symmetric tridiagonal matrix, and accumulates the
// same rotations into Qt.
//
//   lmda  diagonal entries, updated in place.
//   lmdb  off-diagonal entries (lmdb[k] couples k and k+1), updated in place.
//   Nk    number of entries per rotated line of Qt (loop bound i < Nk).
//   Nm    leading dimension of Qt; element (i,k) lives at Qt[i+Nm*k].
//   Qt    accumulated rotations, updated in place.
//   Dsft  shift subtracted from the first diagonal entry when forming the
//         first rotation.
//   kmin  1-based index of the first row of the active block.
//   kmax  1-based index of the last row of the active block.
//
// The body previously referred to lmd, lme and Dsh, which are not declared
// anywhere in this function; it now uses the actual parameters lmda, lmdb and
// Dsft. The arithmetic itself is unchanged.
void qr_decomp(Vector<RealD>& lmda,
               Vector<RealD>& lmdb,
               int Nk,
               int Nm,
               Vector<RealD>& Qt,
               RealD Dsft,
               int kmin,
               int kmax)
{
  // ---- First rotation: determined by the shifted leading entry. ----
  int k = kmin-1;
  RealD x;

  RealD Fden = 1.0/sqrt((lmda[k]-Dsft)*(lmda[k]-Dsft) +lmdb[k]*lmdb[k]);
  RealD c = ( lmda[k] -Dsft) *Fden;
  RealD s = -lmdb[k] *Fden;

  RealD tmpa1 = lmda[k];
  RealD tmpa2 = lmda[k+1];
  RealD tmpb  = lmdb[k];

  lmda[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
  lmda[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
  lmdb[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;

  // x is the element pushed outside the tridiagonal band ("bulge"); the
  // following rotations chase it down the matrix.
  x = -s*lmdb[k+1];
  lmdb[k+1] = c*lmdb[k+1];

  for(int i=0; i<Nk; ++i){
    RealD Qtmp1 = Qt[i+Nm*k    ];
    RealD Qtmp2 = Qt[i+Nm*(k+1)];
    Qt[i+Nm*k    ] = c*Qtmp1 - s*Qtmp2;
    Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2;
  }

  // ---- Givens transformations: chase the bulge through the block. ----
  for(int kk = kmin; kk < kmax-1; ++kk){

    RealD Fden2 = 1.0/sqrt( x*x +lmdb[kk-1]*lmdb[kk-1]);
    RealD c2 = lmdb[kk-1]*Fden2;
    RealD s2 = - x*Fden2;

    RealD a1 = lmda[kk];
    RealD a2 = lmda[kk+1];
    RealD b1 = lmdb[kk];

    lmda[kk]   = c2*c2*a1 +s2*s2*a2 -2.0*c2*s2*b1;
    lmda[kk+1] = s2*s2*a1 +c2*c2*a2 +2.0*c2*s2*b1;
    lmdb[kk]   = c2*s2*(a1-a2) +(c2*c2-s2*s2)*b1;
    lmdb[kk-1] = c2*lmdb[kk-1] -s2*x;

    // The last rotation of the block creates no new bulge.
    if(kk != kmax-2){
      x = -s2*lmdb[kk+1];
      lmdb[kk+1] = c2*lmdb[kk+1];
    }

    for(int i=0; i<Nk; ++i){
      RealD Qtmp1 = Qt[i+Nm*kk    ];
      RealD Qtmp2 = Qt[i+Nm*(kk+1)];
      Qt[i+Nm*kk    ] = c2*Qtmp1 -s2*Qtmp2;
      Qt[i+Nm*(kk+1)] = s2*Qtmp1 +c2*Qtmp2;
    }
  }
}
// Diagonalises the leading Nm2 x Nm2 block of a symmetric tridiagonal matrix
// by repeated shifted QR sweeps (qr_decomp), shrinking the active block
// [kmin, kmax] as off-diagonal entries become negligible.
//
//   lmda  diagonal entries; on return they hold the eigenvalues.
//   lmdb  off-diagonal entries; driven to (numerically) zero.
//   Nm2   size of the block to diagonalise.
//   Nm    leading dimension of Qt; also sets the sweep limit 100*Nm.
//   Qt    receives the accumulated rotations.
//
// Prints a message and aborts if the sweep limit is reached.
//
// Changes relative to the previous body:
//  * lmd/lme were undeclared; the parameters lmda/lmdb are used instead.
//  * `Nk` resolved to the class member rather than to the block size passed
//    by the caller; the Nm2 parameter (previously unused) is used instead.
//  * dsub/fabs(dsub) produced NaN when the two trailing diagonal entries were
//    equal; the sign is now taken explicitly (+1 for dsub == 0).
//  * The local sweep limit no longer shadows the member Niter, and the dead
//    store to it before returning is gone.
//  * A block with fewer than two rows is already diagonal and returns at once
//    instead of indexing lmda[-1].
void diagonalize(Vector<RealD>& lmda,
                 Vector<RealD>& lmdb,
                 int Nm2,
                 int Nm,
                 Vector<RealD>& Qt)
{
  if(Nm2 < 2) return;

  const int maxSweeps = 100*Nm;
  int kmin = 1;
  int kmax = Nm2;
  // (this should be more sophisticated)

  for(int iter=0; iter<maxSweeps; ++iter){

    // Shift: the eigenvalue of the trailing 2x2 block that lies on the side
    // of lmda[kmax-1].
    RealD dsub = lmda[kmax-1]-lmda[kmax-2];
    RealD dd   = sqrt(dsub*dsub + 4.0*lmdb[kmax-2]*lmdb[kmax-2]);
    RealD sgn  = (dsub < 0.0) ? -1.0 : 1.0;
    RealD Dsh  = 0.5*(lmda[kmax-2]+lmda[kmax-1] +dd*sgn);

    // One QR sweep over the active block.
    qr_decomp(lmda,lmdb,Nm2,Nm,Qt,Dsh,kmin,kmax);

    // Convergence test, scanning upwards from the bottom: an off-diagonal
    // entry counts as zero when adding it to the neighbouring diagonal
    // magnitudes does not change their floating-point sum. The first entry
    // that is still significant becomes the new bottom of the active block.
    bool allNegligible = true;
    for(int j=kmax-1; j>= kmin; --j){
      RealD dds = fabs(lmda[j-1])+fabs(lmda[j]);
      if(fabs(lmdb[j-1])+dds > dds){
        kmax = j+1;
        allNegligible = false;
        break;
      }
    }
    if(allNegligible) return;

    // Same test from the top to find the new start of the active block.
    for(int j=0; j<kmax-1; ++j){
      RealD dds = fabs(lmda[j])+fabs(lmda[j+1]);
      if(fabs(lmdb[j])+dds > dds){
        kmin = j+1;
        break;
      }
    }
  }
  std::cout << "[QL method] Error - Too many iteration: "<<maxSweeps<<"\n";
  abort();
}
// Removes from w its components along evec[0] .. evec[k-1] (Schmidt
// orthogonalisation), treating w as interleaved real/imaginary values.
//
//   w     vector to orthogonalise, modified in place.
//   evec  basis vectors; only the first k are read.
//   k     number of basis vectors to project out.
//
// NOTE(review): this relies on Field providing size(), operator[] taking a
// std::slice, add(slice, ...) and im_prod(), and on an unqualified `valarray`
// being in scope. None of that is established anywhere in this file --
// confirm against the Field type actually used.
void orthogonalize(Field& w,
const Vector<Field>& evec,
int k)
{
// Schmidt orthogonalization
size_t size = w.size();
assert(size%2 ==0);
// Even positions are selected by `re`, odd positions by `im`.
std::slice re(0,size/2,2);
std::slice im(1,size/2,2);
for(int j=0; j<k; ++j){
// prdr / prdi: presumably the real and imaginary parts of <evec[j], w>.
RealD prdr = evec[j]*w;
RealD prdi = evec[j].im_prod(w);
valarray<RealD> evr(evec[j][re]);
valarray<RealD> evi(evec[j][im]);
// w -= (prdr + i*prdi) * evec[j], written out in real and imaginary parts.
w.add(re, -prdr*evr +prdi*evi);
w.add(im, -prdr*evi -prdi*evr);
}
}
// Driver of the restarted Lanczos iteration.
//
// Builds an initial Lanczos factorisation, then repeatedly extends it to Nm
// vectors, diagonalises the tridiagonal matrix, applies shifted QR
// transformations to compress it again, and tests the resulting Ritz pairs
// for convergence. On success lmd and evec are replaced by the converged
// eigenvalues and vectors; otherwise the process prints a message and aborts.
//
//   lmd    in: work space for the diagonal; out: converged eigenvalues.
//   evec   in: work space for the basis;    out: converged eigenvectors.
//   b      not used by the body.
//   Nsbt   out: Kdis - Kthrs (0 until convergence).
//   Nconv  out: restart iteration at which convergence was detected (-1 until
//          then).
//
// NOTE(review): the body uses Nk_, Np_, Niter_, enorm_, opr_, sort_ and
// setUnit_Qt, none of which is declared in this class (the members are Nk,
// Np, Niter, enorm, vthr, _Linop, _poly). It also uses vector, endl, setw,
// setiosflags, resetiosflags and ios_base without std::. Confirm where these
// are expected to come from.
// NOTE(review): lme, lmd2, lme2 and Qt are declared as `vector<RealD>` but are
// passed to step/diagonalize/qr_decomp, which take `Vector<RealD>&` -- confirm
// the two names denote the same type.
void calc(Vector<RealD>& lmd,
Vector<Field>& evec,
const Field& b,
int& Nsbt,
int& Nconv)
{
const size_t fsize = evec[0].size();
Nconv = -1;
Nsbt = 0;
// Nm is the full size of the Lanczos basis.
int Nm = Nk_+Np_;
std::cout << " -- Nk = " << Nk_ << " Np = "<< Np_ << endl;
std::cout << " -- Nm = " << Nm << endl;
std::cout << " -- size of lmd = " << lmd.size() << endl;
std::cout << " -- size of evec = " << evec.size() << endl;
// The caller must supply strictly more than Nm slots in both containers.
assert(Nm < evec.size() && Nm < lmd.size());
// Work space: off-diagonal (lme), scratch copies of the tridiagonal matrix
// (lmd2, lme2), accumulated rotations (Qt, Nm x Nm), indices of converged
// modes (Iconv) and rotated basis vectors (B).
vector<RealD> lme(Nm);
vector<RealD> lmd2(Nm);
vector<RealD> lme2(Nm);
vector<RealD> Qt(Nm*Nm);
vector<int> Iconv(Nm);
vector<Field> B(Nm);
for(int k=0; k<Nm; ++k) B[k].resize(fsize);
Field f(fsize);
Field v(fsize);
// k1 and kconv are never modified below, so the active block always starts
// at row 1 and Nm2 always equals Nm.
int k1 = 1;
int k2 = Nk_;
int kconv = 0;
int Kdis = 0;
int Kthrs = 0;
RealD beta_k;
// Set initial vector
evec[0] = 1.0;
RealD vnorm = evec[0]*evec[0];
evec[0] = 1.0/sqrt(vnorm);
// (uniform vector)
// Initial Nk steps
for(int k=0; k<k2; ++k) step(lmd,lme,evec,f,Nm,k);
// Restarting loop begins
for(int iter = 0; iter<Niter_; ++iter){
std::cout<<"\n iteration = "<< iter << endl;
int Nm2 = Nm - kconv;
// Extend the factorisation from k2 to Nm vectors.
for(int k=k2; k<Nm; ++k) step(lmd,lme,evec,f,Nm,k);
f *= lme[Nm-1];
// getting eigenvalues
for(int k=0; k<Nm2; ++k){
lmd2[k] = lmd[k+k1-1];
lme2[k] = lme[k+k1-1];
}
setUnit_Qt(Nm,Qt);
diagonalize(lmd2,lme2,Nm2,Nm,Qt);
// sorting
sort_->push(lmd2,Nm);
// Implicitly shifted QR transformations
// Entries lmd2[k2 .. Nm-1] of the sorted spectrum are used as the shifts.
setUnit_Qt(Nm,Qt);
for(int ip=k2; ip<Nm; ++ip)
qr_decomp(lmd,lme,Nm,Nm,Qt,lmd2[ip],k1,Nm);
// Rotate the basis: B[j] = sum_k Qt(k,j) * evec[k], then copy back.
for(int i=0; i<(Nk_+1); ++i) B[i] = 0.0;
for(int j=k1-1; j<k2+1; ++j){
for(int k=0; k<Nm; ++k){
B[j] += Qt[k+Nm*j] * evec[k];
}
}
for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
// Compressed vector f and beta(k2)
f *= Qt[Nm-1+Nm*(k2-1)];
f += lme[k2-1] * evec[k2];
beta_k = f * f;
beta_k = sqrt(beta_k);
std::cout<<" beta(k) = "<<beta_k<<endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
// Convergence test
for(int k=0; k<Nm2; ++k){
lmd2[k] = lmd[k];
lme2[k] = lme[k];
}
setUnit_Qt(Nm,Qt);
diagonalize(lmd2,lme2,Nk_,Nm,Qt);
// B[j] becomes the j-th Ritz vector of the compressed Nk_ x Nk_ problem.
for(int k = 0; k<Nk_; ++k) B[k]=0.0;
for(int j = 0; j<Nk_; ++j){
for(int k = 0; k<Nk_; ++k){
B[j] += Qt[k+j*Nm] * evec[k];
}
}
Kdis = 0;
Kthrs = 0;
std::cout << setiosflags(ios_base::scientific);
for(int i=0; i<Nk_; ++i){
// Rayleigh quotient lmd2[i] and squared residual vv of Ritz vector B[i].
v = opr_->mult(B[i]);
//std::cout<<"vv="<<v*v<<std::endl;
RealD vnum = B[i]*v;
RealD vden = B[i]*B[i];
lmd2[i] = vnum/vden;
v -= lmd2[i]*B[i];
RealD vv = v*v;
std::cout << " [" << setw(3)<< setiosflags(ios_base::right) <<i<<"] ";
std::cout << setw(25)<< setiosflags(ios_base::left)<< lmd2[i];
std::cout <<" "<< setw(25)<< setiosflags(ios_base::right)<< vv<< endl;
// Kdis counts modes whose squared residual is below enorm_; Kthrs counts
// those of them for which sort_->saturated(...) is true.
if(vv<enorm_){
Iconv[Kdis] = i;
++Kdis;
if(sort_->saturated(lmd2[i],vthr)) ++Kthrs;
std::cout<<"Kthrs="<<Kthrs<<endl;
}
} // i-loop end
std::cout << resetiosflags(ios_base::scientific);
std::cout<<" #modes converged: "<<Kdis<<endl;
if(Kthrs > 0){
// (there is a converged eigenvalue larger than Vthrs.)
Nconv = iter;
goto converged;
}
} // end of iter loop
std::cout<<"\n NOT converged.\n";
abort();
converged:
// Sorting
// Replace the caller's containers with the Kdis converged pairs only.
lmd.clear();
evec.clear();
for(int i=0; i<Kdis; ++i){
lmd.push_back(lmd2[Iconv[i]]);
evec.push_back(B[Iconv[i]]);
}
sort_->push(lmd,evec,Kdis);
Nsbt = Kdis - Kthrs;
std::cout << "\n Converged\n Summary :\n";
std::cout << " -- Iterations = "<< Nconv << "\n";
std::cout << " -- beta(k) = "<< beta_k << "\n";
std::cout << " -- Kdis = "<< Kdis << "\n";
std::cout << " -- Nsbt = "<< Nsbt << "\n";
}
};
}
#endif

View File

@ -0,0 +1,48 @@
#ifndef GRID_MATRIX_UTILS_H
#define GRID_MATRIX_UTILS_H
namespace Grid {
namespace MatrixUtils {
template<class T> inline void Size(Matrix<T>& A,int &N,int &M){
N=A.size(); assert(N>0);
M=A[0].size();
for(int i=0;i<N;i++){
assert(A[i].size()==M);
}
}
template<class T> inline void SizeSquare(Matrix<T>& A,int &N)
{
int M;
Size(A,N,M);
assert(N==M);
}
template<class T> inline void Fill(Matrix<T>& A,T & val)
{
int N,M;
Size(A,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
A[i][j]=val;
}}
}
template<class T> inline void Diagonal(Matrix<T>& A,T & val)
{
int N;
SizeSquare(A,N);
for(int i=0;i<N;i++){
A[i][i]=val;
}
}
template<class T> inline void Identity(Matrix<T>& A)
{
Fill(A,0.0);
Diagonal(A,1.0);
}
};
}
#endif

View File

@ -29,6 +29,9 @@ extern int GridCshiftPermuteMap[4][16];
class LatticeBase {}; class LatticeBase {};
class LatticeExpressionBase {}; class LatticeExpressionBase {};
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Aligned allocator??
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
template <typename Op, typename T1> template <typename Op, typename T1>
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
public: public:
@ -59,7 +62,7 @@ public:
GridBase *_grid; GridBase *_grid;
int checkerboard; int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata; Vector<vobj> _odata;
// to pthread need a computable loop where loop induction is not required // to pthread need a computable loop where loop induction is not required
int begin(void) { return 0;}; int begin(void) { return 0;};

View File

@ -42,7 +42,7 @@ namespace Grid{
// Staple in direction mu // Staple in direction mu
WilsonLoops<GaugeField>::Staple(dSdU_mu,U,mu); WilsonLoops<GaugeField>::Staple(dSdU_mu,U,mu);
dSdU_mu = Ta(Umu*adj(dSdU_mu))*factor; dSdU_mu = Ta(Umu*adj(dSdU_mu))*factor;
pokeLorentz(dSdU, dSdU_mu, mu); PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
} }
}; };
}; };

View File

@ -10,15 +10,15 @@ public:
virtual void push(const std::string &s) = 0; virtual void push(const std::string &s) = 0;
virtual void pop(void) =0; virtual void pop(void) =0;
virtual void write( const std::string& s,const std::string &output ) =0; virtual void write( const std::string& s,const std::string &output ) =0;
virtual void write( const std::string& s, int16_t output ) =0; virtual void write( const std::string& s,const int16_t output ) =0;
virtual void write( const std::string& s, uint16_t output ) =0; virtual void write( const std::string& s,const uint16_t output ) =0;
virtual void write( const std::string& s, int32_t output ) =0; virtual void write( const std::string& s,const int32_t output ) =0;
virtual void write( const std::string& s, uint32_t output ) =0; virtual void write( const std::string& s,const uint32_t output ) =0;
virtual void write( const std::string& s, int64_t output ) =0; virtual void write( const std::string& s,const int64_t output ) =0;
virtual void write( const std::string& s, uint64_t output ) =0; virtual void write( const std::string& s,const uint64_t output ) =0;
virtual void write( const std::string& s, float output ) =0; virtual void write( const std::string& s,const float output ) =0;
virtual void write( const std::string& s, double output ) =0; virtual void write( const std::string& s,const double output ) =0;
virtual void write( const std::string& s, bool output ) =0; virtual void write( const std::string& s,const bool output ) =0;
}; };

View File

@ -35,19 +35,19 @@ public:
write(s,cstr[c]); write(s,cstr[c]);
} }
}; };
void write( const std::string& s, char output ) { writeInternal(s,output); }; void write( const std::string& s,const char output ) { writeInternal(s,output); };
void write( const std::string& s, int16_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint16_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s, int32_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint32_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s, int64_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint64_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s, float output ) { writeInternal(s,output); }; void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s, double output ) { writeInternal(s,output); }; void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s, bool output ) { writeInternal(s,output); }; void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private: private:
template<class T> void writeInternal( const std::string& s, T output ){ template<class T> void writeInternal( const std::string& s,const T output ){
// FIXME --- htons, htonl, htno64 etc.. // FIXME --- htons, htonl, htno64 etc..
file.write((char *)&output,sizeof(T)); file.write((char *)&output,sizeof(T));
} }

View File

@ -120,14 +120,14 @@ THE SOFTWARE.
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \
\ \
\ \
template<class Writer> friend void write(Writer &WR,const std::string &s, const cname &obj){ \ friend void write(Writer &WR,const std::string &s, const cname &obj){ \
push(WR,s);\ push(WR,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
pop(WR);\ pop(WR);\
} \ } \
\ \
\ \
template<class Reader> friend void read(Reader &RD,const std::string &s, cname &obj){ \ friend void read(Reader &RD,const std::string &s, cname &obj){ \
push(RD,s);\ push(RD,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
pop(RD);\ pop(RD);\

View File

@ -3,6 +3,7 @@
#include <serialisation/MacroMagic.h> #include <serialisation/MacroMagic.h>
#include <serialisation/BaseIO.h> #include <serialisation/BaseIO.h>
#include <stdint.h>
namespace Grid { namespace Grid {
@ -12,17 +13,17 @@ namespace Grid {
inline void push(Writer & WR,const char *s) { WR.push(std::string(s));} inline void push(Writer & WR,const char *s) { WR.push(std::string(s));}
inline void pop (Writer & WR) { WR.pop();} inline void pop (Writer & WR) { WR.pop();}
inline void write(Writer& wr, const std::string& s,const char * output ) { wr.write(s,std::string(output)); }; // inline void write(Writer& wr, const std::string& s,const char * output ) { wr.write(s,std::string(output)); };
inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, int16_t output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const int16_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, uint16_t output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const uint16_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, int32_t output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const int32_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, uint32_t output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const uint32_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, int64_t output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const int64_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, uint64_t output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const uint64_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, float output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const float output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, double output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const double output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s, bool output ) { wr.write(s,output); }; inline void write(Writer& wr, const std::string& s,const bool output ) { wr.write(s,output); };
inline void push(Reader & WR,const std::string &s) { WR.push(s);} inline void push(Reader & WR,const std::string &s) { WR.push(s);}
inline void push(Reader & WR,const char *s) { WR.push(std::string(s));} inline void push(Reader & WR,const char *s) { WR.push(std::string(s));}

View File

@ -43,19 +43,19 @@ public:
indent(); indent();
file<<output<<std::endl; file<<output<<std::endl;
}; };
void write( const std::string& s, int16_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint16_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s, int32_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint32_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s, int64_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint64_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s, float output ) { writeInternal(s,output); }; void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s, double output ) { writeInternal(s,output); }; void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s, bool output ) { writeInternal(s,output); }; void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private: private:
template<class T> void writeInternal( const std::string& s, T output ){ template<class T> void writeInternal( const std::string& s,const T output ){
indent(); indent();
file << std::boolalpha << output<<std::endl; file << std::boolalpha << output<<std::endl;
} }

View File

@ -49,19 +49,20 @@ public:
pugi::xml_node leaf=node.append_child(s.c_str()); pugi::xml_node leaf=node.append_child(s.c_str());
leaf.append_child(pugi::node_pcdata).set_value(output.c_str()); leaf.append_child(pugi::node_pcdata).set_value(output.c_str());
}; };
void write( const std::string& s, int16_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint16_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s, int32_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint32_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s, int64_t output ) { writeInternal(s,output); }; void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s, uint64_t output ) { writeInternal(s,output); }; void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s, float output ) { writeInternal(s,output); }; void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s, double output ) { writeInternal(s,output); }; void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s, bool output ) { writeInternal(s,output); }; void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private: private:
template<class T> void writeInternal( const std::string& s, T output ){ template<class T> void writeInternal( const std::string& s,const T output ){
std::ostringstream os; std::ostringstream os;
os << std::boolalpha << output; os << std::boolalpha << output;
write(s,os.str()); write(s,os.str());

View File

@ -149,49 +149,33 @@ namespace Optimization {
} }
}; };
// Note, we can beat the shuf overhead in chain with two temporaries
// Ar Ai , Br Bi, Ai Ar // one shuf
//tmpr Ar Br, Ai Bi // Mul/Mac/Mac
//tmpi Br Ai, Bi Ar // Mul/Mac/Mac
// add tmpi,shuf(tmpi)
// sub tmpr,shuf(tmpi)
// shuf(tmpr,tmpi). // Could drop/trade for write mask
// Gives
// 2mul,4 mac +add+sub = 8 flop type insns
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline __m512 operator()(__m512 a, __m512 b){ inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag; // dup, dup, perm, mul, madd
vzero = _mm512_setzero_ps(); __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0); a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0); return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
} }
// Complex double // Complex double
inline __m512d operator()(__m512d a, __m512d b){ inline __m512d operator()(__m512d a, __m512d b){
/* This is from __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
* @inproceedings{McFarlin:2011:ASV:1995896.1995938, a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus}, return _mm512_fmaddsub_pd( a_real, b, a_imag );
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
} }
}; };
@ -227,12 +211,12 @@ namespace Optimization {
//Complex single //Complex single
inline __m512 operator()(__m512 in, __m512 ret){ inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
} }
//Complex double //Complex double
inline __m512d operator()(__m512d in, __m512d ret){ inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
} }
@ -241,13 +225,13 @@ namespace Optimization {
struct TimesI{ struct TimesI{
//Complex single //Complex single
inline __m512 operator()(__m512 in, __m512 ret){ inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
} }
//Complex double //Complex double
inline __m512d operator()(__m512d in, __m512d ret){ inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK __m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
} }
@ -325,8 +309,8 @@ namespace Grid {
} conv; } conv;
conv.v = b.v; conv.v = b.v;
switch(perm){ switch(perm){
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break; default: assert(0); break;

355
lib/simd/Grid_imci.h Normal file
View File

@ -0,0 +1,355 @@
//----------------------------------------------------------------------
/*! @file Grid_imci.h
@brief Optimization libraries for the IMCI instruction set (Knights Corner)
Using intrinsics
*/
// Time-stamp: <2015-06-09 14:27:28 neo>
//----------------------------------------------------------------------
#include <immintrin.h>
#ifndef KNC_ONLY_STORES
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
#endif
namespace Optimization {
struct Vsplat{
  // Complex float: fills the register with repeated (a,b) pairs.
  // _mm512_set_ps lists its arguments from the highest element down, so `a`
  // ends up in the even elements and `b` in the odd ones.
  inline __m512 operator()(float a, float b){
    const __m512 pairs = _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
    return pairs;
  }
  // Real float: every element is a.
  inline __m512 operator()(float a){
    const __m512 all = _mm512_set1_ps(a);
    return all;
  }
  // Complex double: repeated (a,b) pairs, `a` in the even elements.
  inline __m512d operator()(double a, double b){
    const __m512d pairs = _mm512_set_pd(b,a,b,a,b,a,b,a);
    return pairs;
  }
  // Real double: every element is a.
  inline __m512d operator()(double a){
    const __m512d all = _mm512_set1_pd(a);
    return all;
  }
  // Integer: every 32-bit element is a.
  inline __m512i operator()(Integer a){
    const __m512i all = _mm512_set1_epi32(a);
    return all;
  }
};
struct Vstore{
  // Writes the whole register `a` to memory at the given pointer.
  // These are the aligned store intrinsics, so the destination is expected to
  // be 64-byte aligned.

  // Float
  inline void operator()(__m512 a, float* F){
    float* const dst = F;
    _mm512_store_ps(dst,a);
  }
  // Double
  inline void operator()(__m512d a, double* D){
    double* const dst = D;
    _mm512_store_pd(dst,a);
  }
  // Integer: the pointer is reinterpreted as a pointer to a full register.
  inline void operator()(__m512i a, Integer* I){
    __m512i* const dst = (__m512i *)I;
    _mm512_store_si512(dst,a);
  }
};
struct Vstream{
  // Store b to the destination a through _mm512_storenrngo_ps.
  // Unless KNC_ONLY_STORES is defined, the macros at the top of this file
  // redirect that name to the ordinary _mm512_store_ps.
  inline void operator()(float * a, __m512 b){
    float *dst = a;
    _mm512_storenrngo_ps(dst,b);
  }
  // Double precision counterpart (same macro redirection applies to
  // _mm512_storenrngo_pd).
  inline void operator()(double * a, __m512d b){
    double *dst = a;
    _mm512_storenrngo_pd(dst,b);
  }
};
struct Vset{
  // Complex single precision: pack a[0]..a[7] as interleaved (real, imag)
  // pairs. Arguments run from the highest lane down to lane 0.
  inline __m512 operator()(Grid::ComplexF *a){
    return _mm512_set_ps(a[7].imag(), a[7].real(),
                         a[6].imag(), a[6].real(),
                         a[5].imag(), a[5].real(),
                         a[4].imag(), a[4].real(),
                         a[3].imag(), a[3].real(),
                         a[2].imag(), a[2].real(),
                         a[1].imag(), a[1].real(),
                         a[0].imag(), a[0].real());
  }
  // Complex double precision: pack a[0]..a[3] as interleaved (real, imag) pairs.
  inline __m512d operator()(Grid::ComplexD *a){
    return _mm512_set_pd(a[3].imag(), a[3].real(),
                         a[2].imag(), a[2].real(),
                         a[1].imag(), a[1].real(),
                         a[0].imag(), a[0].real());
  }
  // Real single precision: a[0] goes to lane 0, a[15] to lane 15.
  inline __m512 operator()(float *a){
    return _mm512_set_ps(a[15], a[14], a[13], a[12],
                         a[11], a[10], a[9],  a[8],
                         a[7],  a[6],  a[5],  a[4],
                         a[3],  a[2],  a[1],  a[0]);
  }
  // Real double precision: a[0] goes to lane 0, a[7] to lane 7.
  inline __m512d operator()(double *a){
    return _mm512_set_pd(a[7], a[6], a[5], a[4],
                         a[3], a[2], a[1], a[0]);
  }
  // Integer: a[0] goes to lane 0, a[15] to lane 15.
  inline __m512i operator()(Integer *a){
    return _mm512_set_epi32(a[15], a[14], a[13], a[12],
                            a[11], a[10], a[9],  a[8],
                            a[7],  a[6],  a[5],  a[4],
                            a[3],  a[2],  a[1],  a[0]);
  }
};
template <typename Out_type, typename In_type>
struct Reduce{
  // Primary template. It exists only so the output type can be selected by
  // template argument; the real work is done by explicit specializations.
  // Reaching this generic body at run time reports the mistake and terminates
  // the program with exit status 1.
  inline Out_type operator()(In_type in){
    printf("Error, using wrong Reduce function\n");
    exit(1);
    // Never reached; present only to satisfy the return type.
    Out_type unreachable = 0;
    return unreachable;
  }
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
  // Lane-wise a + b, single precision (serves both real and complex data).
  inline __m512 operator()(__m512 a, __m512 b){
    __m512 total = _mm512_add_ps(a,b);
    return total;
  }
  // Lane-wise a + b, double precision (serves both real and complex data).
  inline __m512d operator()(__m512d a, __m512d b){
    __m512d total = _mm512_add_pd(a,b);
    return total;
  }
  // Lane-wise a + b on 32-bit integers.
  inline __m512i operator()(__m512i a, __m512i b){
    __m512i total = _mm512_add_epi32(a,b);
    return total;
  }
};
struct Sub{
  // Lane-wise a - b, single precision (serves both real and complex data).
  inline __m512 operator()(__m512 a, __m512 b){
    __m512 difference = _mm512_sub_ps(a,b);
    return difference;
  }
  // Lane-wise a - b, double precision (serves both real and complex data).
  inline __m512d operator()(__m512d a, __m512d b){
    __m512d difference = _mm512_sub_pd(a,b);
    return difference;
  }
  // Lane-wise a - b on 32-bit integers.
  inline __m512i operator()(__m512i a, __m512i b){
    __m512i difference = _mm512_sub_epi32(a,b);
    return difference;
  }
};
// Lane-wise complex multiplication a*b on registers holding interleaved
// (real, imag) pairs, i.e. the layout produced by Vsplat/Vset in this file.
// For each pair the result is
//   (a.re*b.re - a.im*b.im , a.re*b.im + a.im*b.re).
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // exchange neighbours: (a.im, a.re) -- per Intel's CDAB swizzle description
// Odd lanes (mask 0xAAAA) take 0|ymm0 = a.re, even lanes keep a.re from a: real = (a.re, a.re)
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
// Even lanes (mask 0x5555) take 0-ymm0 = -a.im, odd lanes keep a.im from a: imag = (-a.im, a.im)
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
// ymm1 = (a.re*b.re, a.re*b.im)
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // exchange neighbours: (b.im, b.re)
// ymm0*imag + ymm1 = (-a.im*b.im + a.re*b.re, a.im*b.re + a.re*b.im)
return _mm512_fmadd_ps(ymm0,imag,ymm1);
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
// Same sequence as the float overload, with 8-bit masks for the eight
// double lanes (0xAA = odd lanes, 0x55 = even lanes).
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // exchange neighbours: (a.im, a.re)
// real = (a.re, a.re)
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
// imag = (-a.im, a.im)
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
// ymm1 = (a.re*b.re, a.re*b.im)
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // exchange neighbours: (b.im, b.re)
// ymm0*imag + ymm1 = (-a.im*b.im + a.re*b.re, a.im*b.re + a.re*b.im)
return _mm512_fmadd_pd(ymm0,imag,ymm1);
}
};
struct Mult{
  // Lane-wise a * b for real single precision data.
  inline __m512 operator()(__m512 a, __m512 b){
    __m512 product = _mm512_mul_ps(a,b);
    return product;
  }
  // Lane-wise a * b for real double precision data.
  inline __m512d operator()(__m512d a, __m512d b){
    __m512d product = _mm512_mul_pd(a,b);
    return product;
  }
  // Lane-wise a * b on 32-bit integers, keeping the low 32 bits of each product.
  inline __m512i operator()(__m512i a, __m512i b){
    __m512i product = _mm512_mullo_epi32(a,b);
    return product;
  }
};
struct Conj{
  // Complex conjugate, single precision: the odd (imaginary) lanes selected
  // by mask 0xaaaa become 0 - in, the even (real) lanes are passed through.
  inline __m512 operator()(__m512 in){
    const __m512 zero = _mm512_setzero_ps();
    return _mm512_mask_sub_ps(in,0xaaaa,zero,in);
  }
  // Complex conjugate, double precision: same operation with the 8-lane
  // mask 0xaa.
  inline __m512d operator()(__m512d in){
    const __m512d zero = _mm512_setzero_pd();
    return _mm512_mask_sub_pd(in, 0xaa,zero, in);
  }
  // Intentionally no overload for integer input.
};
// Multiply every interleaved complex element (re, im) by -i:
//   -i*(re + i*im) = im - i*re   ->   lanes (im, -re)
// The second argument is not used; it is kept so existing callers that pass
// it continue to compile.
//
// The lanes must be exchanged BEFORE the imaginary lane is negated.
// Negating first and exchanging afterwards yields (-im, re), which is +i*z.
struct TimesMinusI{
  //Complex single
  inline __m512 operator()(__m512 in, __m512 ret){
    __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);           // (re, im) -> (im, re)
    return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);   // negate odd lanes: (im, -re)
  }
  //Complex double
  inline __m512d operator()(__m512d in, __m512d ret){
    __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);          // (re, im) -> (im, re)
    return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);     // negate odd lanes: (im, -re)
  }
};
// Multiply every interleaved complex element (re, im) by +i:
//   i*(re + i*im) = -im + i*re   ->   lanes (-im, re)
// The second argument is not used; it is kept so existing callers that pass
// it continue to compile.
//
// After the lane exchange the value that needs its sign flipped sits in the
// EVEN (real) lane, hence masks 0x5555 / 0x55. Negating the odd lanes instead
// yields (im, -re), which is -i*z.
struct TimesI{
  //Complex single
  inline __m512 operator()(__m512 in, __m512 ret){
    __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);           // (re, im) -> (im, re)
    return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);   // negate even lanes: (-im, re)
  }
  //Complex double
  inline __m512d operator()(__m512d in, __m512d ret){
    __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);          // (re, im) -> (im, re)
    return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);     // negate even lanes: (-im, re)
  }
};
//////////////////////////////////////////////
// Some Template specialization
// Complex single precision reduction: the even lanes (mask 0x5555) are summed
// into the real part and the odd lanes (mask 0xAAAA) into the imaginary part.
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
  const float re_sum = _mm512_mask_reduce_add_ps(0x5555, in);
  const float im_sum = _mm512_mask_reduce_add_ps(0xAAAA, in);
  return Grid::ComplexF(re_sum,im_sum);
}
// Real single precision reduction: sum of all sixteen lanes.
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
  const float lane_sum = _mm512_reduce_add_ps(in);
  return lane_sum;
}
// Complex double precision reduction: the even lanes (mask 0x55) are summed
// into the real part and the odd lanes (mask 0xAA) into the imaginary part.
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
  const double re_sum = _mm512_mask_reduce_add_pd(0x55, in);
  const double im_sum = _mm512_mask_reduce_add_pd(0xAA, in);
  return Grid::ComplexD(re_sum,im_sum);
}
// Real double precision reduction: sum of all eight lanes.
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
  const double lane_sum = _mm512_reduce_add_pd(in);
  return lane_sum;
}
// Integer reduction: sum of all sixteen 32-bit lanes.
// This used to print a message and assert(0) without returning a value, so
// with assertions compiled out (NDEBUG) control fell off the end of a
// non-void function. It now performs the reduction, mirroring the
// floating-point specializations.
// NOTE(review): assumes Integer is a 32-bit type, as the use of
// _mm512_set1_epi32 / _mm512_add_epi32 elsewhere in this file suggests -- confirm.
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
  return _mm512_reduce_add_epi32(in);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
using SIMD_Ftype = __m512;   // single precision vector register
using SIMD_Dtype = __m512d;  // double precision vector register
using SIMD_Itype = __m512i;  // integer vector register
// prefetch
// Step through `size` bytes starting at ptr in 64-byte strides. At every step
// two prefetch hints are issued ahead of the current position: one 4096 bytes
// ahead with _MM_HINT_T1 and one 512 bytes ahead with _MM_HINT_T0.
inline void v_prefetch0(int size, const char *ptr){
  int offset = 0;
  while ( offset < size ) {
    const char *here = ptr + offset;
    _mm_prefetch(here+4096,_MM_HINT_T1);
    _mm_prefetch(here+512,_MM_HINT_T0);
    offset += 64;
  }
}
// Issue a single prefetch hint with _MM_HINT_T0 for the address ptr.
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Gpermute utilities consider coalescing into 1 Gpermute
//
// Copy b into y with its lanes rearranged according to perm. The payload b.v
// is reinterpreted as a __m512 through a union so the same single precision
// shuffles serve every VectorSIMD type.
//   perm 3 : swizzle with _MM_SWIZ_REG_CDAB
//   perm 2 : swizzle with _MM_SWIZ_REG_BADC
//   perm 1 : 128-bit block permute with _MM_SHUFFLE(2,3,0,1)
//   perm 0 : 128-bit block permute with _MM_SHUFFLE(1,0,3,2)
// Any other value trips assert(0); when assertions are compiled out, y simply
// receives an unpermuted copy of b.
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
  union {
    __m512 f;
    decltype(VectorSIMD::v) v;
  } pun;
  pun.v = b.v;
  if ( perm == 3 ) {
    pun.f = _mm512_swizzle_ps(pun.f,_MM_SWIZ_REG_CDAB);
  } else if ( perm == 2 ) {
    pun.f = _mm512_swizzle_ps(pun.f,_MM_SWIZ_REG_BADC);
  } else if ( perm == 1 ) {
    pun.f = _mm512_permute4f128_ps(pun.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1));
  } else if ( perm == 0 ) {
    pun.f = _mm512_permute4f128_ps(pun.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2));
  } else {
    assert(0);
  }
  y.v = pun.v;
}
// Architecture-neutral names for the functors defined in namespace Optimization
using VsplatSIMD  = Optimization::Vsplat;
using VstoreSIMD  = Optimization::Vstore;
using VsetSIMD    = Optimization::Vset;
using VstreamSIMD = Optimization::Vstream;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
using SumSIMD         = Optimization::Sum;
using SubSIMD         = Optimization::Sub;
using MultSIMD        = Optimization::Mult;
using MultComplexSIMD = Optimization::MultComplex;
using ConjSIMD        = Optimization::Conj;
using TimesMinusISIMD = Optimization::TimesMinusI;
using TimesISIMD      = Optimization::TimesI;
}

View File

@ -19,6 +19,9 @@
#if defined AVX512 #if defined AVX512
#include "Grid_avx512.h" #include "Grid_avx512.h"
#endif #endif
#if defined IMCI
#include "Grid_imci.h"
#endif
#if defined QPX #if defined QPX
#include "Grid_qpx.h" #include "Grid_qpx.h"
#endif #endif
@ -263,15 +266,13 @@ namespace Grid {
// this is only for the complex version // this is only for the complex version
template <class S, class V, IfComplex<S> =0, class ABtype> template <class S, class V, IfComplex<S> =0, class ABtype>
inline void vsplat(Grid_simd<S,V> &ret,ABtype a, ABtype b){ inline void vsplat(Grid_simd<S,V> &ret,ABtype a, ABtype b){
ret.v = binary<V>(a, b, VsplatSIMD()); ret.v = binary<V>(a, b, VsplatSIMD());
} }
// overload if complex // overload if complex
template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) { template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
Real a = real(c); vsplat(ret,real(c),imag(c));
Real b = imag(c);
vsplat(ret,a,b);
} }
//if real fill with a, if complex fill with a in the real part (first function above) //if real fill with a, if complex fill with a in the real part (first function above)
@ -290,8 +291,8 @@ namespace Grid {
template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}
// if not complex overload here // if not complex overload here
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,1.0); } template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,0.0); } template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
// For integral types // For integral types
template <class S,class V,IfInteger<S> = 0 > inline void vone(Grid_simd<S,V> &ret) {vsplat(ret,1); } template <class S,class V,IfInteger<S> = 0 > inline void vone(Grid_simd<S,V> &ret) {vsplat(ret,1); }
@ -304,13 +305,18 @@ namespace Grid {
/////////////////////// ///////////////////////
// Vstream // Vstream
/////////////////////// ///////////////////////
template <class S,class V, IfNotInteger<S> = 0 > template <class S,class V, IfReal<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){ inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
binary<void>((Real*)&out.v, in.v, VstreamSIMD()); binary<void>((S *)&out.v, in.v, VstreamSIMD());
} }
template <class S,class V, IfComplex<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
typedef typename S::value_type T;
binary<void>((T *)&out.v, in.v, VstreamSIMD());
}
template <class S,class V, IfInteger<S> = 0 > template <class S,class V, IfInteger<S> = 0 >
inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){ inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
out=in; out=in;
} }

View File

@ -44,7 +44,10 @@ icpc-avx512)
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
icpc-mic) icpc-mic)
CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-mic-avx512)
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
clang-sse) clang-sse)
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none

View File

@ -1,5 +1,5 @@
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_partfrac_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
Test_GaugeAction_SOURCES=Test_GaugeAction.cc Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@ -85,6 +85,8 @@ Test_dwf_fpgcr_LDADD=-lGrid
Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
Test_dwf_hdcr_LDADD=-lGrid Test_dwf_hdcr_LDADD=-lGrid
#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
#Test_dwf_lanczos_LDADD=-lGrid
Test_gamma_SOURCES=Test_gamma.cc Test_gamma_SOURCES=Test_gamma.cc
Test_gamma_LDADD=-lGrid Test_gamma_LDADD=-lGrid

57
tests/Test_dwf_lanczos.cc Normal file
View File

@ -0,0 +1,57 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Test driver: builds a domain wall fermion operator on a random gauge field
// and hands its MdagM linear operator to ImplicitlyRestartedLanczos.
// NOTE(review): PolyX, Nsbt and Nconv are used below but are not declared
// anywhere in the visible part of this file, so this translation unit does not
// look compilable as written -- confirm what was intended.
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Extent of the fifth dimension.
const int Ls=8;
// Four-dimensional grid and its red-black counterpart, then the
// five-dimensional versions built on top of the four-dimensional grid.
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Fixed seeds for the two random number generators.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// Source fermion on the five-dimensional grid, filled via gaussian().
LatticeFermion src(FGrid); gaussian(RNG5,src);
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
// Per-direction links peeled out of Umu. U is not referenced again in this
// function after the loop.
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass=0.1;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
// Parameters forwarded to the Lanczos constructor below.
const int Nk = 10;
const int Np = 1;
RealD enorm = 1.0;
RealD vthrs = 1;
const int Nit= 1000;
// NOTE(review): PolyX is not declared in the visible code.
ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,
Nk,Np,enorm,vthrs,Nit);
// Output containers sized by Nk.
std::vector<RealD> eval(Nk);
std::vector<LatticeFermion> evec(Nk,FGrid);
// NOTE(review): Nsbt and Nconv are not declared in the visible code.
IRL.calc(eval,evec,
src,
Nsbt,
Nconv);
Grid_finalize();
}

View File

@ -1,6 +1,6 @@
#include <Grid.h> #include <Grid.h>
using namespace Grid; namespace Grid {
class myclass { class myclass {
public: public:
@ -24,29 +24,32 @@ public:
}; };
}
uint16_t i16 = 1; int16_t i16 = 1;
uint16_t u16 = 2; uint16_t u16 = 2;
uint32_t i32 = 3; int32_t i32 = 3;
uint32_t u32 = 4; uint32_t u32 = 4;
uint64_t i64 = 5; int64_t i64 = 5;
uint64_t u64 = 6; uint64_t u64 = 6;
float f = M_PI; float f = M_PI;
double d = 2*M_PI; double d = 2*M_PI;
bool b = false; bool b = false;
using namespace Grid;
int main(int argc,char **argv) int main(int argc,char **argv)
{ {
{ {
XMLWriter WR("bother.xml"); XMLWriter WR("bother.xml");
push(WR,"BasicTypes"); push(WR,"BasicTypes");
write(WR,"i16",i16); write(WR,std::string("i16"),i16);
write(WR,"u16",u16); write(WR,"u16",u16);
write(WR,"i32",i32); write(WR,"i32",i32);
write(WR,"i32",u32); write(WR,"u32",u32);
write(WR,"i64",i64); write(WR,"i64",i64);
write(WR,"i64",u64); write(WR,"u64",u64);
write(WR,"f",f); write(WR,"f",f);
write(WR,"d",d); write(WR,"d",d);
write(WR,"b",b); write(WR,"b",b);