1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge remote-tracking branch 'upstream/master'

Conflicts:
	lib/simd/Grid_vector_types.h
	tests/Makefile.am
This commit is contained in:
neo 2015-05-20 17:32:46 +09:00
commit f8d8958884
36 changed files with 670 additions and 237 deletions

View File

@ -10,6 +10,8 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi(); std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
int Nloop=10; int Nloop=10;
int nmu=0; int nmu=0;

View File

@ -16,6 +16,9 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi(); std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout << "===================================================================================================="<<std::endl; std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking fused AXPY bandwidth"<<std::endl; std::cout << "= Benchmarking fused AXPY bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl; std::cout << "===================================================================================================="<<std::endl;

View File

@ -13,6 +13,9 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi(); std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout << "===================================================================================================="<<std::endl; std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking SU3xSU3 x= x*y"<<std::endl; std::cout << "= Benchmarking SU3xSU3 x= x*y"<<std::endl;
std::cout << "===================================================================================================="<<std::endl; std::cout << "===================================================================================================="<<std::endl;

View File

@ -26,6 +26,9 @@ int main (int argc, char ** argv)
std::vector<int> mpi_layout = GridDefaultMpi(); std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
int threads = GridThread::GetThreads();
std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> seeds({1,2,3,4}); std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid); GridParallelRNG pRNG(&Grid);
@ -46,6 +49,13 @@ int main (int argc, char ** argv)
volume=volume*latt_size[mu]; volume=volume*latt_size[mu];
} }
// Only one non-zero (y)
Umu=zero;
for(int nn=0;nn<Nd;nn++){
random(pRNG,U[nn]);
pokeIndex<LorentzIndex>(Umu,U[nn],nn);
}
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
U[mu] = peekIndex<LorentzIndex>(Umu,mu); U[mu] = peekIndex<LorentzIndex>(Umu,mu);
} }
@ -53,7 +63,6 @@ int main (int argc, char ** argv)
{ // Naive wilson implementation { // Naive wilson implementation
ref = zero; ref = zero;
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x // ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
tmp = U[mu]*Cshift(src,mu,1); tmp = U[mu]*Cshift(src,mu,1);
for(int i=0;i<ref._odata.size();i++){ for(int i=0;i<ref._odata.size();i++){
@ -75,7 +84,7 @@ int main (int argc, char ** argv)
int ncall=1000; int ncall=1000;
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
Dw.M(src,result); Dw.Dhop(src,result,0);
} }
double t1=usecond(); double t1=usecond();
double flops=1320*volume*ncall; double flops=1320*volume*ncall;
@ -99,5 +108,30 @@ int main (int argc, char ** argv)
} }
} }
{ // Naive wilson dag implementation
ref = zero;
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
}
Dw.Dhop(src,result,1);
std::cout << "Called DwDag"<<std::endl;
std::cout << "norm result "<< norm2(result)<<std::endl;
std::cout << "norm ref "<< norm2(ref)<<std::endl;
err = ref -result;
std::cout << "norm diff "<< norm2(err)<<std::endl;
Grid_finalize(); Grid_finalize();
} }

View File

@ -0,0 +1,56 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeFermion src(&Grid); random(pRNG,src);
std::cout << "src norm" << norm2(src)<<std::endl;
LatticeFermion result(&Grid); result=zero;
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
for(int mu=0;mu<Nd;mu++){
U[mu] = peekIndex<LorentzIndex>(Umu,mu);
}
RealD mass=0.5;
WilsonMatrix Dw(Umu,mass);
HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
CG(HermOp,src,result);
Grid_finalize();
}

View File

@ -5,12 +5,14 @@ AM_LDFLAGS = -L$(top_builddir)/lib
# #
# Test code # Test code
# #
bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec
Grid_wilson_SOURCES = Grid_wilson.cc Grid_wilson_SOURCES = Grid_wilson.cc
Grid_wilson_LDADD = -lGrid Grid_wilson_LDADD = -lGrid
Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
Grid_wilson_cg_unprec_LDADD = -lGrid
Grid_comms_SOURCES = Grid_comms.cc Grid_comms_SOURCES = Grid_comms.cc
Grid_comms_LDADD = -lGrid Grid_comms_LDADD = -lGrid

View File

@ -86,7 +86,7 @@ namespace Grid {
// Common parsing chores // Common parsing chores
std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
bool GridCmdOptionExists(char** begin, char** end, const std::string& option); bool GridCmdOptionExists(char** begin, char** end, const std::string& option);
void GridParseIntVector(std::string &str,std::vector<int> & vec); std::string GridCmdVectorIntToString(const std::vector<int> & vec);
void GridParseLayout(char **argv,int argc, void GridParseLayout(char **argv,int argc,
std::vector<int> &latt, std::vector<int> &latt,

View File

@ -13,6 +13,7 @@
#include <iostream> #include <iostream>
#include <Grid.h> #include <Grid.h>
#include <algorithm> #include <algorithm>
#include <iterator>
#undef __X86_64 #undef __X86_64
#define MAC #define MAC
@ -111,6 +112,11 @@ void GridParseLayout(char **argv,int argc,
} }
std::string GridCmdVectorIntToString(const std::vector<int> & vec){
std::ostringstream oss;
std::copy(vec.begin(), vec.end(),std::ostream_iterator<int>(oss, " "));
return oss.str();
}
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
@ -120,6 +126,15 @@ void Grid_init(int *argc,char ***argv)
#endif #endif
// Parse command line args. // Parse command line args.
if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
std::cout<<"--help : this message"<<std::endl;
std::cout<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
std::cout<<"--omp n : default number of OMP threads"<<std::endl;
std::cout<<"--grid n.n.n.n : default Grid size"<<std::endl;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
@ -129,6 +144,15 @@ void Grid_init(int *argc,char ***argv)
GridParseLayout(*argv,*argc, GridParseLayout(*argv,*argc,
Grid_default_latt, Grid_default_latt,
Grid_default_mpi); Grid_default_mpi);
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<"Grid Decomposition\n";
std::cout<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
}
} }

View File

@ -50,15 +50,20 @@ namespace Grid {
typedef std::complex<Real> Complex; typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; } inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; } inline RealF conjugate(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; } inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; } inline RealD adj(const RealD & r){ return r; }
inline RealD conj(const RealD & r){ return r; } inline RealD conjugate(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; } inline RealD real(const RealD & r){ return r; }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
@ -70,15 +75,14 @@ namespace Grid {
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
inline ComplexD adj(const ComplexD& r){ return(conj(r)); } // conjugate already supported for complex
// conj already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
//conj already supported for complex //conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}

View File

@ -7,8 +7,8 @@ namespace Grid {
// LinearOperators Take a something and return a something. // LinearOperators Take a something and return a something.
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// //
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real): // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
// //SBase
// i) F(a x + b y) = aF(x) + b F(y). // i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast // ii) <x|Op|y> = <y|AdjOp|x>^\ast
// //
@ -25,12 +25,13 @@ namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> { template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
public: public:
virtual RealD OpAndNorm(const Field &in, Field &out); virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
void AdjOp(const Field &in, Field &out) { void AdjOp(const Field &in, Field &out) {
Op(in,out); Op(in,out);
}; };
void Op(const Field &in, Field &out) { void Op(const Field &in, Field &out) {
OpAndNorm(in,out); double n1,n2;
OpAndNorm(in,out,n1,n2);
}; };
}; };
@ -80,8 +81,8 @@ namespace Grid {
Matrix &_Mat; Matrix &_Mat;
public: public:
HermitianOperator(Matrix &Mat): _Mat(Mat) {}; HermitianOperator(Matrix &Mat): _Mat(Mat) {};
RealD OpAndNorm(const Field &in, Field &out){ void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){
return _Mat.MdagM(in,out); return _Mat.MdagM(in,out,n1,n2);
} }
}; };

View File

@ -18,6 +18,7 @@ public:
std::cout << Tolerance<<std::endl; std::cout << Tolerance<<std::endl;
}; };
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){ void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD cp,c,a,d,b,ssq,qq,b_pred; RealD cp,c,a,d,b,ssq,qq,b_pred;
@ -61,21 +62,27 @@ public:
Linop.OpAndNorm(p,mmp,d,qq); Linop.OpAndNorm(p,mmp,d,qq);
// std::cout <<std::setprecision(4)<< "ConjugateGradient: d,qq "<<d<< " "<<qq <<std::endl;
a = c/d; a = c/d;
b_pred = a*(a*qq-d)/c; b_pred = a*(a*qq-d)/c;
cp = axpy_norm(r,mmp,r,-a); // std::cout <<std::setprecision(4)<< "ConjugateGradient: a,bp "<<a<< " "<<b_pred <<std::endl;
cp = axpy_norm(r,-a,mmp,r);
b = cp/c; b = cp/c;
// std::cout <<std::setprecision(4)<< "ConjugateGradient: cp,b "<<cp<< " "<<b <<std::endl;
// Fuse these loops ; should be really easy // Fuse these loops ; should be really easy
psi= a*p+psi; psi= a*p+psi;
p = p*b+r; p = p*b+r;
std::cout << "Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
// Stopping condition // Stopping condition
if ( cp <= rsq ) { if ( cp <= rsq ) {
Linop.Op(p,mmp); Linop.Op(psi,mmp);
p=mmp-src; p=mmp-src;
RealD mmpnorm = sqrt(norm2(mmp)); RealD mmpnorm = sqrt(norm2(mmp));
@ -83,8 +90,11 @@ public:
RealD srcnorm = sqrt(norm2(src)); RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p)); RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm/srcnorm; RealD true_residual = resnorm/srcnorm;
std::cout<<"ConjugateGradient: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
std::cout<<"ConjugateGradient: true residual is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl; std::cout<<"ConjugateGradient: true residual is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl;
std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl; std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl;
return;
} }
} }
std::cout<<"ConjugateGradient did NOT converge"<<std::endl; std::cout<<"ConjugateGradient did NOT converge"<<std::endl;

View File

@ -102,7 +102,7 @@ template <class arg> struct name\
GridUnopClass(UnarySub,-a); GridUnopClass(UnarySub,-a);
GridUnopClass(UnaryAdj,adj(a)); GridUnopClass(UnaryAdj,adj(a));
GridUnopClass(UnaryConj,conj(a)); GridUnopClass(UnaryConj,conjugate(a));
GridUnopClass(UnaryTrace,trace(a)); GridUnopClass(UnaryTrace,trace(a));
GridUnopClass(UnaryTranspose,transpose(a)); GridUnopClass(UnaryTranspose,transpose(a));
@ -178,7 +178,7 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con
GRID_DEF_UNOP(operator -,UnarySub); GRID_DEF_UNOP(operator -,UnarySub);
GRID_DEF_UNOP(adj,UnaryAdj); GRID_DEF_UNOP(adj,UnaryAdj);
GRID_DEF_UNOP(conj,UnaryConj); GRID_DEF_UNOP(conjugate,UnaryConj);
GRID_DEF_UNOP(trace,UnaryTrace); GRID_DEF_UNOP(trace,UnaryTrace);
GRID_DEF_UNOP(transpose,UnaryTranspose); GRID_DEF_UNOP(transpose,UnaryTranspose);

View File

@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> strong_inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
conformable(lhs,rhs); conformable(x,y);
#pragma omp parallel for #pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<x._grid->oSites();ss++){
vobj tmp = a*lhs._odata[ss]; vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp+rhs._odata[ss]); vstream(ret._odata[ss],tmp);
} }
} }
template<class sobj,class vobj> strong_inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
conformable(x,y);
#pragma omp parallel for
for(int ss=0;ss<x._grid->oSites();ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp);
}
}
template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
conformable(x,y);
#pragma omp parallel for
for(int ss=0;ss<x._grid->oSites();ss++){
vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp);
}
return norm2(ret);
}
template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
conformable(x,y);
#pragma omp parallel for
for(int ss=0;ss<x._grid->oSites();ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp);
}
return norm2(ret); // FIXME implement parallel norm in ss loop
}
} }
#endif #endif

View File

@ -9,7 +9,7 @@ namespace Grid {
// Functionality: // Functionality:
// -=,+=,*=,() // -=,+=,*=,()
// add,+,sub,-,mult,mac,* // add,+,sub,-,mult,mac,*
// adj,conj // adj,conjugate
// real,imag // real,imag
// transpose,transposeIndex // transpose,transposeIndex
// trace,traceIndex // trace,traceIndex

View File

@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP
return ret; return ret;
}; };
template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid); Lattice<vobj> ret(lhs._grid);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){ for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conj(lhs._odata[ss]); ret._odata[ss] = conjugate(lhs._odata[ss]);
} }
return ret; return ret;
}; };

View File

@ -98,26 +98,26 @@ template<class vtype,int N> inline void timesMinusI(iMatrix<vtype,N> &ret,const
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Conj function for scalar, vector, matrix // Conj function for scalar, vector, matrix
/////////////////////////////////////////////// ///////////////////////////////////////////////
template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r) template<class vtype> inline iScalar<vtype> conjugate(const iScalar<vtype>&r)
{ {
iScalar<vtype> ret; iScalar<vtype> ret;
ret._internal = conj(r._internal); ret._internal = conjugate(r._internal);
return ret; return ret;
} }
template<class vtype,int N> inline iVector<vtype,N> conj(const iVector<vtype,N>&r) template<class vtype,int N> inline iVector<vtype,N> conjugate(const iVector<vtype,N>&r)
{ {
iVector<vtype,N> ret; iVector<vtype,N> ret;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
ret._internal[i] = conj(r._internal[i]); ret._internal[i] = conjugate(r._internal[i]);
} }
return ret; return ret;
} }
template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&r) template<class vtype,int N> inline iMatrix<vtype,N> conjugate(const iMatrix<vtype,N>&r)
{ {
iMatrix<vtype,N> ret; iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
ret._internal[i][j] = conj(r._internal[i][j]); ret._internal[i][j] = conjugate(r._internal[i][j]);
}} }}
return ret; return ret;
} }

View File

@ -21,7 +21,13 @@ const int WilsonMatrix::Tm = 7;
class WilsonCompressor { class WilsonCompressor {
public: public:
int mu; int mu;
int dag;
WilsonCompressor(int _dag){
mu=0;
dag=_dag;
assert((dag==0)||(dag==1));
}
void Point(int p) { void Point(int p) {
mu=p; mu=p;
}; };
@ -29,7 +35,11 @@ const int WilsonMatrix::Tm = 7;
vHalfSpinColourVector operator () (const vSpinColourVector &in) vHalfSpinColourVector operator () (const vSpinColourVector &in)
{ {
vHalfSpinColourVector ret; vHalfSpinColourVector ret;
switch(mu) { int mudag=mu;
if (dag) {
mudag=(mu+Nd)%(2*Nd);
}
switch(mudag) {
case WilsonMatrix::Xp: case WilsonMatrix::Xp:
spProjXp(ret,in); spProjXp(ret,in);
break; break;
@ -87,48 +97,49 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out) RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
{ {
Dhop(in,out); Dhop(in,out,0);
return 0.0; out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun
return norm2(out);
} }
RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out) RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
{ {
Dhop(in,out); Dhop(in,out,1);
return 0.0; out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun
return norm2(out);
} }
void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
{ {
Dhop(in,out); Dhop(in,out,0);
out = 0.5*out; // FIXME : scale factor in Dhop
} }
void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
{ {
Dhop(in,out); Dhop(in,out,1);
} }
void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
{ {
out = (4.0+mass)*in;
return ; return ;
} }
void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
{ {
out = (1.0/(4.0+mass))*in;
return ; return ;
} }
void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
{ {
out = (1.0/(4.0+mass))*in;
return ; return ;
} }
void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
{ {
out = (1.0/(4.0+mass))*in;
return ; return ;
} }
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
{ {
WilsonCompressor compressor;
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
vHalfSpinColourVector tmp; vHalfSpinColourVector tmp;
vHalfSpinColourVector chi; vHalfSpinColourVector chi;
vSpinColourVector result; vSpinColourVector result;
@ -136,17 +147,15 @@ PARALLEL_FOR_LOOP
int offset,local,perm, ptype; int offset,local,perm, ptype;
// int ss = Stencil._LebesgueReorder[sss]; // int ss = Stencil._LebesgueReorder[sss];
int ss = sss;
int ssu= ss; int ssu= ss;
// Xp // Xp
offset = Stencil._offsets [Xp][ss]; offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss]; local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss]; perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[Xp];
if ( local && perm ) ptype = Stencil._permute_type[Xp];
{ if ( local && perm ) {
spProjXp(tmp,in._odata[offset]); spProjXp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -155,7 +164,6 @@ PARALLEL_FOR_LOOP
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
//prefetch(Umu._odata[ssu](Yp));
spReconXp(result,Uchi); spReconXp(result,Uchi);
// Yp // Yp
@ -163,9 +171,7 @@ PARALLEL_FOR_LOOP
local = Stencil._is_local[Yp][ss]; local = Stencil._is_local[Yp][ss];
perm = Stencil._permute[Yp][ss]; perm = Stencil._permute[Yp][ss];
ptype = Stencil._permute_type[Yp]; ptype = Stencil._permute_type[Yp];
if ( local && perm ) {
if ( local && perm )
{
spProjYp(tmp,in._odata[offset]); spProjYp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -174,7 +180,6 @@ PARALLEL_FOR_LOOP
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
// prefetch(Umu._odata[ssu](Zp));
accumReconYp(result,Uchi); accumReconYp(result,Uchi);
// Zp // Zp
@ -182,9 +187,7 @@ PARALLEL_FOR_LOOP
local = Stencil._is_local[Zp][ss]; local = Stencil._is_local[Zp][ss];
perm = Stencil._permute[Zp][ss]; perm = Stencil._permute[Zp][ss];
ptype = Stencil._permute_type[Zp]; ptype = Stencil._permute_type[Zp];
if ( local && perm ) {
if ( local && perm )
{
spProjZp(tmp,in._odata[offset]); spProjZp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -193,7 +196,6 @@ PARALLEL_FOR_LOOP
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
// prefetch(Umu._odata[ssu](Tp));
accumReconZp(result,Uchi); accumReconZp(result,Uchi);
// Tp // Tp
@ -201,9 +203,7 @@ PARALLEL_FOR_LOOP
local = Stencil._is_local[Tp][ss]; local = Stencil._is_local[Tp][ss];
perm = Stencil._permute[Tp][ss]; perm = Stencil._permute[Tp][ss];
ptype = Stencil._permute_type[Tp]; ptype = Stencil._permute_type[Tp];
if ( local && perm ) {
if ( local && perm )
{
spProjTp(tmp,in._odata[offset]); spProjTp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -212,7 +212,6 @@ PARALLEL_FOR_LOOP
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
// prefetch(Umu._odata[ssu](Xm));
accumReconTp(result,Uchi); accumReconTp(result,Uchi);
// Xm // Xm
@ -240,8 +239,7 @@ PARALLEL_FOR_LOOP
perm = Stencil._permute[Ym][ss]; perm = Stencil._permute[Ym][ss];
ptype = Stencil._permute_type[Ym]; ptype = Stencil._permute_type[Ym];
if ( local && perm ) if ( local && perm ) {
{
spProjYm(tmp,in._odata[offset]); spProjYm(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -257,9 +255,7 @@ PARALLEL_FOR_LOOP
local = Stencil._is_local[Zm][ss]; local = Stencil._is_local[Zm][ss];
perm = Stencil._permute[Zm][ss]; perm = Stencil._permute[Zm][ss];
ptype = Stencil._permute_type[Zm]; ptype = Stencil._permute_type[Zm];
if ( local && perm ) {
if ( local && perm )
{
spProjZm(tmp,in._odata[offset]); spProjZm(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -275,9 +271,7 @@ PARALLEL_FOR_LOOP
local = Stencil._is_local[Tm][ss]; local = Stencil._is_local[Tm][ss];
perm = Stencil._permute[Tm][ss]; perm = Stencil._permute[Tm][ss];
ptype = Stencil._permute_type[Tm]; ptype = Stencil._permute_type[Tm];
if ( local && perm ) {
if ( local && perm )
{
spProjTm(tmp,in._odata[offset]); spProjTm(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
} else if ( local ) { } else if ( local ) {
@ -289,16 +283,175 @@ PARALLEL_FOR_LOOP
accumReconTm(result,Uchi); accumReconTm(result,Uchi);
vstream(out._odata[ss],result); vstream(out._odata[ss],result);
}
} }
void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
{ {
return; vHalfSpinColourVector tmp;
vHalfSpinColourVector chi;
vSpinColourVector result;
vHalfSpinColourVector Uchi;
int offset,local,perm, ptype;
int ssu= ss;
// Xp
offset = Stencil._offsets [Xm][ss];
local = Stencil._is_local[Xm][ss];
perm = Stencil._permute[Xm][ss];
ptype = Stencil._permute_type[Xm];
if ( local && perm ) {
spProjXp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjXp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
spReconXp(result,Uchi);
// Yp
offset = Stencil._offsets [Ym][ss];
local = Stencil._is_local[Ym][ss];
perm = Stencil._permute[Ym][ss];
ptype = Stencil._permute_type[Ym];
if ( local && perm ) {
spProjYp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjYp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
accumReconYp(result,Uchi);
// Zp
offset = Stencil._offsets [Zm][ss];
local = Stencil._is_local[Zm][ss];
perm = Stencil._permute[Zm][ss];
ptype = Stencil._permute_type[Zm];
if ( local && perm ) {
spProjZp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjZp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
accumReconZp(result,Uchi);
// Tp
offset = Stencil._offsets [Tm][ss];
local = Stencil._is_local[Tm][ss];
perm = Stencil._permute[Tm][ss];
ptype = Stencil._permute_type[Tm];
if ( local && perm ) {
spProjTp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjTp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
accumReconTp(result,Uchi);
// Xm
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[Xp];
if ( local && perm )
{
spProjXm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjXm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
accumReconXm(result,Uchi);
// Ym
offset = Stencil._offsets [Yp][ss];
local = Stencil._is_local[Yp][ss];
perm = Stencil._permute[Yp][ss];
ptype = Stencil._permute_type[Yp];
if ( local && perm ) {
spProjYm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjYm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
accumReconYm(result,Uchi);
// Zm
offset = Stencil._offsets [Zp][ss];
local = Stencil._is_local[Zp][ss];
perm = Stencil._permute[Zp][ss];
ptype = Stencil._permute_type[Zp];
if ( local && perm ) {
spProjZm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjZm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
accumReconZm(result,Uchi);
// Tm
offset = Stencil._offsets [Tp][ss];
local = Stencil._is_local[Tp][ss];
perm = Stencil._permute[Tp][ss];
ptype = Stencil._permute_type[Tp];
if ( local && perm ) {
spProjTm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjTm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
accumReconTm(result,Uchi);
vstream(out._odata[ss],result);
} }
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
{
assert((dag==0) ||(dag==1));
WilsonCompressor compressor(dag);
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
if ( dag ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
DhopSiteDag(sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
DhopSite(sss,in,out);
}
}
}
}} }}

View File

@ -45,10 +45,9 @@ namespace Grid {
virtual void MooeeInvDag (const LatticeFermion &in, LatticeFermion &out); virtual void MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
// non-hermitian hopping term; half cb or both // non-hermitian hopping term; half cb or both
void Dhop(const LatticeFermion &in, LatticeFermion &out); void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out);
// m+4r -1/2 Dhop; both cb's void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
void Dw(const LatticeFermion &in, LatticeFermion &out);
typedef iScalar<iMatrix<vComplex, Nc> > matrix; typedef iScalar<iMatrix<vComplex, Nc> > matrix;

View File

@ -32,7 +32,7 @@ namespace Grid {
friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);} friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);} friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);} friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vComplexD adj(const vComplexD &in){ return conj(in); } friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }
////////////////////////////////// //////////////////////////////////
// Initialise to 1,0,i // Initialise to 1,0,i
@ -166,11 +166,11 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could // all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example // add the vector width as a template param for BG/Q for example
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
/*
friend inline void permute(vComplexD &y,vComplexD b,int perm) friend inline void permute(vComplexD &y,vComplexD b,int perm)
{ {
Gpermute<vComplexD>(y,b,perm); Gpermute<vComplexD>(y,b,perm);
} }
/*
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted) friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
{ {
Gmerge<vComplexD,ComplexD >(y,extracted); Gmerge<vComplexD,ComplexD >(y,extracted);
@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
//////////////////////// ////////////////////////
// Conjugate // Conjugate
//////////////////////// ////////////////////////
friend inline vComplexD conj(const vComplexD &in){ friend inline vComplexD conjugate(const vComplexD &in){
vComplexD ret ; vzero(ret); vComplexD ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2) #if defined (AVX1)|| defined (AVX2)
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ... // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
// REDUCE FIXME must be a cleaner implementation // REDUCE FIXME must be a cleaner implementation
friend inline ComplexD Reduce(const vComplexD & in) friend inline ComplexD Reduce(const vComplexD & in)
{ {
#if defined SSE4 #ifdef SSE4
return ComplexD(in.v[0], in.v[1]); // inefficient return ComplexD(in.v[0],in.v[1]);
#endif
#if defined(AVX1) || defined (AVX2)
vComplexD v1;
permute(v1,in,0); // sse 128; paired complex single
v1=v1+in;
return ComplexD(v1.v[0],v1.v[1]);
#endif #endif
#if defined (AVX1) || defined(AVX2)
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
__attribute__ ((aligned(32))) double c_[4];
_mm256_store_pd(c_,in.v);
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
#endif
#ifdef AVX512 #ifdef AVX512
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
#endif #endif
#ifdef QPX #ifdef QPX
#endif #endif
@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
}; };
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }
typedef vComplexD vDComplex; typedef vComplexD vDComplex;

View File

@ -47,7 +47,7 @@ namespace Grid {
friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); } friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); } friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
friend inline vComplexF adj(const vComplexF &in){ return conj(in); } friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
////////////////////////////////// //////////////////////////////////
// Initialise to 1,0,i // Initialise to 1,0,i
@ -228,42 +228,25 @@ namespace Grid {
ret.v = {a,b,a,b}; ret.v = {a,b,a,b};
#endif #endif
} }
friend inline ComplexF Reduce(const vComplexF & in) friend inline void permute(vComplexF &y,vComplexF b,int perm)
{ {
Gpermute<vComplexF>(y,b,perm);
}
friend inline ComplexF Reduce(const vComplexF & in)
{
#ifdef SSE4 #ifdef SSE4
union { vComplexF v1;
cvec v1; // SSE 4 x float vector permute(v1,in,0); // sse 128; paired complex single
float f[4]; // scalar array of 4 floats v1=v1+in;
} u128; return ComplexF(v1.v[0],v1.v[1]);
u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
return ComplexF(u128.f[0], u128.f[1]);
#endif #endif
#ifdef AVX1 #if defined(AVX1) || defined (AVX2)
//it would be better passing 2 arguments to saturate the vector lanes vComplexF v1,v2;
union { permute(v1,in,0); // sse 128; paired complex single
__m256 v1; v1=v1+in;
float f[8]; permute(v2,v1,1); // avx 256; quad complex single
} u256; v1=v1+v2;
//SWAP lanes return ComplexF(v1.v[0],v1.v[1]);
// FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large
__m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1);
__m256 t1 = _mm256_permute_ps(in.v , 0b11011000);//real (0,2,1,3)
__m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2)
t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1)
t1 = _mm256_hadd_ps(t0,t0);
u256.v1 = _mm256_hadd_ps(t1, t1);
return ComplexF(u256.f[0], u256.f[4]);
#endif
#ifdef AVX2
union {
__m256 v1;
float f[8];
} u256;
const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0);
__m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask);
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
u256.v1 = _mm256_hadd_ps(tmp2, tmp2);
return ComplexF(u256.f[0], u256.f[4]);
#endif #endif
#ifdef AVX512 #ifdef AVX512
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
@ -345,13 +328,10 @@ namespace Grid {
// Conjugate // Conjugate
/////////////////////// ///////////////////////
friend inline vComplexF conj(const vComplexF &in){ friend inline vComplexF conjugate(const vComplexF &in){
vComplexF ret ; vzero(ret); vComplexF ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2) #if defined (AVX1)|| defined (AVX2)
cvec tmp; ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif #endif
#ifdef SSE4 #ifdef SSE4
ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f)); ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
@ -433,10 +413,6 @@ namespace Grid {
return *this; return *this;
} }
friend inline void permute(vComplexF &y,vComplexF b,int perm)
{
Gpermute<vComplexF>(y,b,perm);
}
/* /*
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted) friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
{ {
@ -460,7 +436,7 @@ namespace Grid {
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
{ {
return conj(l)*r; return conjugate(l)*r;
} }
inline void zeroit(vComplexF &z){ vzero(z);} inline void zeroit(vComplexF &z){ vzero(z);}

View File

@ -117,7 +117,7 @@ namespace Grid {
}; };
/////////////////////////////////////////////// ///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions // mult, sub, add, adj,conjugate, mac functions
/////////////////////////////////////////////// ///////////////////////////////////////////////
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);} friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);} friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}

View File

@ -26,7 +26,7 @@ namespace Grid {
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);} friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealD adj(const vRealD &in) { return in; } friend inline vRealD adj(const vRealD &in) { return in; }
friend inline vRealD conj(const vRealD &in){ return in; } friend inline vRealD conjugate(const vRealD &in){ return in; }
friend inline void mac (vRealD &y,const vRealD a,const vRealD x){ friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4) #if defined (AVX1) || defined (SSE4)
@ -112,11 +112,12 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could // all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example // add the vector width as a template param for BG/Q for example
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
/*
friend inline void permute(vRealD &y,vRealD b,int perm) friend inline void permute(vRealD &y,vRealD b,int perm)
{ {
Gpermute<vRealD>(y,b,perm); Gpermute<vRealD>(y,b,perm);
} }
/*
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted) friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
{ {
Gmerge<vRealD,RealD >(y,extracted); Gmerge<vRealD,RealD >(y,extracted);
@ -209,48 +210,26 @@ namespace Grid {
friend inline RealD Reduce(const vRealD & in) friend inline RealD Reduce(const vRealD & in)
{ {
#if defined (SSE4) #ifdef SSE4
// FIXME Hack vRealD v1;
const RealD * ptr =(const RealD *) &in; permute(v1,in,0); // sse 128; paired real double
RealD ret = 0; v1=v1+in;
for(int i=0;i<vRealD::Nsimd();i++){ return RealD(v1.v[0]);
ret = ret+ptr[i];
}
return ret;
#endif #endif
#if defined (AVX1) || defined(AVX2) #if defined(AVX1) || defined (AVX2)
typedef union { vRealD v1,v2;
uint64_t l; permute(v1,in,0); // avx 256; quad double
double d; v1=v1+in;
} my_conv_t; permute(v2,v1,1);
my_conv_t converter; v1=v1+v2;
// more reduce_add return v1.v[0];
/*
__attribute__ ((aligned(32))) double c_[16];
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp = 3+2,1+0,1+0,3+2
hadd = _mm256_hadd_pd(tmp,tmp); // tmp = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
_mm256_store_pd(c_,hadd);ô
return c[0]
*/
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
hadd = _mm256_hadd_pd(hadd,hadd); // hadd = 1+0+3+2...
converter.l = _mm256_extract_epi64((ivec)hadd,0);
return converter.d;
#endif #endif
#ifdef AVX512 #ifdef AVX512
return _mm512_reduce_add_pd(in.v); return _mm512_reduce_add_pd(in.v);
/*
__attribute__ ((aligned(32))) double c_[8];
_mm512_store_pd(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
*/
#endif #endif
#ifdef QPX #ifdef QPX
#endif #endif
} }
// *=,+=,-= operators // *=,+=,-= operators
inline vRealD &operator *=(const vRealD &r) { inline vRealD &operator *=(const vRealD &r) {
@ -270,7 +249,7 @@ namespace Grid {
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);} static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
}; };
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; } inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);} inline void zeroit(vRealD &z){ vzero(z);}
inline vRealD outerProduct(const vRealD &l, const vRealD& r) inline vRealD outerProduct(const vRealD &l, const vRealD& r)

View File

@ -92,13 +92,13 @@ namespace Grid {
}; };
/////////////////////////////////////////////// ///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions // mult, sub, add, adj,conjugate, mac functions
/////////////////////////////////////////////// ///////////////////////////////////////////////
friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);} friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);} friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);} friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealF adj(const vRealF &in) { return in; } friend inline vRealF adj(const vRealF &in) { return in; }
friend inline vRealF conj(const vRealF &in){ return in; } friend inline vRealF conjugate(const vRealF &in){ return in; }
friend inline void mac (vRealF &y,const vRealF a,const vRealF x){ friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
#if defined (AVX1) || defined (SSE4) #if defined (AVX1) || defined (SSE4)
@ -133,11 +133,12 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could // all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example // add the vector width as a template param for BG/Q for example
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
/*
friend inline void permute(vRealF &y,vRealF b,int perm) friend inline void permute(vRealF &y,vRealF b,int perm)
{ {
Gpermute<vRealF>(y,b,perm); Gpermute<vRealF>(y,b,perm);
} }
/*
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted) friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
{ {
Gmerge<vRealF,RealF >(y,extracted); Gmerge<vRealF,RealF >(y,extracted);
@ -155,7 +156,6 @@ namespace Grid {
Gextract<vRealF,RealF>(y,extracted); Gextract<vRealF,RealF>(y,extracted);
} }
*/ */
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Broadcast a value across Nsimd copies. // Broadcast a value across Nsimd copies.
@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){
} }
friend inline RealF Reduce(const vRealF & in) friend inline RealF Reduce(const vRealF & in)
{ {
#if defined (SSE4) #ifdef SSE4
// FIXME Hack vRealF v1,v2;
const RealF * ptr = (const RealF *) &in; permute(v1,in,0); // sse 128; quad single
RealF ret = 0; v1=v1+in;
for(int i=0;i<vRealF::Nsimd();i++){ permute(v2,v1,1);
ret = ret+ptr[i]; v1=v1+v2;
} return v1.v[0];
return ret;
#endif #endif
#if defined (AVX1) || defined(AVX2) #if defined(AVX1) || defined (AVX2)
__attribute__ ((aligned(32))) float c_[16]; vRealF v1,v2;
__m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01); permute(v1,in,0); // avx 256; octo-double
__m256 hadd = _mm256_hadd_ps(in.v,tmp); v1=v1+in;
tmp = _mm256_permute2f128_ps(hadd,hadd,0x01); permute(v2,v1,1);
hadd = _mm256_hadd_ps(tmp,tmp); v1=v1+v2;
_mm256_store_ps(c_,hadd); permute(v2,v1,2);
return (float)c_[0]; v1=v1+v2;
return v1.v[0];
#endif #endif
#ifdef AVX512 #ifdef AVX512
return _mm512_reduce_add_ps(in.v); return _mm512_reduce_add_ps(in.v);
/*
__attribute__ ((aligned(64))) float c_[16];
_mm512_store_ps(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
+c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
*/
#endif #endif
#ifdef QPX #ifdef QPX
#endif #endif
@ -291,7 +284,7 @@ friend inline void vstore(const vRealF &ret, float *a){
public: public:
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
}; };
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; } inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);} inline void zeroit(vRealF &z){ vzero(z);}
inline vRealF outerProduct(const vRealF &l, const vRealF& r) inline vRealF outerProduct(const vRealF &l, const vRealF& r)

View File

@ -2,7 +2,7 @@
/*! @file Grid_vector_types.h /*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types @brief Defines templated class Grid_simd to deal with inner vector types
*/ */
// Time-stamp: <2015-05-20 17:21:52 neo> // Time-stamp: <2015-05-20 17:31:55 neo>
//--------------------------------------------------------------------------- //---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES #ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES
@ -96,8 +96,9 @@ namespace Grid {
friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); } friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); } friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); } friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
//not for integer types... FIXME //not for integer types... FIXME
friend inline Grid_simd adj(const Grid_simd &in){ return conj(in); } friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types // Initialise to 1,0,i for the correct types
@ -247,13 +248,13 @@ namespace Grid {
// Conjugate // Conjugate
/////////////////////// ///////////////////////
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 > template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd conj(const Grid_simd &in){ friend inline Grid_simd conjugate(const Grid_simd &in){
Grid_simd ret ; Grid_simd ret ;
ret.v = unary<Vector_type>(in.v, ConjSIMD()); ret.v = unary<Vector_type>(in.v, ConjSIMD());
return ret; return ret;
} }
template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 > template < class S = Scalar_type, NotEnableIf<is_complex < S >, int> = 0 >
friend inline Grid_simd conj(const Grid_simd &in){ friend inline Grid_simd conjugate(const Grid_simd &in){
return in; // for real objects return in; // for real objects
} }
@ -345,7 +346,7 @@ namespace Grid {
template<class scalar_type, class vector_type > template<class scalar_type, class vector_type >
inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r) inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r)
{ {
return conj(l)*r; return conjugate(l)*r;
} }
template<class scalar_type, class vector_type > template<class scalar_type, class vector_type >

9
scripts/bench_wilson.sh Executable file
View File

@ -0,0 +1,9 @@
for omp in 1 2 4
do
echo > wilson.t$omp
for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16
do
perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'`
echo $vol $perf >> wilson.t$omp
done
done

View File

@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash -e
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi" DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512" EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
BLACK="\033[30m" BLACK="\033[30m"
RED="\033[31m" RED="\033[31m"

View File

@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx" DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"
for D in $DIRS for D in $DIRS
do do

View File

@ -34,6 +34,9 @@ icpc-avx512)
icpc-mic) icpc-mic)
CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
clang-sse)
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx) clang-avx)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
@ -47,16 +50,16 @@ clang-avx2-openmp)
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
clang-avx-openmp-mpi) clang-avx-openmp-mpi)
CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=mpi CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;; ;;
clang-avx2-openmp-mpi) clang-avx2-openmp-mpi)
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=mpi CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;; ;;
clang-avx-mpi) clang-avx-mpi)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx " LIBS="-lgmp -lmpfr" --enable-comms=mpi CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
;; ;;
clang-avx2-mpi) clang-avx2-mpi)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx " LIBS="-lgmp -lmpfr" --enable-comms=mpi CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
;; ;;
esac esac
echo -e $NORMAL echo -e $NORMAL

7
scripts/wilson.gnu Normal file
View File

@ -0,0 +1,7 @@
plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
set terminal 'pdf'
set output 'wilson_clang.pdf'
replot
quit

View File

@ -144,7 +144,7 @@ int main (int argc, char ** argv)
// -=,+=,*=,() // -=,+=,*=,()
// add,+,sub,-,mult,mac,* // add,+,sub,-,mult,mac,*
// adj,conj // adj,conjugate
// real,imag // real,imag
// transpose,transposeIndex // transpose,transposeIndex
// trace,traceIndex // trace,traceIndex
@ -484,7 +484,7 @@ int main (int argc, char ** argv)
for(int r=0;r<3;r++){ for(int r=0;r<3;r++){
for(int c=0;c<3;c++){ for(int c=0;c<3;c++){
diff =shifted1()()(r,c)-shifted2()()(r,c); diff =shifted1()()(r,c)-shifted2()()(r,c);
nn=real(conj(diff)*diff); nn=real(conjugate(diff)*diff);
if ( nn > 0 ) if ( nn > 0 )
cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" " cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
<<shifted1()()(r,c)<<" "<<shifted2()()(r,c) <<shifted1()()(r,c)<<" "<<shifted2()()(r,c)
@ -498,7 +498,7 @@ int main (int argc, char ** argv)
for(int r=0;r<3;r++){ for(int r=0;r<3;r++){
for(int c=0;c<3;c++){ for(int c=0;c<3;c++){
diff =shifted3()()(r,c)-shifted2()()(r,c); diff =shifted3()()(r,c)-shifted2()()(r,c);
nn=real(conj(diff)*diff); nn=real(conjugate(diff)*diff);
if ( nn > 0 ) if ( nn > 0 )
cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" " cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
<<shifted3()()(r,c)<<" "<<shifted2()()(r,c) <<shifted3()()(r,c)<<" "<<shifted2()()(r,c)
@ -515,7 +515,7 @@ int main (int argc, char ** argv)
for(int r=0;r<Nc;r++){ for(int r=0;r<Nc;r++){
for(int c=0;c<Nc;c++){ for(int c=0;c<Nc;c++){
diff =foobar2()()(r,c)-foobar1()()(r,c); diff =foobar2()()(r,c)-foobar1()()(r,c);
nrm = nrm + real(conj(diff)*diff); nrm = nrm + real(conjugate(diff)*diff);
}} }}
}}}} }}}}
if( Fine.IsBoss() ){ if( Fine.IsBoss() ){

52
tests/Grid_rng_fixed.cc Normal file
View File

@ -0,0 +1,52 @@
#include <Grid.h>
#include <parallelIO/GridNerscIO.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({1,2,3,4});
GridSerialRNG fsRNG; fsRNG.SeedFixedIntegers(seeds);
GridParallelRNG fpRNG(&Grid); fpRNG.SeedFixedIntegers(seeds);
vComplexF tmp; random(fsRNG,tmp);
std::cout<<"Random vComplexF (fixed seed)\n"<< tmp<<std::endl;
std::cout<<"conjugate(tmp)\n"<< conjugate(tmp)<<std::endl;
std::cout<<"conjugate(tmp)*tmp\n"<< conjugate(tmp)*tmp<<std::endl;
std::cout<<"innerProduct"<< innerProduct(tmp,tmp)<<std::endl;
std::cout<<"Reduce(innerProduct)"<< Reduce(innerProduct(tmp,tmp))<<std::endl;
SpinMatrix rnd ;
random(fsRNG,rnd);
std::cout<<"Random Spin Matrix (fixed seed)\n"<< rnd<<std::endl;
SpinVector rv;
random(fsRNG,rv);
std::cout<<"Random Spin Vector (fixed seed)\n"<< rv<<std::endl;
gaussian(fsRNG,rv);
std::cout<<"Gaussian Spin Vector (fixed seed)\n"<< rv<<std::endl;
LatticeColourVector lcv(&Grid);
LatticeFermion src(&Grid); random(fpRNG,src);
std::cout << "src norm : " << norm2(src)<<std::endl;
std::cout << "src " << src<<std::endl;
random(fpRNG,lcv);
std::cout<<"Random Lattice Colour Vector (fixed seed)\n"<< lcv<<std::endl;
Grid_finalize();
}

View File

@ -26,7 +26,7 @@ public:
class funcConj { class funcConj {
public: public:
funcConj() {}; funcConj() {};
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);} template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);}
std::string name(void) const { return std::string("Conj"); } std::string name(void) const { return std::string("Conj"); }
}; };
class funcAdj { class funcAdj {
@ -49,6 +49,34 @@ public:
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);} template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);}
std::string name(void) const { return std::string("timesMinusI"); } std::string name(void) const { return std::string("timesMinusI"); }
}; };
class funcInnerProduct {
public:
funcInnerProduct() {};
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = innerProduct(i1,i2);}
std::string name(void) const { return std::string("innerProduct"); }
};
// FIXME still to test:
//
// innerProduct,
// norm2,
// Reduce,
//
// mac,mult,sub,add, vone,vzero,vcomplex_i, =zero,
// vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar
// unary -,
// *= , -=, +=
// outerproduct,
// zeroit
// permute
class funcReduce {
public:
funcReduce() {};
template<class reduce,class vec> void vfunc(reduce &rr,vec &i1,vec &i2) const { rr = Reduce(i1);}
template<class reduce,class scal> void sfunc(reduce &rr,scal &i1,scal &i2) const { rr = i1;}
std::string name(void) const { return std::string("Reduce"); }
};
template<class scal, class vec,class functor > template<class scal, class vec,class functor >
void Tester(const functor &func) void Tester(const functor &func)
@ -96,8 +124,59 @@ void Tester(const functor &func)
ok++; ok++;
} }
} }
if ( ok==0 ) std::cout << " OK!" <<std::endl; if ( ok==0 ) {
std::cout << " OK!" <<std::endl;
}
assert(ok==0);
}
template<class reduced,class scal, class vec,class functor >
void ReductionTester(const functor &func)
{
GridSerialRNG sRNG;
sRNG.SeedRandomDevice();
int Nsimd = vec::Nsimd();
std::vector<scal> input1(Nsimd);
std::vector<scal> input2(Nsimd);
reduced result(0);
reduced reference(0);
reduced tmp;
std::vector<vec,alignedAllocator<vec> > buf(3);
vec & v_input1 = buf[0];
vec & v_input2 = buf[1];
for(int i=0;i<Nsimd;i++){
random(sRNG,input1[i]);
random(sRNG,input2[i]);
}
merge<vec,scal>(v_input1,input1);
merge<vec,scal>(v_input2,input2);
func.template vfunc<reduced,vec>(result,v_input1,v_input2);
for(int i=0;i<Nsimd;i++) {
func.template sfunc<reduced,scal>(tmp,input1[i],input2[i]);
reference+=tmp;
}
std::cout << " " << func.name()<<std::endl;
int ok=0;
if ( abs(reference-result)/abs(reference) > 1.0e-6 ){ // rounding is possible for reduce order
std::cout<< "*****" << std::endl;
std::cout<< abs(reference-result) << " " <<reference<< " " << result<<std::endl;
ok++;
}
if ( ok==0 ) {
std::cout << " OK!" <<std::endl;
}
assert(ok==0);
} }
@ -127,6 +206,8 @@ int main (int argc, char ** argv)
Tester<ComplexF,vComplexF>(funcTimes()); Tester<ComplexF,vComplexF>(funcTimes());
Tester<ComplexF,vComplexF>(funcConj()); Tester<ComplexF,vComplexF>(funcConj());
Tester<ComplexF,vComplexF>(funcAdj()); Tester<ComplexF,vComplexF>(funcAdj());
Tester<ComplexF,vComplexF>(funcInnerProduct());
ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());
std::cout << "==================================="<< std::endl; std::cout << "==================================="<< std::endl;
std::cout << "Testing vComplexD "<<std::endl; std::cout << "Testing vComplexD "<<std::endl;
@ -140,6 +221,8 @@ int main (int argc, char ** argv)
Tester<ComplexD,vComplexD>(funcTimes()); Tester<ComplexD,vComplexD>(funcTimes());
Tester<ComplexD,vComplexD>(funcConj()); Tester<ComplexD,vComplexD>(funcConj());
Tester<ComplexD,vComplexD>(funcAdj()); Tester<ComplexD,vComplexD>(funcAdj());
Tester<ComplexD,vComplexD>(funcInnerProduct());
ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
std::cout << "==================================="<< std::endl; std::cout << "==================================="<< std::endl;
std::cout << "Testing vRealF "<<std::endl; std::cout << "Testing vRealF "<<std::endl;
@ -150,6 +233,9 @@ int main (int argc, char ** argv)
Tester<RealF,vRealF>(funcMinus()); Tester<RealF,vRealF>(funcMinus());
Tester<RealF,vRealF>(funcTimes()); Tester<RealF,vRealF>(funcTimes());
Tester<RealF,vRealF>(funcAdj()); Tester<RealF,vRealF>(funcAdj());
Tester<RealF,vRealF>(funcConj());
Tester<RealF,vRealF>(funcInnerProduct());
ReductionTester<RealF,RealF,vRealF>(funcReduce());
std::cout << "==================================="<< std::endl; std::cout << "==================================="<< std::endl;
std::cout << "Testing vRealD "<<std::endl; std::cout << "Testing vRealD "<<std::endl;
@ -159,6 +245,9 @@ int main (int argc, char ** argv)
Tester<RealD,vRealD>(funcMinus()); Tester<RealD,vRealD>(funcMinus());
Tester<RealD,vRealD>(funcTimes()); Tester<RealD,vRealD>(funcTimes());
Tester<RealD,vRealD>(funcAdj()); Tester<RealD,vRealD>(funcAdj());
Tester<RealD,vRealD>(funcConj());
Tester<RealD,vRealD>(funcInnerProduct());
ReductionTester<RealD,RealD,vRealD>(funcReduce());
Grid_finalize(); Grid_finalize();
} }

View File

@ -27,7 +27,7 @@ public:
class funcConj { class funcConj {
public: public:
funcConj() {}; funcConj() {};
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);} template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);}
std::string name(void) const { return std::string("Conj"); } std::string name(void) const { return std::string("Conj"); }
}; };
class funcAdj { class funcAdj {

View File

@ -93,7 +93,7 @@ int main (int argc, char ** argv)
for(int r=0;r<3;r++){ for(int r=0;r<3;r++){
for(int c=0;c<3;c++){ for(int c=0;c<3;c++){
diff =check()()(r,c)-bar()()(r,c); diff =check()()(r,c)-bar()()(r,c);
double nn=real(conj(diff)*diff); double nn=real(conjugate(diff)*diff);
if ( nn > 0){ if ( nn > 0){
printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n", printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
coor[0],coor[1],coor[2],coor[3],r,c, coor[0],coor[1],coor[2],coor[3],r,c,
@ -103,8 +103,8 @@ int main (int argc, char ** argv)
real(bar()()(r,c)) real(bar()()(r,c))
); );
} }
snrmC=snrmC+real(conj(check()()(r,c))*check()()(r,c)); snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
snrmB=snrmB+real(conj(bar()()(r,c))*bar()()(r,c)); snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
snrm=snrm+nn; snrm=snrm+nn;
}} }}

View File

@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
# #
# Test code # Test code
# #
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_simd_new bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_simd_new
Grid_main_SOURCES = Grid_main.cc Grid_main_SOURCES = Grid_main.cc
Grid_main_LDADD = -lGrid Grid_main_LDADD = -lGrid
@ -13,6 +13,9 @@ Grid_main_LDADD = -lGrid
Grid_rng_SOURCES = Grid_rng.cc Grid_rng_SOURCES = Grid_rng.cc
Grid_rng_LDADD = -lGrid Grid_rng_LDADD = -lGrid
Grid_rng_fixed_SOURCES = Grid_rng_fixed.cc
Grid_rng_fixed_LDADD = -lGrid
Grid_remez_SOURCES = Grid_remez.cc Grid_remez_SOURCES = Grid_remez.cc
Grid_remez_LDADD = -lGrid Grid_remez_LDADD = -lGrid

View File

@ -186,7 +186,7 @@ int main (int argc, char ** argv)
for(int r=0;r<3;r++){ for(int r=0;r<3;r++){
for(int c=0;c<3;c++){ for(int c=0;c<3;c++){
diff =check._internal._internal[r][c]-bar._internal._internal[r][c]; diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
double nn=real(conj(diff)*diff); double nn=real(conjugate(diff)*diff);
if ( nn > 0 ){ if ( nn > 0 ){
printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n", printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
coor[0],coor[1],coor[2],coor[3],r,c, coor[0],coor[1],coor[2],coor[3],r,c,