1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Merge branch 'feature/normHP' into develop

This commit is contained in:
paboyle 2017-04-18 10:57:21 +01:00
commit de41b84c5c
13 changed files with 671 additions and 433 deletions

17
TODO
View File

@ -3,19 +3,20 @@ TODO:
Peter's work list:
-- Merge high precision reduction into develop
-- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
-- Merge high precision reduction into develop <-- done
-- Precision conversion and sort out localConvert <--
-- Physical propagator interface
-- Precision conversion and sort out localConvert
-- slice* linalg routines for multiRHS, BlockCG
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
-- slice* linalg routines for multiRHS, BlockCG <-- started
-- Profile CG, BlockCG, etc... Flop count/rate
-- Binary I/O speed up & x-strips
-- Half-precision comms
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
-- Half-precision comms <-- started
-- GaugeFix into central location
-- Help Julia with NPR code
-- Switch to measurements
-- FFTfix in sensible place
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
-- Remove DenseVector, DenseMatrix; Use Eigen instead.
-- quaternions -- Might not need

View File

@ -30,210 +30,9 @@ directory
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
#include <Grid/Eigen/Dense>
namespace Grid {
GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// FIXME: Implementation is slow
// If we based this on Cshift it would work for spread out
// but it would be even slower
//
// Repeated extract slice is inefficient
//
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
for(int j=0;j<Nblock;j++){
ExtractSlice(Xslice,X,j,Orthog);
Rslice = Rslice + Xslice*(scale*aa(j,i));
}
InsertSlice(Rslice,R,i,Orthog);
}
};
template<class vobj>
static void sliceMaddVector (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int Orthog,RealD scale=1.0)
{
// FIXME: Implementation is slow
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// If we based this on Cshift it would work for spread out
// but it would be even slower
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
ExtractSlice(Xslice,X,i,Orthog);
Rslice = Rslice + Xslice*(scale*a[i]);
InsertSlice(Rslice,R,i,Orthog);
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Not sure of best solution.. think about it
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
Lattice<vobj> Lslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
for(int i=0;i<Nblock;i++){
ExtractSlice(Lslice,lhs,i,Orthog);
for(int j=0;j<Nblock;j++){
ExtractSlice(Rslice,rhs,j,Orthog);
mat(i,j) = innerProduct(Lslice,Rslice);
}
}
#undef FORCE_DIAG
#ifdef FORCE_DIAG
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
if ( i != j ) mat(i,j)=0.0;
}
}
#endif
return;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Look at localInnerProduct implementation,
// and do inside a site loop with block strided iterators
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced scalar;
typedef typename scalar::scalar_object scomplex;
int Nblock = lhs._grid->GlobalDimensions()[Orthog];
vec.resize(Nblock);
std::vector<scomplex> sip(Nblock);
Lattice<scalar> IP(lhs._grid);
IP=localInnerProduct(lhs,rhs);
sliceSum(IP,sip,Orthog);
for(int ss=0;ss<Nblock;ss++){
vec[ss] = TensorRemove(sip[ss]);
}
}
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) {
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock);
sliceInnerProductVector(ip,rhs,rhs,Orthog);
for(int ss=0;ss<Nblock;ss++){
sn[ss] = real(ip[ss]);
}
};
/*
template<class vobj>
static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced scalar;
typedef typename scalar::scalar_object scomplex;
int Nblock = lhs._grid->GlobalDimensions()[Orthog];
std::cout << " sliceInnerProductMatrix Dim "<<Orthog<<" Nblock " << Nblock<<std::endl;
Lattice<scalar> IP(lhs._grid);
std::vector<scomplex> sip(Nblock);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Lattice<vobj> tmp = rhs;
for(int s1=0;s1<Nblock;s1++){
IP=localInnerProduct(lhs,tmp);
sliceSum(IP,sip,Orthog);
std::cout << "InnerProductMatrix ["<<s1<<"] = ";
for(int ss=0;ss<Nblock;ss++){
std::cout << TensorRemove(sip[ss])<<" ";
}
std::cout << std::endl;
for(int ss=0;ss<Nblock;ss++){
mat(ss,(s1+ss)%Nblock) = TensorRemove(sip[ss]);
}
if ( s1!=(Nblock-1) ) {
tmp = Cshift(tmp,Orthog,1);
}
}
}
*/
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
@ -261,8 +60,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = 0; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<std::endl;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Nblock "<<Nblock<<std::endl;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
@ -271,10 +70,6 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
Field AP(Src);
Field R(Src);
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
@ -317,33 +112,49 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
P = R;
sliceInnerProductMatrix(m_rr,R,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
m_pAp_inv = m_pAp.inverse();
m_alpha = m_pAp_inv * m_rr ;
// Psi, R update
sliceMaddTimer.Start();
sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
m_rr_inv = m_rr.inverse();
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_rr,R,R,Orthog);
sliceInnerTimer.Stop();
m_beta = m_rr_inv *m_rr;
// Search update
sliceMaddTimer.Start();
sliceMaddMatrix(AP,m_beta,P,R,Orthog);
sliceMaddTimer.Stop();
P= AP;
/*********************
@ -358,16 +169,24 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
if ( max_resid < Tolerance*Tolerance ) {
std::cout << GridLogMessage<<" Block solver has converged in "
<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << " Block solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
@ -408,8 +227,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = 0; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" MultiRHS Conjugate Gradient : Orthog "<<Orthog<<std::endl;
std::cout<<GridLogMessage<<" MultiRHS Conjugate Gradient : Nblock "<<Nblock<<std::endl;
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
@ -445,38 +264,57 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
P = R;
sliceNorm(v_rr,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch sliceNormTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
// sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
sliceInnerTimer.Start();
sliceInnerProductVector(v_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
for(int b=0;b<Nblock;b++){
// std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
}
// Psi, R update
sliceMaddTimer.Start();
sliceMaddVector(Psi,v_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddVector(R ,v_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
for(int b=0;b<Nblock;b++){
v_rr_inv[b] = 1.0/v_rr[b];
}
sliceNormTimer.Start();
sliceNorm(v_rr,R,Orthog);
sliceNormTimer.Stop();
for(int b=0;b<Nblock;b++){
v_beta[b] = v_rr_inv[b] *v_rr[b];
}
// Search update
sliceMaddTimer.Start();
sliceMaddVector(P,v_beta,P,R,Orthog);
sliceMaddTimer.Stop();
/*********************
* convergence monitor
@ -489,15 +327,27 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
}
if ( max_resid < Tolerance*Tolerance ) {
std::cout << GridLogMessage<<" MultiRHS solver has converged in "
<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << " MultiRHS solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << sliceNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}

View File

@ -78,18 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: p " << a << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction<Field> {
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage
<< "ConjugateGradient: Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual << " target "
<< Tolerance << std::endl;
std::cout << GridLogMessage << "Time elapsed: Iterations "
<< SolverTimer.Elapsed() << " Matrix "
<< MatrixTimer.Elapsed() << " Linalg "
<< LinalgTimer.Elapsed();
std::cout << std::endl;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
return;
}
}

View File

@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Eigen/Dense>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
@ -38,120 +40,123 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm);
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm);
}
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
scalar_type nrm;
GridBase *grid = left._grid;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
}
vector_type vvnrm; vvnrm=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
scalar_type nrm;
GridBase *grid = left._grid;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
}
vector_type vvnrm; vvnrm=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
// may be important for correlation functions
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
assert(grid!=NULL);
// FIXME
// std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
@ -163,23 +168,31 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
std::vector<int> coor(Nd);
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){
for(int ss=0;ss<grid->oSites();ss++){
Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
int r = coor[orthogdim];
lvSum[r]=lvSum[r]+Data._odata[ss];
}
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss];
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
@ -214,10 +227,304 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
result[t]=gsum;
}
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
lvSum[r]=lvSum[r]+vv;
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// sum over nodes.
scalar_type gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=scalar_type(0.0);
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock);
sliceInnerProductVector(ip,rhs,rhs,Orthog);
for(int ss=0;ss<Nblock;ss++){
sn[ss] = real(ip[ss]);
}
};
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int orthogdim,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced tensor_reduced;
GridBase *grid = X._grid;
int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim];
int fd =grid->_fdimensions[orthogdim];
int ld =grid->_ldimensions[orthogdim];
int rd =grid->_rdimensions[orthogdim];
int e1 =grid->_slice_nblock[orthogdim];
int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim];
std::vector<int> icoor;
for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
vector_type av;
for(int l=0;l<Nsimd;l++){
grid->iCoorFromIindex(icoor,l);
int ldx =r+icoor[orthogdim]*rd;
scalar_type *as =(scalar_type *)&av;
as[l] = scalar_type(a[ldx])*scale;
}
tensor_reduced at; at=av;
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
}
}
}
};
/*
template<class vobj>
static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int Orthog,RealD scale=1.0)
{
// FIXME: Implementation is slow
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// If we based this on Cshift it would work for spread out
// but it would be even slower
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
ExtractSlice(Xslice,X,i,Orthog);
Rslice = Rslice + Xslice*(scale*a[i]);
InsertSlice(Rslice,R,i,Orthog);
}
};
template<class vobj>
static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Look at localInnerProduct implementation,
// and do inside a site loop with block strided iterators
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced scalar;
typedef typename scalar::scalar_object scomplex;
int Nblock = lhs._grid->GlobalDimensions()[Orthog];
vec.resize(Nblock);
std::vector<scomplex> sip(Nblock);
Lattice<scalar> IP(lhs._grid);
IP=localInnerProduct(lhs,rhs);
sliceSum(IP,sip,Orthog);
for(int ss=0;ss<Nblock;ss++){
vec[ss] = TensorRemove(sip[ss]);
}
}
*/
//////////////////////////////////////////////////////////////////////////////////////////
// FIXME: Implementation is slow
// If we based this on Cshift it would work for spread out
// but it would be even slower
//
// Repeated extract slice is inefficient
//
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
//////////////////////////////////////////////////////////////////////////////////////////
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
for(int j=0;j<Nblock;j++){
ExtractSlice(Xslice,X,j,Orthog);
Rslice = Rslice + Xslice*(scale*aa(j,i));
}
InsertSlice(Rslice,R,i,Orthog);
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Not sure of best solution.. think about it
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
Lattice<vobj> Lslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
for(int i=0;i<Nblock;i++){
ExtractSlice(Lslice,lhs,i,Orthog);
for(int j=0;j<Nblock;j++){
ExtractSlice(Rslice,rhs,j,Orthog);
mat(i,j) = innerProduct(Lslice,Rslice);
}
}
#undef FORCE_DIAG
#ifdef FORCE_DIAG
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
if ( i != j ) mat(i,j)=0.0;
}
}
#endif
return;
}
} /*END NAMESPACE GRID*/
#endif

View File

@ -110,8 +110,8 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : ";
stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : ";
stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
if ( log.timestamp ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_vector_types.h
Source file: ./lib/simd/Grid_vector_type.h
Copyright (C) 2015
@ -411,7 +411,6 @@ template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
nrot = nrot % Grid_simd<S, V>::Nsimd();
Grid_simd<S, V> ret;
// std::cout << "Rotate Real by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v, nrot);
return ret;
}
@ -419,7 +418,6 @@ template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
nrot = nrot % Grid_simd<S, V>::Nsimd();
Grid_simd<S, V> ret;
// std::cout << "Rotate Complex by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
return ret;
}
@ -427,14 +425,12 @@ template <class S, class V, IfNotComplex<S> =0>
inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
// std::cout << "Rotate Real by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v,nrot);
}
template <class S, class V, IfComplex<S> =0>
inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
// std::cout << "Rotate Complex by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
}
@ -694,7 +690,6 @@ inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
const Grid_simd<S, V> &r) {
return conjugate(l) * r;
}
template <class S, class V>
inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
const Grid_simd<S, V> &r) {

View File

@ -56,11 +56,11 @@ class iScalar {
typedef vtype element;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
typedef iScalar<tensor_reduced_v> tensor_reduced;
typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
typedef iScalar<tensor_reduced_v> tensor_reduced;
typedef iScalar<recurse_scalar_object> scalar_object;
// substitutes a real or complex version with same tensor structure
typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified;
typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified;
@ -77,8 +77,12 @@ class iScalar {
iScalar<vtype> & operator= (const iScalar<vtype> &copyme) = default;
iScalar<vtype> & operator= (iScalar<vtype> &&copyme) = default;
*/
iScalar(scalar_type s)
: _internal(s){}; // recurse down and hit the constructor for vector_type
// template<int N=0>
// iScalar(EnableIf<isSIMDvectorized<vector_type>, vector_type> s) : _internal(s){}; // recurse down and hit the constructor for vector_type
iScalar(scalar_type s) : _internal(s){}; // recurse down and hit the constructor for vector_type
iScalar(const Zero &z) { *this = zero; };
iScalar<vtype> &operator=(const Zero &hero) {
@ -134,33 +138,28 @@ class iScalar {
strong_inline const vtype &operator()(void) const { return _internal; }
// Type casts meta programmed, must be pure scalar to match TensorRemove
template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
IfNotSimd<U> = 0>
template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0>
operator ComplexF() const {
return (TensorRemove(_internal));
};
template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
IfNotSimd<U> = 0>
template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0>
operator ComplexD() const {
return (TensorRemove(_internal));
};
// template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> =
// 0> operator RealD () const { return(real(TensorRemove(_internal))); }
template <class U = vtype, class V = scalar_type, IfReal<V> = 0,
IfNotSimd<U> = 0>
template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0>
operator RealD() const {
return TensorRemove(_internal);
}
template <class U = vtype, class V = scalar_type, IfInteger<V> = 0,
IfNotSimd<U> = 0>
template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0>
operator Integer() const {
return Integer(TensorRemove(_internal));
}
// convert from a something to a scalar via constructor of something arg
template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
* = nullptr>
strong_inline iScalar<vtype> operator=(T arg) {
template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
strong_inline iScalar<vtype> operator=(T arg) {
_internal = arg;
return *this;
}
@ -193,6 +192,7 @@ class iVector {
typedef vtype element;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
typedef iScalar<tensor_reduced_v> tensor_reduced;
@ -305,6 +305,7 @@ class iMatrix {
typedef vtype element;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;

View File

@ -29,51 +29,109 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_MATH_INNER_H
#define GRID_MATH_INNER_H
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////
// innerProduct Scalar x Scalar -> Scalar
// innerProduct Vector x Vector -> Scalar
// innerProduct Matrix x Matrix -> Scalar
///////////////////////////////////////////////////////////////////////////////////////
template<class sobj> inline RealD norm2(const sobj &arg){
typedef typename sobj::scalar_type scalar;
decltype(innerProduct(arg,arg)) nrm;
nrm = innerProduct(arg,arg);
RealD ret = real(nrm);
return ret;
}
///////////////////////////////////////////////////////////////////////////////////////
// innerProduct Scalar x Scalar -> Scalar
// innerProduct Vector x Vector -> Scalar
// innerProduct Matrix x Matrix -> Scalar
///////////////////////////////////////////////////////////////////////////////////////
template<class sobj> inline RealD norm2(const sobj &arg){
auto nrm = innerProductD(arg,arg);
RealD ret = real(nrm);
return ret;
}
//////////////////////////////////////
// If single promote to double and sum 2x
//////////////////////////////////////
template<class l,class r,int N> inline
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
ret=zero;
for(int c1=0;c1<N;c1++){
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
inline RealD innerProductD(const RealD &l,const RealD &r){ return innerProduct(l,r); }
inline RealD innerProductD(const RealF &l,const RealF &r){ return innerProduct(l,r); }
inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
inline vRealD innerProductD(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r){
vComplexD la,lb;
vComplexD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
return innerProduct(la,ra) + innerProduct(lb,rb);
}
inline vRealD innerProductD(const vRealF &l,const vRealF &r){
vRealD la,lb;
vRealD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
return innerProduct(la,ra) + innerProduct(lb,rb);
}
template<class l,class r,int N> inline
auto innerProductD (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
ret=zero;
for(int c1=0;c1<N;c1++){
ret._internal += innerProductD(lhs._internal[c1],rhs._internal[c1]);
}
template<class l,class r,int N> inline
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret;
iScalar<ret_t> tmp;
ret=zero;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> inline
auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
{
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = innerProduct(lhs._internal,rhs._internal);
return ret;
return ret;
}
template<class l,class r,int N> inline
auto innerProductD (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret;
iScalar<ret_t> tmp;
ret=zero;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=innerProductD(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> inline
auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProductD(lhs._internal,rhs._internal))>
{
typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = innerProductD(lhs._internal,rhs._internal);
return ret;
}
//////////////////////
// Keep same precison
//////////////////////
template<class l,class r,int N> inline
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
ret=zero;
for(int c1=0;c1<N;c1++){
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
}
template<class l,class r,int N> inline
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret;
iScalar<ret_t> tmp;
ret=zero;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> inline
auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
{
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = innerProduct(lhs._internal,rhs._internal);
return ret;
}
}
#endif

View File

@ -53,6 +53,7 @@ namespace Grid {
public:
typedef typename T::scalar_type scalar_type;
typedef typename T::vector_type vector_type;
typedef typename T::vector_typeD vector_typeD;
typedef typename T::tensor_reduced tensor_reduced;
typedef typename T::scalar_object scalar_object;
typedef typename T::Complexified Complexified;
@ -67,6 +68,7 @@ namespace Grid {
public:
typedef RealF scalar_type;
typedef RealF vector_type;
typedef RealD vector_typeD;
typedef RealF tensor_reduced ;
typedef RealF scalar_object;
typedef ComplexF Complexified;
@ -77,6 +79,7 @@ namespace Grid {
public:
typedef RealD scalar_type;
typedef RealD vector_type;
typedef RealD vector_typeD;
typedef RealD tensor_reduced;
typedef RealD scalar_object;
typedef ComplexD Complexified;
@ -87,6 +90,7 @@ namespace Grid {
public:
typedef ComplexF scalar_type;
typedef ComplexF vector_type;
typedef ComplexD vector_typeD;
typedef ComplexF tensor_reduced;
typedef ComplexF scalar_object;
typedef ComplexF Complexified;
@ -97,6 +101,7 @@ namespace Grid {
public:
typedef ComplexD scalar_type;
typedef ComplexD vector_type;
typedef ComplexD vector_typeD;
typedef ComplexD tensor_reduced;
typedef ComplexD scalar_object;
typedef ComplexD Complexified;
@ -107,6 +112,7 @@ namespace Grid {
public:
typedef Integer scalar_type;
typedef Integer vector_type;
typedef Integer vector_typeD;
typedef Integer tensor_reduced;
typedef Integer scalar_object;
typedef void Complexified;
@ -118,6 +124,7 @@ namespace Grid {
public:
typedef RealF scalar_type;
typedef vRealF vector_type;
typedef vRealD vector_typeD;
typedef vRealF tensor_reduced;
typedef RealF scalar_object;
typedef vComplexF Complexified;
@ -128,6 +135,7 @@ namespace Grid {
public:
typedef RealD scalar_type;
typedef vRealD vector_type;
typedef vRealD vector_typeD;
typedef vRealD tensor_reduced;
typedef RealD scalar_object;
typedef vComplexD Complexified;
@ -138,6 +146,7 @@ namespace Grid {
public:
typedef ComplexF scalar_type;
typedef vComplexF vector_type;
typedef vComplexD vector_typeD;
typedef vComplexF tensor_reduced;
typedef ComplexF scalar_object;
typedef vComplexF Complexified;
@ -148,6 +157,7 @@ namespace Grid {
public:
typedef ComplexD scalar_type;
typedef vComplexD vector_type;
typedef vComplexD vector_typeD;
typedef vComplexD tensor_reduced;
typedef ComplexD scalar_object;
typedef vComplexD Complexified;
@ -158,6 +168,7 @@ namespace Grid {
public:
typedef Integer scalar_type;
typedef vInteger vector_type;
typedef vInteger vector_typeD;
typedef vInteger tensor_reduced;
typedef Integer scalar_object;
typedef void Complexified;
@ -241,7 +252,8 @@ namespace Grid {
template<typename T>
class isSIMDvectorized{
template<typename U>
static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type, typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,
typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
template<typename U>
static double test(...);

View File

@ -89,7 +89,7 @@ int main(int argc, char** argv) {
GridStopWatch CGTimer;
SchurDiagMooeeOperator<DomainWallFermionR, LatticeFermion> HermOpEO(Ddwf);
ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000, 0);// switch off the assert
ConjugateGradient<LatticeFermion> CG(1.0e-5, 10000, 0);// switch off the assert
CGTimer.Start();
CG(HermOpEO, src_o, result_o);

View File

@ -73,7 +73,7 @@ int main (int argc, char ** argv)
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
ConjugateGradient<LatticeFermion> CG(1.0e-6,10000);
CG(HermOp,src,result);
Grid_finalize();

View File

@ -51,7 +51,7 @@ int main (int argc, char ** argv)
typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
typename ImprovedStaggeredFermion5DR::ImplParams params;
const int Ls=8;
const int Ls=4;
Grid_init(&argc,&argv);
@ -76,24 +76,44 @@ int main (int argc, char ** argv)
RealD mass=0.01;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
BlockConjugateGradient<FermionField> BCG(1.0e-8,10000);
MultiRHSConjugateGradient<FermionField> mCG(1.0e-8,10000);
std::cout << GridLogMessage << " Calling CG "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d);
FermionField result4d(UGrid); result4d=zero;
CG(HermOp4d,src4d,result4d);
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
CG(HermOp,src,result);
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling multiRHS CG "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
mCG(HermOp,src,result);
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling Block CG "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
BCG(HermOp,src,result);
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Grid_finalize();
}

View File

@ -76,7 +76,6 @@ int main (int argc, char ** argv)
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
CG(HermOp,src,result);
Grid_finalize();