mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	MultiRHS solver improvements with slice operations moved into lattice and sped up.
Block solver requires a lot of performance work.
This commit is contained in:
		@@ -60,8 +60,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
{
 | 
			
		||||
  int Orthog = 0; // First dimension is block dim
 | 
			
		||||
  Nblock = Src._grid->_fdimensions[Orthog];
 | 
			
		||||
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Nblock "<<Nblock<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
 | 
			
		||||
 | 
			
		||||
  Psi.checkerboard = Src.checkerboard;
 | 
			
		||||
  conformable(Psi, Src);
 | 
			
		||||
@@ -70,10 +70,6 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
  Field AP(Src);
 | 
			
		||||
  Field R(Src);
 | 
			
		||||
  
 | 
			
		||||
  GridStopWatch LinalgTimer;
 | 
			
		||||
  GridStopWatch MatrixTimer;
 | 
			
		||||
  GridStopWatch SolverTimer;
 | 
			
		||||
 | 
			
		||||
  Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
 | 
			
		||||
  Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
 | 
			
		||||
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 | 
			
		||||
@@ -116,33 +112,49 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
  P = R;
 | 
			
		||||
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
 | 
			
		||||
 | 
			
		||||
  GridStopWatch sliceInnerTimer;
 | 
			
		||||
  GridStopWatch sliceMaddTimer;
 | 
			
		||||
  GridStopWatch MatrixTimer;
 | 
			
		||||
  GridStopWatch SolverTimer;
 | 
			
		||||
  SolverTimer.Start();
 | 
			
		||||
 | 
			
		||||
  int k;
 | 
			
		||||
  for (k = 1; k <= MaxIterations; k++) {
 | 
			
		||||
 | 
			
		||||
    RealD rrsum=0;
 | 
			
		||||
    for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 | 
			
		||||
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 | 
			
		||||
	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
 | 
			
		||||
 | 
			
		||||
    MatrixTimer.Start();
 | 
			
		||||
    Linop.HermOp(P, AP);
 | 
			
		||||
    MatrixTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    // Alpha
 | 
			
		||||
    sliceInnerTimer.Start();
 | 
			
		||||
    sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
 | 
			
		||||
    sliceInnerTimer.Stop();
 | 
			
		||||
    m_pAp_inv = m_pAp.inverse();
 | 
			
		||||
    m_alpha   = m_pAp_inv * m_rr ;
 | 
			
		||||
 | 
			
		||||
    // Psi, R update
 | 
			
		||||
    sliceMaddTimer.Start();
 | 
			
		||||
    sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
 | 
			
		||||
    sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
 | 
			
		||||
    sliceMaddTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    // Beta
 | 
			
		||||
    m_rr_inv = m_rr.inverse();
 | 
			
		||||
    sliceInnerTimer.Start();
 | 
			
		||||
    sliceInnerProductMatrix(m_rr,R,R,Orthog);
 | 
			
		||||
    sliceInnerTimer.Stop();
 | 
			
		||||
    m_beta = m_rr_inv *m_rr;
 | 
			
		||||
 | 
			
		||||
    // Search update
 | 
			
		||||
    sliceMaddTimer.Start();
 | 
			
		||||
    sliceMaddMatrix(AP,m_beta,P,R,Orthog);
 | 
			
		||||
    sliceMaddTimer.Stop();
 | 
			
		||||
    P= AP;
 | 
			
		||||
 | 
			
		||||
    /*********************
 | 
			
		||||
@@ -157,16 +169,24 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
    
 | 
			
		||||
    if ( max_resid < Tolerance*Tolerance ) { 
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage<<" Block solver has converged in "
 | 
			
		||||
		<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl;
 | 
			
		||||
      SolverTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
 | 
			
		||||
      for(int b=0;b<Nblock;b++){
 | 
			
		||||
	std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
 | 
			
		||||
 | 
			
		||||
      Linop.HermOp(Psi, AP);
 | 
			
		||||
      AP = AP-Src;
 | 
			
		||||
      std::cout << " Block solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
 | 
			
		||||
	    
 | 
			
		||||
      IterationsToComplete = k;
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
@@ -207,8 +227,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
{
 | 
			
		||||
  int Orthog = 0; // First dimension is block dim
 | 
			
		||||
  Nblock = Src._grid->_fdimensions[Orthog];
 | 
			
		||||
  std::cout<<GridLogMessage<<" MultiRHS Conjugate Gradient : Orthog "<<Orthog<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage<<" MultiRHS Conjugate Gradient : Nblock "<<Nblock<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
 | 
			
		||||
 | 
			
		||||
  Psi.checkerboard = Src.checkerboard;
 | 
			
		||||
  conformable(Psi, Src);
 | 
			
		||||
@@ -244,40 +264,57 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
  P = R;
 | 
			
		||||
  sliceNorm(v_rr,R,Orthog);
 | 
			
		||||
 | 
			
		||||
  GridStopWatch sliceInnerTimer;
 | 
			
		||||
  GridStopWatch sliceMaddTimer;
 | 
			
		||||
  GridStopWatch sliceNormTimer;
 | 
			
		||||
  GridStopWatch MatrixTimer;
 | 
			
		||||
  GridStopWatch SolverTimer;
 | 
			
		||||
 | 
			
		||||
  SolverTimer.Start();
 | 
			
		||||
  int k;
 | 
			
		||||
  for (k = 1; k <= MaxIterations; k++) {
 | 
			
		||||
 | 
			
		||||
    RealD rrsum=0;
 | 
			
		||||
    for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 | 
			
		||||
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 | 
			
		||||
	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
 | 
			
		||||
 | 
			
		||||
    MatrixTimer.Start();
 | 
			
		||||
    Linop.HermOp(P, AP);
 | 
			
		||||
    MatrixTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    // Alpha
 | 
			
		||||
    //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
 | 
			
		||||
    sliceInnerTimer.Start();
 | 
			
		||||
    sliceInnerProductVector(v_pAp,P,AP,Orthog);
 | 
			
		||||
    sliceInnerTimer.Stop();
 | 
			
		||||
    for(int b=0;b<Nblock;b++){
 | 
			
		||||
      //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
 | 
			
		||||
      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Psi, R update
 | 
			
		||||
    sliceMaddTimer.Start();
 | 
			
		||||
    sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
 | 
			
		||||
    sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
 | 
			
		||||
    sliceMaddTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    // Beta
 | 
			
		||||
    for(int b=0;b<Nblock;b++){
 | 
			
		||||
      v_rr_inv[b] = 1.0/v_rr[b];
 | 
			
		||||
    }
 | 
			
		||||
    sliceNormTimer.Start();
 | 
			
		||||
    sliceNorm(v_rr,R,Orthog);
 | 
			
		||||
    sliceNormTimer.Stop();
 | 
			
		||||
    for(int b=0;b<Nblock;b++){
 | 
			
		||||
      v_beta[b] = v_rr_inv[b] *v_rr[b];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Search update
 | 
			
		||||
    sliceMaddTimer.Start();
 | 
			
		||||
    sliceMaddVector(P,v_beta,P,R,Orthog);
 | 
			
		||||
    sliceMaddTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    /*********************
 | 
			
		||||
     * convergence monitor
 | 
			
		||||
@@ -290,15 +327,27 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    if ( max_resid < Tolerance*Tolerance ) { 
 | 
			
		||||
      std::cout << GridLogMessage<<" MultiRHS solver has converged in "
 | 
			
		||||
		<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl;
 | 
			
		||||
 | 
			
		||||
      SolverTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
 | 
			
		||||
      for(int b=0;b<Nblock;b++){
 | 
			
		||||
	std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
 | 
			
		||||
 | 
			
		||||
      Linop.HermOp(Psi, AP);
 | 
			
		||||
      AP = AP-Src;
 | 
			
		||||
      std::cout << " MultiRHS solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
 | 
			
		||||
      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      IterationsToComplete = k;
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -78,18 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
 | 
			
		||||
    cp = a;
 | 
			
		||||
    ssq = norm2(src);
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
              << "ConjugateGradient: guess " << guess << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
              << "ConjugateGradient:   src " << ssq << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
              << "ConjugateGradient:    mp " << d << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
              << "ConjugateGradient:   mmp " << b << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
              << "ConjugateGradient:  cp,r " << cp << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
              << "ConjugateGradient:     p " << a << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   src " << ssq << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:    mp " << d << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   mmp " << b << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:  cp,r " << cp << std::endl;
 | 
			
		||||
    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:     p " << a << std::endl;
 | 
			
		||||
 | 
			
		||||
    RealD rsq = Tolerance * Tolerance * ssq;
 | 
			
		||||
 | 
			
		||||
@@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction<Field> {
 | 
			
		||||
        RealD resnorm = sqrt(norm2(p));
 | 
			
		||||
        RealD true_residual = resnorm / srcnorm;
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogMessage
 | 
			
		||||
                  << "ConjugateGradient: Converged on iteration " << k << std::endl;
 | 
			
		||||
        std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
 | 
			
		||||
                  << " true residual " << true_residual << " target "
 | 
			
		||||
                  << Tolerance << std::endl;
 | 
			
		||||
        std::cout << GridLogMessage << "Time elapsed: Iterations "
 | 
			
		||||
                  << SolverTimer.Elapsed() << " Matrix  "
 | 
			
		||||
                  << MatrixTimer.Elapsed() << " Linalg "
 | 
			
		||||
                  << LinalgTimer.Elapsed();
 | 
			
		||||
        std::cout << std::endl;
 | 
			
		||||
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
 | 
			
		||||
        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
 | 
			
		||||
 | 
			
		||||
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 | 
			
		||||
 | 
			
		||||
	IterationsToComplete = k;	
 | 
			
		||||
 | 
			
		||||
        return;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -44,6 +44,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
 | 
			
		||||
  ComplexD nrm = innerProduct(arg,arg);
 | 
			
		||||
  return std::real(nrm); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Double inner product
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
 | 
			
		||||
@@ -101,7 +102,6 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 | 
			
		||||
  return sum(closure(expr));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// FIXME precision promoted summation
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
@@ -141,14 +141,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
  return ssum;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 | 
			
		||||
{
 | 
			
		||||
  ///////////////////////////////////////////////////////
 | 
			
		||||
  // FIXME precision promoted summation
 | 
			
		||||
  // may be important for correlation functions
 | 
			
		||||
  // But easily avoided by using double precision fields
 | 
			
		||||
  ///////////////////////////////////////////////////////
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  GridBase  *grid = Data._grid;
 | 
			
		||||
  assert(grid!=NULL);
 | 
			
		||||
 | 
			
		||||
  // FIXME
 | 
			
		||||
  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
@@ -163,19 +171,28 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
			
		||||
  std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars
 | 
			
		||||
  std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD
 | 
			
		||||
 | 
			
		||||
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
 | 
			
		||||
  result.resize(fd); // And then global sum to return the same vector to every node 
 | 
			
		||||
  for(int r=0;r<rd;r++){
 | 
			
		||||
    lvSum[r]=zero;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::vector<int>  coor(Nd);  
 | 
			
		||||
  int e1=    grid->_slice_nblock[orthogdim];
 | 
			
		||||
  int e2=    grid->_slice_block [orthogdim];
 | 
			
		||||
  int stride=grid->_slice_stride[orthogdim];
 | 
			
		||||
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
    int r = coor[orthogdim];
 | 
			
		||||
  // Parallel over orthog direction
 | 
			
		||||
  parallel_for(int r=0;r<rd;r++){
 | 
			
		||||
 | 
			
		||||
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int ss= so+n*stride+b;
 | 
			
		||||
	lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Sum across simd lanes in the plane, breaking out orthog dir.
 | 
			
		||||
  std::vector<int> icoor(Nd);
 | 
			
		||||
@@ -212,32 +229,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
  static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
  {
 | 
			
		||||
    // FIXME: Implementation is slow
 | 
			
		||||
    // Look at localInnerProduct implementation,
 | 
			
		||||
    // and do inside a site loop with block strided iterators
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
    typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
    typedef typename vobj::vector_type vector_type;
 | 
			
		||||
    typedef typename vobj::tensor_reduced scalar;
 | 
			
		||||
    typedef typename scalar::scalar_object  scomplex;
 | 
			
		||||
  
 | 
			
		||||
    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
 | 
			
		||||
    vec.resize(Nblock);
 | 
			
		||||
    std::vector<scomplex> sip(Nblock);
 | 
			
		||||
    Lattice<scalar> IP(lhs._grid); 
 | 
			
		||||
 | 
			
		||||
    IP=localInnerProduct(lhs,rhs);
 | 
			
		||||
    sliceSum(IP,sip,Orthog);
 | 
			
		||||
  
 | 
			
		||||
    for(int ss=0;ss<Nblock;ss++){
 | 
			
		||||
      vec[ss] = TensorRemove(sip[ss]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 | 
			
		||||
{
 | 
			
		||||
@@ -247,8 +238,6 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
 | 
			
		||||
  assert(grid!=NULL);
 | 
			
		||||
  conformable(grid,rhs._grid);
 | 
			
		||||
 | 
			
		||||
  // FIXME
 | 
			
		||||
  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
@@ -268,16 +257,18 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
 | 
			
		||||
    lvSum[r]=zero;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
  PARALLEL_REGION {
 | 
			
		||||
    std::vector<int>  coor(Nd);  
 | 
			
		||||
    vector_type vv;
 | 
			
		||||
    PARALLEL_FOR_LOOP_INTERN
 | 
			
		||||
    for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
      Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
      int r = coor[orthogdim];
 | 
			
		||||
      vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
 | 
			
		||||
      PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add
 | 
			
		||||
  int e1=    grid->_slice_nblock[orthogdim];
 | 
			
		||||
  int e2=    grid->_slice_block [orthogdim];
 | 
			
		||||
  int stride=grid->_slice_stride[orthogdim];
 | 
			
		||||
 | 
			
		||||
  parallel_for(int r=0;r<rd;r++){
 | 
			
		||||
 | 
			
		||||
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int ss= so+n*stride+b;
 | 
			
		||||
	vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
 | 
			
		||||
	lvSum[r]=lvSum[r]+vv;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
@@ -287,7 +278,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
 | 
			
		||||
  std::vector<int> icoor(Nd);
 | 
			
		||||
  for(int rt=0;rt<rd;rt++){
 | 
			
		||||
 | 
			
		||||
    iScalar<vector_type> temp; temp._internal = lvSum[rt];
 | 
			
		||||
    iScalar<vector_type> temp; 
 | 
			
		||||
    temp._internal = lvSum[rt];
 | 
			
		||||
    extract(temp,extracted);
 | 
			
		||||
 | 
			
		||||
    for(int idx=0;idx<Nsimd;idx++){
 | 
			
		||||
@@ -317,122 +309,78 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
 | 
			
		||||
    result[t]=gsum;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#if 0
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
{
 | 
			
		||||
    // FIXME: Implementation is slow
 | 
			
		||||
    // Look at sliceSum implementation,
 | 
			
		||||
    // and do inside a site loop with block strided iterators
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
    typedef typename vobj::tensor_reduced scalar;
 | 
			
		||||
    typedef typename scalar::scalar_object  scomplex;
 | 
			
		||||
  
 | 
			
		||||
    GridBase * grid = lhs._grid;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert(orthogdim >= 0);
 | 
			
		||||
  assert(orthogdim < Nd);
 | 
			
		||||
 | 
			
		||||
  int fd=grid->_fdimensions[orthogdim];
 | 
			
		||||
  int ld=grid->_ldimensions[orthogdim];
 | 
			
		||||
  int rd=grid->_rdimensions[orthogdim];
 | 
			
		||||
 | 
			
		||||
    int Nblock  = grid->GlobalDimensions()[Orthog];
 | 
			
		||||
    int Nrblock = grid->_rdimensions[Orthog];
 | 
			
		||||
    int Nthr    = grid->SumArraySize();
 | 
			
		||||
 | 
			
		||||
    std::vector<vector_type,alignedAllocator<vector_type> > sumarray(Nrblock*Nthr);
 | 
			
		||||
 | 
			
		||||
    parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 | 
			
		||||
 | 
			
		||||
      int nwork, mywork, myoff;
 | 
			
		||||
 | 
			
		||||
      for(int rb=0;rb<Nrblock;rb++){
 | 
			
		||||
	GridThread::GetWork((left._grid->oSites()/Nrblock),thr,mywork,myoff);
 | 
			
		||||
	int off = rb * grid->_slice_
 | 
			
		||||
	vector_type vnrm=zero; // private to thread; sub summation
 | 
			
		||||
	for(int ss=myoff;ss<mywork+myoff; ss++){
 | 
			
		||||
	  vnrm = vnrm + TensorRemove(innerProductD(left._odata[ss],right._odata[ss]));
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      sumarray[thr+Nthr*rb]=vnrm ;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    vec.resize(Nblock);
 | 
			
		||||
    std::vector<scomplex> sip(Nblock);
 | 
			
		||||
    Lattice<scalar> IP(lhs._grid); 
 | 
			
		||||
 | 
			
		||||
    IP=localInnerProduct(lhs,rhs);
 | 
			
		||||
    sliceSum(IP,sip,Orthog);
 | 
			
		||||
  int Nblock = rhs._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
  std::vector<ComplexD> ip(Nblock);
 | 
			
		||||
  sn.resize(Nblock);
 | 
			
		||||
  
 | 
			
		||||
  sliceInnerProductVector(ip,rhs,rhs,Orthog);
 | 
			
		||||
  for(int ss=0;ss<Nblock;ss++){
 | 
			
		||||
      vec[ss] = TensorRemove(sip[ss]);
 | 
			
		||||
    sn[ss] = real(ip[ss]);
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 | 
			
		||||
 {
 | 
			
		||||
   int NN    = BlockSolverGrid->_ndimension;
 | 
			
		||||
   int nsimd = BlockSolverGrid->Nsimd();
 | 
			
		||||
 | 
			
		||||
   std::vector<int> latt_phys(0);
 | 
			
		||||
   std::vector<int> simd_phys(0);
 | 
			
		||||
   std::vector<int>  mpi_phys(0);
 | 
			
		||||
  
 | 
			
		||||
   for(int d=0;d<NN;d++){
 | 
			
		||||
     if( d!=Orthog ) { 
 | 
			
		||||
       latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
 | 
			
		||||
       simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
 | 
			
		||||
       mpi_phys.push_back(BlockSolverGrid->_processors[d]);
 | 
			
		||||
     }
 | 
			
		||||
   }
 | 
			
		||||
   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 | 
			
		||||
 }
 | 
			
		||||
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum
 | 
			
		||||
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj>
 | 
			
		||||
  static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 | 
			
		||||
  {    
 | 
			
		||||
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 | 
			
		||||
			    int orthogdim,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  typedef typename vobj::tensor_reduced tensor_reduced;
 | 
			
		||||
  
 | 
			
		||||
    int Nblock = X._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
  GridBase *grid  = X._grid;
 | 
			
		||||
 | 
			
		||||
    GridBase *FullGrid  = X._grid;
 | 
			
		||||
    GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  int Nsimd  =grid->Nsimd();
 | 
			
		||||
  int Nblock =grid->GlobalDimensions()[orthogdim];
 | 
			
		||||
 | 
			
		||||
    Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
    Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
    // FIXME: Implementation is slow
 | 
			
		||||
    // If we based this on Cshift it would work for spread out
 | 
			
		||||
    // but it would be even slower
 | 
			
		||||
    //
 | 
			
		||||
    // Repeated extract slice is inefficient
 | 
			
		||||
    //
 | 
			
		||||
    // Best base the linear combination by constructing a 
 | 
			
		||||
    // set of vectors of size grid->_rdimensions[Orthog].
 | 
			
		||||
    for(int i=0;i<Nblock;i++){
 | 
			
		||||
      ExtractSlice(Rslice,Y,i,Orthog);
 | 
			
		||||
      for(int j=0;j<Nblock;j++){
 | 
			
		||||
	ExtractSlice(Xslice,X,j,Orthog);
 | 
			
		||||
	Rslice = Rslice + Xslice*(scale*aa(j,i));
 | 
			
		||||
  int fd     =grid->_fdimensions[orthogdim];
 | 
			
		||||
  int ld     =grid->_ldimensions[orthogdim];
 | 
			
		||||
  int rd     =grid->_rdimensions[orthogdim];
 | 
			
		||||
 | 
			
		||||
  int e1     =grid->_slice_nblock[orthogdim];
 | 
			
		||||
  int e2     =grid->_slice_block [orthogdim];
 | 
			
		||||
  int stride =grid->_slice_stride[orthogdim];
 | 
			
		||||
 | 
			
		||||
  std::vector<int> icoor;
 | 
			
		||||
 | 
			
		||||
  for(int r=0;r<rd;r++){
 | 
			
		||||
 | 
			
		||||
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
    vector_type    av;
 | 
			
		||||
 | 
			
		||||
    for(int l=0;l<Nsimd;l++){
 | 
			
		||||
      grid->iCoorFromIindex(icoor,l);
 | 
			
		||||
      int ldx =r+icoor[orthogdim]*rd;
 | 
			
		||||
      scalar_type *as =(scalar_type *)&av;
 | 
			
		||||
      as[l] = scalar_type(a[ldx])*scale;
 | 
			
		||||
    }
 | 
			
		||||
      InsertSlice(Rslice,R,i,Orthog);
 | 
			
		||||
 | 
			
		||||
    tensor_reduced at; at=av;
 | 
			
		||||
 | 
			
		||||
    parallel_for_nest2(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int ss= so+n*stride+b;
 | 
			
		||||
	R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
 | 
			
		||||
      }
 | 
			
		||||
  };
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
template<class vobj>
 | 
			
		||||
  static void sliceMaddVector (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 | 
			
		||||
static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 | 
			
		||||
			     int Orthog,RealD scale=1.0) 
 | 
			
		||||
  {    
 | 
			
		||||
{    
 | 
			
		||||
  // FIXME: Implementation is slow
 | 
			
		||||
  // Best base the linear combination by constructing a 
 | 
			
		||||
  // set of vectors of size grid->_rdimensions[Orthog].
 | 
			
		||||
@@ -455,10 +403,94 @@ template<class vobj>
 | 
			
		||||
    Rslice = Rslice + Xslice*(scale*a[i]);
 | 
			
		||||
    InsertSlice(Rslice,R,i,Orthog);
 | 
			
		||||
  }
 | 
			
		||||
  };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
  static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
  {
 | 
			
		||||
    // FIXME: Implementation is slow
 | 
			
		||||
    // Look at localInnerProduct implementation,
 | 
			
		||||
    // and do inside a site loop with block strided iterators
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
    typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
    typedef typename vobj::vector_type vector_type;
 | 
			
		||||
    typedef typename vobj::tensor_reduced scalar;
 | 
			
		||||
    typedef typename scalar::scalar_object  scomplex;
 | 
			
		||||
  
 | 
			
		||||
    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
 | 
			
		||||
    vec.resize(Nblock);
 | 
			
		||||
    std::vector<scomplex> sip(Nblock);
 | 
			
		||||
    Lattice<scalar> IP(lhs._grid); 
 | 
			
		||||
 | 
			
		||||
    IP=localInnerProduct(lhs,rhs);
 | 
			
		||||
    sliceSum(IP,sip,Orthog);
 | 
			
		||||
  
 | 
			
		||||
    for(int ss=0;ss<Nblock;ss++){
 | 
			
		||||
      vec[ss] = TensorRemove(sip[ss]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// FIXME: Implementation is slow
 | 
			
		||||
// If we based this on Cshift it would work for spread out
 | 
			
		||||
// but it would be even slower
 | 
			
		||||
//
 | 
			
		||||
// Repeated extract slice is inefficient
 | 
			
		||||
//
 | 
			
		||||
// Best base the linear combination by constructing a 
 | 
			
		||||
// set of vectors of size grid->_rdimensions[Orthog].
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 | 
			
		||||
{
 | 
			
		||||
  int NN    = BlockSolverGrid->_ndimension;
 | 
			
		||||
  int nsimd = BlockSolverGrid->Nsimd();
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> latt_phys(0);
 | 
			
		||||
  std::vector<int> simd_phys(0);
 | 
			
		||||
  std::vector<int>  mpi_phys(0);
 | 
			
		||||
  
 | 
			
		||||
  for(int d=0;d<NN;d++){
 | 
			
		||||
    if( d!=Orthog ) { 
 | 
			
		||||
      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
 | 
			
		||||
      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
 | 
			
		||||
      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 | 
			
		||||
{    
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nblock = X._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
  
 | 
			
		||||
  GridBase *FullGrid  = X._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  
 | 
			
		||||
  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  
 | 
			
		||||
  for(int i=0;i<Nblock;i++){
 | 
			
		||||
    ExtractSlice(Rslice,Y,i,Orthog);
 | 
			
		||||
    for(int j=0;j<Nblock;j++){
 | 
			
		||||
      ExtractSlice(Xslice,X,j,Orthog);
 | 
			
		||||
      Rslice = Rslice + Xslice*(scale*aa(j,i));
 | 
			
		||||
    }
 | 
			
		||||
    InsertSlice(Rslice,R,i,Orthog);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 | 
			
		||||
{
 | 
			
		||||
  // FIXME: Implementation is slow
 | 
			
		||||
  // Not sure of best solution.. think about it
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
@@ -482,26 +514,17 @@ template<class vobj>
 | 
			
		||||
      mat(i,j) = innerProduct(Lslice,Rslice);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
#undef FORCE_DIAG
 | 
			
		||||
#ifdef FORCE_DIAG
 | 
			
		||||
  for(int i=0;i<Nblock;i++){
 | 
			
		||||
    for(int j=0;j<Nblock;j++){
 | 
			
		||||
      if ( i != j ) mat(i,j)=0.0;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  return;
 | 
			
		||||
  }
 | 
			
		||||
template<class vobj>
 | 
			
		||||
  static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) {
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  
 | 
			
		||||
  int Nblock = rhs._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
  std::vector<ComplexD> ip(Nblock);
 | 
			
		||||
  sn.resize(Nblock);
 | 
			
		||||
  
 | 
			
		||||
  sliceInnerProductVector(ip,rhs,rhs,Orthog);
 | 
			
		||||
  for(int ss=0;ss<Nblock;ss++){
 | 
			
		||||
    sn[ss] = real(ip[ss]);
 | 
			
		||||
  }
 | 
			
		||||
 };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
} /*END NAMESPACE GRID*/
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -110,8 +110,8 @@ public:
 | 
			
		||||
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 | 
			
		||||
 | 
			
		||||
    if ( log.active ) {
 | 
			
		||||
      stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : ";
 | 
			
		||||
      stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : ";
 | 
			
		||||
      stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
 | 
			
		||||
      stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
 | 
			
		||||
      if ( log.timestamp ) {
 | 
			
		||||
	StopWatch.Stop();
 | 
			
		||||
	GridTime now = StopWatch.Elapsed();
 | 
			
		||||
 
 | 
			
		||||
@@ -411,7 +411,6 @@ template <class S, class V, IfNotComplex<S> = 0>
 | 
			
		||||
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
 | 
			
		||||
  nrot = nrot % Grid_simd<S, V>::Nsimd();
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  //    std::cout << "Rotate Real by "<<nrot<<std::endl;
 | 
			
		||||
  ret.v = Optimization::Rotate::rotate(b.v, nrot);
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
@@ -419,7 +418,6 @@ template <class S, class V, IfComplex<S> = 0>
 | 
			
		||||
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
 | 
			
		||||
  nrot = nrot % Grid_simd<S, V>::Nsimd();
 | 
			
		||||
  Grid_simd<S, V> ret;
 | 
			
		||||
  //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
 | 
			
		||||
  ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
@@ -427,14 +425,12 @@ template <class S, class V, IfNotComplex<S> =0>
 | 
			
		||||
inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
 | 
			
		||||
{
 | 
			
		||||
  nrot = nrot % Grid_simd<S,V>::Nsimd();
 | 
			
		||||
  //    std::cout << "Rotate Real by "<<nrot<<std::endl;
 | 
			
		||||
  ret.v = Optimization::Rotate::rotate(b.v,nrot);
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V, IfComplex<S> =0> 
 | 
			
		||||
inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
 | 
			
		||||
{
 | 
			
		||||
  nrot = nrot % Grid_simd<S,V>::Nsimd();
 | 
			
		||||
  //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
 | 
			
		||||
  ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -58,10 +58,9 @@ class iScalar {
 | 
			
		||||
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
 | 
			
		||||
  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
 | 
			
		||||
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
 | 
			
		||||
  typedef iScalar<tensor_reduced_v> tensor_reduced;
 | 
			
		||||
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
 | 
			
		||||
  typedef iScalar<tensor_reduced_v> tensor_reduced;
 | 
			
		||||
  typedef iScalar<recurse_scalar_object> scalar_object;
 | 
			
		||||
 | 
			
		||||
  // substitutes a real or complex version with same tensor structure
 | 
			
		||||
  typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified;
 | 
			
		||||
  typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified;
 | 
			
		||||
@@ -78,8 +77,12 @@ class iScalar {
 | 
			
		||||
  iScalar<vtype> & operator= (const iScalar<vtype> ©me) = default;
 | 
			
		||||
  iScalar<vtype> & operator= (iScalar<vtype> &©me) = default;
 | 
			
		||||
  */
 | 
			
		||||
  iScalar(scalar_type s)
 | 
			
		||||
      : _internal(s){};  // recurse down and hit the constructor for vector_type
 | 
			
		||||
 | 
			
		||||
  //  template<int N=0>
 | 
			
		||||
  //  iScalar(EnableIf<isSIMDvectorized<vector_type>, vector_type> s) : _internal(s){};  // recurse down and hit the constructor for vector_type
 | 
			
		||||
 | 
			
		||||
  iScalar(scalar_type s) : _internal(s){};  // recurse down and hit the constructor for vector_type
 | 
			
		||||
 | 
			
		||||
  iScalar(const Zero &z) { *this = zero; };
 | 
			
		||||
 | 
			
		||||
  iScalar<vtype> &operator=(const Zero &hero) {
 | 
			
		||||
@@ -135,32 +138,27 @@ class iScalar {
 | 
			
		||||
  strong_inline const vtype &operator()(void) const { return _internal; }
 | 
			
		||||
 | 
			
		||||
  // Type casts meta programmed, must be pure scalar to match TensorRemove
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
 | 
			
		||||
            IfNotSimd<U> = 0>
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0>
 | 
			
		||||
  operator ComplexF() const {
 | 
			
		||||
    return (TensorRemove(_internal));
 | 
			
		||||
  };
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
 | 
			
		||||
            IfNotSimd<U> = 0>
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0>
 | 
			
		||||
  operator ComplexD() const {
 | 
			
		||||
    return (TensorRemove(_internal));
 | 
			
		||||
  };
 | 
			
		||||
  //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> =
 | 
			
		||||
  //  0> operator RealD    () const { return(real(TensorRemove(_internal))); }
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,
 | 
			
		||||
            IfNotSimd<U> = 0>
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0>
 | 
			
		||||
  operator RealD() const {
 | 
			
		||||
    return TensorRemove(_internal);
 | 
			
		||||
  }
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0,
 | 
			
		||||
            IfNotSimd<U> = 0>
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0>
 | 
			
		||||
  operator Integer() const {
 | 
			
		||||
    return Integer(TensorRemove(_internal));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // convert from a something to a scalar via constructor of something arg
 | 
			
		||||
  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
 | 
			
		||||
                         * = nullptr>
 | 
			
		||||
  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
 | 
			
		||||
    strong_inline iScalar<vtype> operator=(T arg) {
 | 
			
		||||
    _internal = arg;
 | 
			
		||||
    return *this;
 | 
			
		||||
 
 | 
			
		||||
@@ -252,7 +252,8 @@ namespace Grid {
 | 
			
		||||
  template<typename T>
 | 
			
		||||
  class isSIMDvectorized{
 | 
			
		||||
    template<typename U>
 | 
			
		||||
    static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
 | 
			
		||||
    static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   
 | 
			
		||||
      typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
 | 
			
		||||
 | 
			
		||||
    template<typename U>
 | 
			
		||||
    static double test(...);
 | 
			
		||||
 
 | 
			
		||||
@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; 
 | 
			
		||||
  typename ImprovedStaggeredFermion5DR::ImplParams params; 
 | 
			
		||||
 | 
			
		||||
  const int Ls=8;
 | 
			
		||||
  const int Ls=4;
 | 
			
		||||
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
@@ -76,24 +76,44 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.01;
 | 
			
		||||
  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
 | 
			
		||||
 | 
			
		||||
  MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 | 
			
		||||
 | 
			
		||||
  ConjugateGradient<FermionField> CG(1.0e-8,10000);
 | 
			
		||||
  BlockConjugateGradient<FermionField> BCG(1.0e-8,10000);
 | 
			
		||||
  MultiRHSConjugateGradient<FermionField> mCG(1.0e-8,10000);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Calling CG "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
 | 
			
		||||
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
 | 
			
		||||
  FermionField src4d(UGrid); random(pRNG,src4d);
 | 
			
		||||
  FermionField result4d(UGrid); result4d=zero;
 | 
			
		||||
  CG(HermOp4d,src4d,result4d);
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  CG(HermOp,src,result);
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Calling multiRHS CG "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  mCG(HermOp,src,result);
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Calling Block CG "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  BCG(HermOp,src,result);
 | 
			
		||||
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -76,7 +76,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
 | 
			
		||||
 | 
			
		||||
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
 | 
			
		||||
  ConjugateGradient<FermionField> CG(1.0e-8,10000);
 | 
			
		||||
  CG(HermOp,src,result);
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user