mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 03:54:33 +00:00 
			
		
		
		
	MultiRHS solver improvements with slice operations moved into lattice and sped up.
Block solver requires a lot of performance work.
This commit is contained in:
		| @@ -60,8 +60,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
| { | ||||
|   int Orthog = 0; // First dimension is block dim | ||||
|   Nblock = Src._grid->_fdimensions[Orthog]; | ||||
|   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<std::endl; | ||||
|   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   Psi.checkerboard = Src.checkerboard; | ||||
|   conformable(Psi, Src); | ||||
| @@ -70,10 +70,6 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
|   Field AP(Src); | ||||
|   Field R(Src); | ||||
|    | ||||
|   GridStopWatch LinalgTimer; | ||||
|   GridStopWatch MatrixTimer; | ||||
|   GridStopWatch SolverTimer; | ||||
|  | ||||
|   Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock); | ||||
|   Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); | ||||
|   Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
| @@ -116,33 +112,49 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
|   P = R; | ||||
|   sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||
|  | ||||
|   GridStopWatch sliceInnerTimer; | ||||
|   GridStopWatch sliceMaddTimer; | ||||
|   GridStopWatch MatrixTimer; | ||||
|   GridStopWatch SolverTimer; | ||||
|   SolverTimer.Start(); | ||||
|  | ||||
|   int k; | ||||
|   for (k = 1; k <= MaxIterations; k++) { | ||||
|  | ||||
|     RealD rrsum=0; | ||||
|     for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b)); | ||||
|  | ||||
|     std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||
|     std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||
| 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl; | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     Linop.HermOp(P, AP); | ||||
|     MatrixTimer.Stop(); | ||||
|  | ||||
|     // Alpha | ||||
|     sliceInnerTimer.Start(); | ||||
|     sliceInnerProductMatrix(m_pAp,P,AP,Orthog); | ||||
|     sliceInnerTimer.Stop(); | ||||
|     m_pAp_inv = m_pAp.inverse(); | ||||
|     m_alpha   = m_pAp_inv * m_rr ; | ||||
|  | ||||
|     // Psi, R update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi | ||||
|     sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid | ||||
|     sliceMaddTimer.Stop(); | ||||
|  | ||||
|     // Beta | ||||
|     m_rr_inv = m_rr.inverse(); | ||||
|     sliceInnerTimer.Start(); | ||||
|     sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||
|     sliceInnerTimer.Stop(); | ||||
|     m_beta = m_rr_inv *m_rr; | ||||
|  | ||||
|     // Search update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddMatrix(AP,m_beta,P,R,Orthog); | ||||
|     sliceMaddTimer.Stop(); | ||||
|     P= AP; | ||||
|  | ||||
|     /********************* | ||||
| @@ -157,16 +169,24 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
|      | ||||
|     if ( max_resid < Tolerance*Tolerance ) {  | ||||
|  | ||||
|       std::cout << GridLogMessage<<" Block solver has converged in " | ||||
| 		<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl; | ||||
|       SolverTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl; | ||||
|       for(int b=0;b<Nblock;b++){ | ||||
| 	std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl; | ||||
| 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl; | ||||
|       } | ||||
|       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl; | ||||
|  | ||||
|       Linop.HermOp(Psi, AP); | ||||
|       AP = AP-Src; | ||||
|       std::cout << " Block solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|       std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl; | ||||
| 	     | ||||
|       IterationsToComplete = k; | ||||
|       return; | ||||
|     } | ||||
| @@ -207,8 +227,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
| { | ||||
|   int Orthog = 0; // First dimension is block dim | ||||
|   Nblock = Src._grid->_fdimensions[Orthog]; | ||||
|   std::cout<<GridLogMessage<<" MultiRHS Conjugate Gradient : Orthog "<<Orthog<<std::endl; | ||||
|   std::cout<<GridLogMessage<<" MultiRHS Conjugate Gradient : Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; | ||||
|  | ||||
|   Psi.checkerboard = Src.checkerboard; | ||||
|   conformable(Psi, Src); | ||||
| @@ -244,40 +264,57 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
|   P = R; | ||||
|   sliceNorm(v_rr,R,Orthog); | ||||
|  | ||||
|   GridStopWatch sliceInnerTimer; | ||||
|   GridStopWatch sliceMaddTimer; | ||||
|   GridStopWatch sliceNormTimer; | ||||
|   GridStopWatch MatrixTimer; | ||||
|   GridStopWatch SolverTimer; | ||||
|  | ||||
|   SolverTimer.Start(); | ||||
|   int k; | ||||
|   for (k = 1; k <= MaxIterations; k++) { | ||||
|  | ||||
|     RealD rrsum=0; | ||||
|     for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]); | ||||
|  | ||||
|     std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||
|     std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum | ||||
| 	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl; | ||||
|  | ||||
|     MatrixTimer.Start(); | ||||
|     Linop.HermOp(P, AP); | ||||
|     MatrixTimer.Stop(); | ||||
|  | ||||
|     // Alpha | ||||
|     //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); | ||||
|     sliceInnerTimer.Start(); | ||||
|     sliceInnerProductVector(v_pAp,P,AP,Orthog); | ||||
|     sliceInnerTimer.Stop(); | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl; | ||||
|       v_alpha[b] = v_rr[b]/real(v_pAp[b]); | ||||
|     } | ||||
|  | ||||
|     // Psi, R update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi | ||||
|     sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid | ||||
|     sliceMaddTimer.Stop(); | ||||
|  | ||||
|     // Beta | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       v_rr_inv[b] = 1.0/v_rr[b]; | ||||
|     } | ||||
|     sliceNormTimer.Start(); | ||||
|     sliceNorm(v_rr,R,Orthog); | ||||
|     sliceNormTimer.Stop(); | ||||
|     for(int b=0;b<Nblock;b++){ | ||||
|       v_beta[b] = v_rr_inv[b] *v_rr[b]; | ||||
|     } | ||||
|  | ||||
|     // Search update | ||||
|     sliceMaddTimer.Start(); | ||||
|     sliceMaddVector(P,v_beta,P,R,Orthog); | ||||
|     sliceMaddTimer.Stop(); | ||||
|  | ||||
|     /********************* | ||||
|      * convergence monitor | ||||
| @@ -290,15 +327,27 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) | ||||
|     } | ||||
|      | ||||
|     if ( max_resid < Tolerance*Tolerance ) {  | ||||
|       std::cout << GridLogMessage<<" MultiRHS solver has converged in " | ||||
| 		<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl; | ||||
|  | ||||
|       SolverTimer.Stop(); | ||||
|  | ||||
|       std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl; | ||||
|       for(int b=0;b<Nblock;b++){ | ||||
| 	std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl; | ||||
| 	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl; | ||||
|       } | ||||
|       std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl; | ||||
|  | ||||
|       Linop.HermOp(Psi, AP); | ||||
|       AP = AP-Src; | ||||
|       std::cout << " MultiRHS solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|       std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; | ||||
|  | ||||
|       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||
|       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl; | ||||
|       std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl; | ||||
|  | ||||
|  | ||||
|       IterationsToComplete = k; | ||||
|       return; | ||||
|     } | ||||
|   | ||||
| @@ -78,18 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|     cp = a; | ||||
|     ssq = norm2(src); | ||||
|  | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:    mp " << d << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:   mmp " << b << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:  cp,r " << cp << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) | ||||
|               << "ConjugateGradient:     p " << a << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   src " << ssq << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:    mp " << d << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:   mmp " << b << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:  cp,r " << cp << std::endl; | ||||
|     std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient:     p " << a << std::endl; | ||||
|  | ||||
|     RealD rsq = Tolerance * Tolerance * ssq; | ||||
|  | ||||
| @@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction<Field> { | ||||
|         RealD resnorm = sqrt(norm2(p)); | ||||
|         RealD true_residual = resnorm / srcnorm; | ||||
|  | ||||
|         std::cout << GridLogMessage | ||||
|                   << "ConjugateGradient: Converged on iteration " << k << std::endl; | ||||
|         std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) | ||||
|                   << " true residual " << true_residual << " target " | ||||
|                   << Tolerance << std::endl; | ||||
|         std::cout << GridLogMessage << "Time elapsed: Iterations " | ||||
|                   << SolverTimer.Elapsed() << " Matrix  " | ||||
|                   << MatrixTimer.Elapsed() << " Linalg " | ||||
|                   << LinalgTimer.Elapsed(); | ||||
|         std::cout << std::endl; | ||||
|         std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; | ||||
|         std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; | ||||
|  | ||||
|         std::cout << GridLogMessage << "Time breakdown "<<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||
|  | ||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
| 	IterationsToComplete = k;	 | ||||
|  | ||||
|         return; | ||||
|       } | ||||
|     } | ||||
|   | ||||
| @@ -44,6 +44,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ | ||||
|   ComplexD nrm = innerProduct(arg,arg); | ||||
|   return std::real(nrm);  | ||||
| } | ||||
|  | ||||
| // Double inner product | ||||
| template<class vobj> | ||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)  | ||||
| @@ -101,7 +102,6 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) | ||||
|   return sum(closure(expr)); | ||||
| } | ||||
|  | ||||
| // FIXME precision promoted summation | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
| @@ -141,14 +141,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
|   return ssum; | ||||
| } | ||||
|  | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) | ||||
| { | ||||
|   /////////////////////////////////////////////////////// | ||||
|   // FIXME precision promoted summation | ||||
|   // may be important for correlation functions | ||||
|   // But easily avoided by using double precision fields | ||||
|   /////////////////////////////////////////////////////// | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   GridBase  *grid = Data._grid; | ||||
|   assert(grid!=NULL); | ||||
|  | ||||
|   // FIXME | ||||
|   // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl; | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|  | ||||
| @@ -163,18 +171,27 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|   std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars | ||||
|   std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD | ||||
|  | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||
|   result.resize(fd); // And then global sum to return the same vector to every node  | ||||
|   for(int r=0;r<rd;r++){ | ||||
|     lvSum[r]=zero; | ||||
|   } | ||||
|  | ||||
|   std::vector<int>  coor(Nd);   | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
|   int e2=    grid->_slice_block [orthogdim]; | ||||
|   int stride=grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   // sum over reduced dimension planes, breaking out orthog dir | ||||
|   for(int ss=0;ss<grid->oSites();ss++){ | ||||
|     Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); | ||||
|     int r = coor[orthogdim]; | ||||
|     lvSum[r]=lvSum[r]+Data._odata[ss]; | ||||
|   // Parallel over orthog direction | ||||
|   parallel_for(int r=0;r<rd;r++){ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	lvSum[r]=lvSum[r]+Data._odata[ss]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||
| @@ -212,32 +229,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | ||||
|   } | ||||
| } | ||||
|  | ||||
| /* Reference (slow) slice-wise inner product. | ||||
|  * Builds the site-local inner product field localInnerProduct(lhs,rhs), reduces | ||||
|  * it with sliceSum along dimension Orthog, and writes one ComplexD per slice. | ||||
|  *   vec     : output, resized to Nblock = lhs._grid->GlobalDimensions()[Orthog] | ||||
|  *   lhs,rhs : input fields; the temporary field IP is constructed on lhs._grid | ||||
|  *   Orthog  : index of the dimension that labels the slices | ||||
|  * The typedefs sobj, scalar_type and vector_type are not referenced in the body. | ||||
|  */ template<class vobj> | ||||
|   static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
|   { | ||||
|     // FIXME: Implementation is slow | ||||
|     // Look at localInnerProduct implementation, | ||||
|     // and do inside a site loop with block strided iterators | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|     typedef typename vobj::tensor_reduced scalar; | ||||
|     typedef typename scalar::scalar_object  scomplex; | ||||
|    | ||||
|     int Nblock = lhs._grid->GlobalDimensions()[Orthog]; | ||||
|  | ||||
|     vec.resize(Nblock); | ||||
|     std::vector<scomplex> sip(Nblock); | ||||
|     Lattice<scalar> IP(lhs._grid);  | ||||
|     /* A full-size temporary field is filled first, then reduced slice by slice */ | ||||
|     IP=localInnerProduct(lhs,rhs); | ||||
|     sliceSum(IP,sip,Orthog); | ||||
|     /* Copy each reduced entry into the caller's vector via TensorRemove */ | ||||
|     for(int ss=0;ss<Nblock;ss++){ | ||||
|       vec[ss] = TensorRemove(sip[ss]); | ||||
|     } | ||||
|   } | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)  | ||||
| { | ||||
| @@ -247,8 +238,6 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti | ||||
|   assert(grid!=NULL); | ||||
|   conformable(grid,rhs._grid); | ||||
|  | ||||
|   // FIXME | ||||
|   // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl; | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|  | ||||
| @@ -268,16 +257,18 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti | ||||
|     lvSum[r]=zero; | ||||
|   } | ||||
|  | ||||
|   // sum over reduced dimension planes, breaking out orthog dir | ||||
|   PARALLEL_REGION { | ||||
|     std::vector<int>  coor(Nd);   | ||||
|     vector_type vv; | ||||
|     PARALLEL_FOR_LOOP_INTERN | ||||
|     for(int ss=0;ss<grid->oSites();ss++){ | ||||
|       Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); | ||||
|       int r = coor[orthogdim]; | ||||
|       vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); | ||||
|       PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add | ||||
|   int e1=    grid->_slice_nblock[orthogdim]; | ||||
|   int e2=    grid->_slice_block [orthogdim]; | ||||
|   int stride=grid->_slice_stride[orthogdim]; | ||||
|  | ||||
|   parallel_for(int r=0;r<rd;r++){ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); | ||||
| 	lvSum[r]=lvSum[r]+vv; | ||||
|       } | ||||
|     } | ||||
| @@ -287,7 +278,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti | ||||
|   std::vector<int> icoor(Nd); | ||||
|   for(int rt=0;rt<rd;rt++){ | ||||
|  | ||||
|     iScalar<vector_type> temp; temp._internal = lvSum[rt]; | ||||
|     iScalar<vector_type> temp;  | ||||
|     temp._internal = lvSum[rt]; | ||||
|     extract(temp,extracted); | ||||
|  | ||||
|     for(int idx=0;idx<Nsimd;idx++){ | ||||
| @@ -317,176 +309,9 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti | ||||
|     result[t]=gsum; | ||||
|   } | ||||
| } | ||||
| #if 0 | ||||
| template<class vobj> | ||||
| static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
| static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)  | ||||
| { | ||||
|     // FIXME: Implementation is slow | ||||
|     // Look at sliceSum implementation, | ||||
|     // and do inside a site loop with block strided iterators | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|     typedef typename vobj::tensor_reduced scalar; | ||||
|     typedef typename scalar::scalar_object  scomplex; | ||||
|  | ||||
|     GridBase * grid = lhs._grid; | ||||
|  | ||||
|  | ||||
|   const int    Nd = grid->_ndimension; | ||||
|   const int Nsimd = grid->Nsimd(); | ||||
|  | ||||
|   assert(orthogdim >= 0); | ||||
|   assert(orthogdim < Nd); | ||||
|  | ||||
|   int fd=grid->_fdimensions[orthogdim]; | ||||
|   int ld=grid->_ldimensions[orthogdim]; | ||||
|   int rd=grid->_rdimensions[orthogdim]; | ||||
|  | ||||
|     int Nblock  = grid->GlobalDimensions()[Orthog]; | ||||
|     int Nrblock = grid->_rdimensions[Orthog]; | ||||
|     int Nthr    = grid->SumArraySize(); | ||||
|  | ||||
|     std::vector<vector_type,alignedAllocator<vector_type> > sumarray(Nrblock*Nthr); | ||||
|  | ||||
|     parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||
|  | ||||
|       int nwork, mywork, myoff; | ||||
|  | ||||
|       for(int rb=0;rb<Nrblock;rb++){ | ||||
| 	GridThread::GetWork((left._grid->oSites()/Nrblock),thr,mywork,myoff); | ||||
| 	int off = rb * grid->_slice_ | ||||
| 	vector_type vnrm=zero; // private to thread; sub summation | ||||
| 	for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||
| 	  vnrm = vnrm + TensorRemove(innerProductD(left._odata[ss],right._odata[ss])); | ||||
| 	} | ||||
|       } | ||||
|       sumarray[thr+Nthr*rb]=vnrm ; | ||||
|     } | ||||
|  | ||||
|     vec.resize(Nblock); | ||||
|     std::vector<scomplex> sip(Nblock); | ||||
|     Lattice<scalar> IP(lhs._grid);  | ||||
|  | ||||
|     IP=localInnerProduct(lhs,rhs); | ||||
|     sliceSum(IP,sip,Orthog); | ||||
|    | ||||
|     for(int ss=0;ss<Nblock;ss++){ | ||||
|       vec[ss] = TensorRemove(sip[ss]); | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|  | ||||
| /* Build a grid with dimension Orthog removed. | ||||
|  * For every dimension d != Orthog the full extent (_fdimensions[d]), SIMD layout | ||||
|  * (_simd_layout[d]) and processor layout (_processors[d]) of BlockSolverGrid are | ||||
|  * copied, in order, and passed to the GridCartesian constructor. | ||||
|  * Returns a pointer obtained from new. | ||||
|  * NOTE(review): ownership passes to the caller; presumably the caller must | ||||
|  * delete the grid - confirm, since no delete is visible alongside this helper. | ||||
|  * The local nsimd is computed but not used. | ||||
|  */ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | ||||
|  { | ||||
|    int NN    = BlockSolverGrid->_ndimension; | ||||
|    int nsimd = BlockSolverGrid->Nsimd(); | ||||
|  | ||||
|    std::vector<int> latt_phys(0); | ||||
|    std::vector<int> simd_phys(0); | ||||
|    std::vector<int>  mpi_phys(0); | ||||
|    /* Collect the three per-dimension layouts, skipping d == Orthog */ | ||||
|    for(int d=0;d<NN;d++){ | ||||
|      if( d!=Orthog ) {  | ||||
|        latt_phys.push_back(BlockSolverGrid->_fdimensions[d]); | ||||
|        simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); | ||||
|        mpi_phys.push_back(BlockSolverGrid->_processors[d]); | ||||
|      } | ||||
|    } | ||||
|    return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  | ||||
|  } | ||||
|  ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum | ||||
|  ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| /* Slice-wise matrix multiply-add. | ||||
|  * For every slice index i along Orthog: | ||||
|  *     R[slice i] = Y[slice i] + sum over j of  X[slice j] * (scale * aa(j,i)) | ||||
|  * with i and j running over Nblock = X._grid->GlobalDimensions()[Orthog]. | ||||
|  *   R      : output field; slice i is written only after its sum is complete | ||||
|  *   aa     : coefficient matrix, read as aa(j,i) | ||||
|  *   X, Y   : input fields | ||||
|  *   scale  : extra real factor applied to every coefficient (default 1.0) | ||||
|  * Each call performs Nblock extractions from Y, Nblock*Nblock extractions from X | ||||
|  * and Nblock insertions into R. | ||||
|  * NOTE(review): SliceGrid is obtained from makeSubSliceGrid and is not released | ||||
|  * in this function - confirm who owns it. | ||||
|  * The typedefs sobj, scalar_type and vector_type are not referenced in the body. | ||||
|  */ template<class vobj> | ||||
|   static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||
|   {     | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|     int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|      | ||||
|     GridBase *FullGrid  = X._grid; | ||||
|     GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|     /* Work buffers living on the grid that has dimension Orthog removed */ | ||||
|     Lattice<vobj> Xslice(SliceGrid); | ||||
|     Lattice<vobj> Rslice(SliceGrid); | ||||
|     // FIXME: Implementation is slow | ||||
|     // If we based this on Cshift it would work for spread out | ||||
|     // but it would be even slower | ||||
|     // | ||||
|     // Repeated extract slice is inefficient | ||||
|     // | ||||
|     // Best base the linear combination by constructing a  | ||||
|     // set of vectors of size grid->_rdimensions[Orthog]. | ||||
|     for(int i=0;i<Nblock;i++){ | ||||
|       ExtractSlice(Rslice,Y,i,Orthog); | ||||
|       for(int j=0;j<Nblock;j++){ | ||||
| 	ExtractSlice(Xslice,X,j,Orthog); | ||||
| 	Rslice = Rslice + Xslice*(scale*aa(j,i)); | ||||
|       } | ||||
|       InsertSlice(Rslice,R,i,Orthog); | ||||
|     } | ||||
|   }; | ||||
| /* Slice-wise scalar multiply-add (slice-extraction version). | ||||
|  * For every slice index i along Orthog: | ||||
|  *     R[slice i] = Y[slice i] + X[slice i] * (scale * a[i]) | ||||
|  * with i running over Nblock = X._grid->GlobalDimensions()[Orthog]. | ||||
|  *   R      : output field | ||||
|  *   a      : one real coefficient per slice, read as a[i] | ||||
|  *   X, Y   : input fields | ||||
|  *   scale  : extra real factor applied to every coefficient (default 1.0) | ||||
|  * NOTE(review): SliceGrid is obtained from makeSubSliceGrid and is not released | ||||
|  * in this function - confirm who owns it. | ||||
|  * The typedefs sobj, scalar_type and vector_type are not referenced in the body. | ||||
|  */ template<class vobj> | ||||
|   static void sliceMaddVector (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||
| 			       int Orthog,RealD scale=1.0)  | ||||
|   {     | ||||
|     // FIXME: Implementation is slow | ||||
|     // Best base the linear combination by constructing a  | ||||
|     // set of vectors of size grid->_rdimensions[Orthog]. | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|     int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|      | ||||
|     GridBase *FullGrid  = X._grid; | ||||
|     GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|     /* Work buffers living on the grid that has dimension Orthog removed */ | ||||
|     Lattice<vobj> Xslice(SliceGrid); | ||||
|     Lattice<vobj> Rslice(SliceGrid); | ||||
|     // If we based this on Cshift it would work for spread out | ||||
|     // but it would be even slower | ||||
|     for(int i=0;i<Nblock;i++){ | ||||
|       ExtractSlice(Rslice,Y,i,Orthog); | ||||
|       ExtractSlice(Xslice,X,i,Orthog); | ||||
|       Rslice = Rslice + Xslice*(scale*a[i]); | ||||
|       InsertSlice(Rslice,R,i,Orthog); | ||||
|     } | ||||
|   }; | ||||
| /* Matrix of inner products between slices. | ||||
|  * mat is reset to an Nblock x Nblock zero matrix, then filled with | ||||
|  *     mat(i,j) = innerProduct( lhs[slice i] , rhs[slice j] ) | ||||
|  * where Nblock = lhs._grid->GlobalDimensions()[Orthog]. | ||||
|  *   mat     : output matrix, overwritten | ||||
|  *   lhs,rhs : input fields | ||||
|  *   Orthog  : index of the dimension that labels the slices | ||||
|  * Each call performs Nblock extractions from lhs, Nblock*Nblock extractions from | ||||
|  * rhs and Nblock*Nblock calls to innerProduct. | ||||
|  * NOTE(review): SliceGrid is obtained from makeSubSliceGrid and is not released | ||||
|  * in this function - confirm who owns it. | ||||
|  * The typedefs sobj, scalar_type and vector_type are not referenced in the body. | ||||
|  */ template<class vobj> | ||||
|   static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
|   { | ||||
|     // FIXME: Implementation is slow | ||||
|     // Not sure of best solution.. think about it | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|     GridBase *FullGrid  = lhs._grid; | ||||
|     GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|  | ||||
|     int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||
|     /* Work buffers living on the grid that has dimension Orthog removed */ | ||||
|     Lattice<vobj> Lslice(SliceGrid); | ||||
|     Lattice<vobj> Rslice(SliceGrid); | ||||
|  | ||||
|     mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|     /* The lhs slice is extracted once per row, the rhs slice once per entry */ | ||||
|     for(int i=0;i<Nblock;i++){ | ||||
|       ExtractSlice(Lslice,lhs,i,Orthog); | ||||
|       for(int j=0;j<Nblock;j++){ | ||||
| 	ExtractSlice(Rslice,rhs,j,Orthog); | ||||
| 	mat(i,j) = innerProduct(Lslice,Rslice); | ||||
|       } | ||||
|     } | ||||
|     return; | ||||
|   } | ||||
| template<class vobj> | ||||
|   static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) { | ||||
|  | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
| @@ -499,9 +324,207 @@ template<class vobj> | ||||
|   for(int ss=0;ss<Nblock;ss++){ | ||||
|     sn[ss] = real(ip[ss]); | ||||
|   } | ||||
|  }; | ||||
| }; | ||||
|  | ||||
|  | ||||
| /* Slice-wise scalar multiply-add working directly on the strided site layout. | ||||
|  * For each reduced coordinate r in [0, rd) along orthogdim, every site | ||||
|  * ss = so + n*stride + b of that plane is updated as | ||||
|  *     R._odata[ss] = at * X._odata[ss] + Y._odata[ss] | ||||
|  * where SIMD lane l of the coefficient holds a[r + icoor[orthogdim]*rd] * scale | ||||
|  * and icoor is the lane coordinate returned by grid->iCoorFromIindex. | ||||
|  *   R         : output field | ||||
|  *   a         : real coefficients, one per coordinate along orthogdim | ||||
|  *   X, Y      : input fields; all site indexing uses X._grid | ||||
|  *   orthogdim : index of the dimension that labels the slices | ||||
|  *   scale     : extra real factor applied to every coefficient (default 1.0) | ||||
|  * NOTE(review): a is indexed with r + icoor[orthogdim]*rd and no processor | ||||
|  * offset is added; presumably this assumes orthogdim is not split across | ||||
|  * ranks - confirm. | ||||
|  * The locals Nblock, fd, ld and the typedef sobj are not referenced afterwards. | ||||
|  */ template<class vobj> | ||||
| static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||
| 			    int orthogdim,RealD scale=1.0)  | ||||
| {     | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|   typedef typename vobj::tensor_reduced tensor_reduced; | ||||
|    | ||||
|   GridBase *grid  = X._grid; | ||||
|  | ||||
|   int Nsimd  =grid->Nsimd(); | ||||
|   int Nblock =grid->GlobalDimensions()[orthogdim]; | ||||
|  | ||||
|   int fd     =grid->_fdimensions[orthogdim]; | ||||
|   int ld     =grid->_ldimensions[orthogdim]; | ||||
|   int rd     =grid->_rdimensions[orthogdim]; | ||||
|   /* Loop bounds and stride used to enumerate the sites of one plane */ | ||||
|   int e1     =grid->_slice_nblock[orthogdim]; | ||||
|   int e2     =grid->_slice_block [orthogdim]; | ||||
|   int stride =grid->_slice_stride[orthogdim]; | ||||
|   /* NOTE(review): icoor starts empty; presumably iCoorFromIindex sizes it - confirm */ | ||||
|   std::vector<int> icoor; | ||||
|   /* The plane loop is serial; only the site loop inside each plane is parallel */ | ||||
|   for(int r=0;r<rd;r++){ | ||||
|  | ||||
|     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||
|  | ||||
|     vector_type    av; | ||||
|     /* Fill av lane by lane through a scalar_type pointer onto its storage */ | ||||
|     for(int l=0;l<Nsimd;l++){ | ||||
|       grid->iCoorFromIindex(icoor,l); | ||||
|       int ldx =r+icoor[orthogdim]*rd; | ||||
|       scalar_type *as =(scalar_type *)&av; | ||||
|       as[l] = scalar_type(a[ldx])*scale; | ||||
|     } | ||||
|     /* Convert the lane coefficients to tensor_reduced before multiplying X */ | ||||
|     tensor_reduced at; at=av; | ||||
|     /* Visit every site of plane r */ | ||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| 	int ss= so+n*stride+b; | ||||
| 	R._odata[ss] = at*X._odata[ss]+Y._odata[ss]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| /* | ||||
| template<class vobj> | ||||
| static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y, | ||||
| 			     int Orthog,RealD scale=1.0)  | ||||
| {     | ||||
|   // FIXME: Implementation is slow | ||||
|   // Best base the linear combination by constructing a  | ||||
|   // set of vectors of size grid->_rdimensions[Orthog]. | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|    | ||||
|   GridBase *FullGrid  = X._grid; | ||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   Lattice<vobj> Xslice(SliceGrid); | ||||
|   Lattice<vobj> Rslice(SliceGrid); | ||||
|   // If we based this on Cshift it would work for spread out | ||||
|   // but it would be even slower | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     ExtractSlice(Rslice,Y,i,Orthog); | ||||
|     ExtractSlice(Xslice,X,i,Orthog); | ||||
|     Rslice = Rslice + Xslice*(scale*a[i]); | ||||
|     InsertSlice(Rslice,R,i,Orthog); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
|   { | ||||
|     // FIXME: Implementation is slow | ||||
|     // Look at localInnerProduct implementation, | ||||
|     // and do inside a site loop with block strided iterators | ||||
|     typedef typename vobj::scalar_object sobj; | ||||
|     typedef typename vobj::scalar_type scalar_type; | ||||
|     typedef typename vobj::vector_type vector_type; | ||||
|     typedef typename vobj::tensor_reduced scalar; | ||||
|     typedef typename scalar::scalar_object  scomplex; | ||||
|    | ||||
|     int Nblock = lhs._grid->GlobalDimensions()[Orthog]; | ||||
|  | ||||
|     vec.resize(Nblock); | ||||
|     std::vector<scomplex> sip(Nblock); | ||||
|     Lattice<scalar> IP(lhs._grid);  | ||||
|  | ||||
|     IP=localInnerProduct(lhs,rhs); | ||||
|     sliceSum(IP,sip,Orthog); | ||||
|    | ||||
|     for(int ss=0;ss<Nblock;ss++){ | ||||
|       vec[ss] = TensorRemove(sip[ss]); | ||||
|     } | ||||
|   } | ||||
| */ | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // FIXME: Implementation is slow | ||||
| // If we based this on Cshift it would work for spread out | ||||
| // but it would be even slower | ||||
| // | ||||
| // Repeated extract slice is inefficient | ||||
| // | ||||
| // Best base the linear combination by constructing a  | ||||
| // set of vectors of size grid->_rdimensions[Orthog]. | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | ||||
| { | ||||
|   int NN    = BlockSolverGrid->_ndimension; | ||||
|   int nsimd = BlockSolverGrid->Nsimd(); | ||||
|    | ||||
|   std::vector<int> latt_phys(0); | ||||
|   std::vector<int> simd_phys(0); | ||||
|   std::vector<int>  mpi_phys(0); | ||||
|    | ||||
|   for(int d=0;d<NN;d++){ | ||||
|     if( d!=Orthog ) {  | ||||
|       latt_phys.push_back(BlockSolverGrid->_fdimensions[d]); | ||||
|       simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); | ||||
|       mpi_phys.push_back(BlockSolverGrid->_processors[d]); | ||||
|     } | ||||
|   } | ||||
|   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||
| {     | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|  | ||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||
|    | ||||
|   GridBase *FullGrid  = X._grid; | ||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   Lattice<vobj> Xslice(SliceGrid); | ||||
|   Lattice<vobj> Rslice(SliceGrid); | ||||
|    | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     ExtractSlice(Rslice,Y,i,Orthog); | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       ExtractSlice(Xslice,X,j,Orthog); | ||||
|       Rslice = Rslice + Xslice*(scale*aa(j,i)); | ||||
|     } | ||||
|     InsertSlice(Rslice,R,i,Orthog); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template<class vobj> | ||||
| static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)  | ||||
| { | ||||
|   // FIXME: Implementation is slow | ||||
|   // Not sure of best solution.. think about it | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_type scalar_type; | ||||
|   typedef typename vobj::vector_type vector_type; | ||||
|    | ||||
|   GridBase *FullGrid  = lhs._grid; | ||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||
|    | ||||
|   int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||
|    | ||||
|   Lattice<vobj> Lslice(SliceGrid); | ||||
|   Lattice<vobj> Rslice(SliceGrid); | ||||
|    | ||||
|   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||
|    | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     ExtractSlice(Lslice,lhs,i,Orthog); | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       ExtractSlice(Rslice,rhs,j,Orthog); | ||||
|       mat(i,j) = innerProduct(Lslice,Rslice); | ||||
|     } | ||||
|   } | ||||
| #undef FORCE_DIAG | ||||
| #ifdef FORCE_DIAG | ||||
|   for(int i=0;i<Nblock;i++){ | ||||
|     for(int j=0;j<Nblock;j++){ | ||||
|       if ( i != j ) mat(i,j)=0.0; | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|   return; | ||||
| } | ||||
|  | ||||
| } /*END NAMESPACE GRID*/ | ||||
| #endif | ||||
|  | ||||
|   | ||||
| @@ -110,8 +110,8 @@ public: | ||||
|   friend std::ostream& operator<< (std::ostream& stream, Logger& log){ | ||||
|  | ||||
|     if ( log.active ) { | ||||
|       stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : "; | ||||
|       stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : "; | ||||
|       stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : "; | ||||
|       stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : "; | ||||
|       if ( log.timestamp ) { | ||||
| 	StopWatch.Stop(); | ||||
| 	GridTime now = StopWatch.Elapsed(); | ||||
|   | ||||
| @@ -411,7 +411,6 @@ template <class S, class V, IfNotComplex<S> = 0> | ||||
| inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) { | ||||
|   nrot = nrot % Grid_simd<S, V>::Nsimd(); | ||||
|   Grid_simd<S, V> ret; | ||||
|   //    std::cout << "Rotate Real by "<<nrot<<std::endl; | ||||
|   ret.v = Optimization::Rotate::rotate(b.v, nrot); | ||||
|   return ret; | ||||
| } | ||||
| @@ -419,7 +418,6 @@ template <class S, class V, IfComplex<S> = 0> | ||||
| inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) { | ||||
|   nrot = nrot % Grid_simd<S, V>::Nsimd(); | ||||
|   Grid_simd<S, V> ret; | ||||
|   //    std::cout << "Rotate Complex by "<<nrot<<std::endl; | ||||
|   ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); | ||||
|   return ret; | ||||
| } | ||||
| @@ -427,14 +425,12 @@ template <class S, class V, IfNotComplex<S> =0> | ||||
| inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot) | ||||
| { | ||||
|   nrot = nrot % Grid_simd<S,V>::Nsimd(); | ||||
|   //    std::cout << "Rotate Real by "<<nrot<<std::endl; | ||||
|   ret.v = Optimization::Rotate::rotate(b.v,nrot); | ||||
| } | ||||
| template <class S, class V, IfComplex<S> =0>  | ||||
| inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot) | ||||
| { | ||||
|   nrot = nrot % Grid_simd<S,V>::Nsimd(); | ||||
|   //    std::cout << "Rotate Complex by "<<nrot<<std::endl; | ||||
|   ret.v = Optimization::Rotate::rotate(b.v,2*nrot); | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -58,10 +58,9 @@ class iScalar { | ||||
|   typedef typename GridTypeMapper<vtype>::vector_type vector_type; | ||||
|   typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD; | ||||
|   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v; | ||||
|   typedef iScalar<tensor_reduced_v> tensor_reduced; | ||||
|   typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object; | ||||
|   typedef iScalar<tensor_reduced_v> tensor_reduced; | ||||
|   typedef iScalar<recurse_scalar_object> scalar_object; | ||||
|  | ||||
|   // substitutes a real or complex version with same tensor structure | ||||
|   typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified; | ||||
|   typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified; | ||||
| @@ -78,8 +77,12 @@ class iScalar { | ||||
|   iScalar<vtype> & operator= (const iScalar<vtype> ©me) = default; | ||||
|   iScalar<vtype> & operator= (iScalar<vtype> &©me) = default; | ||||
|   */ | ||||
|   iScalar(scalar_type s) | ||||
|       : _internal(s){};  // recurse down and hit the constructor for vector_type | ||||
|  | ||||
|   //  template<int N=0> | ||||
|   //  iScalar(EnableIf<isSIMDvectorized<vector_type>, vector_type> s) : _internal(s){};  // recurse down and hit the constructor for vector_type | ||||
|  | ||||
|   iScalar(scalar_type s) : _internal(s){};  // recurse down and hit the constructor for vector_type | ||||
|  | ||||
|   iScalar(const Zero &z) { *this = zero; }; | ||||
|  | ||||
|   iScalar<vtype> &operator=(const Zero &hero) { | ||||
| @@ -135,33 +138,28 @@ class iScalar { | ||||
|   strong_inline const vtype &operator()(void) const { return _internal; } | ||||
|  | ||||
|   // Type casts meta programmed, must be pure scalar to match TensorRemove | ||||
|   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, | ||||
|             IfNotSimd<U> = 0> | ||||
|   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> | ||||
|   operator ComplexF() const { | ||||
|     return (TensorRemove(_internal)); | ||||
|   }; | ||||
|   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, | ||||
|             IfNotSimd<U> = 0> | ||||
|   template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> | ||||
|   operator ComplexD() const { | ||||
|     return (TensorRemove(_internal)); | ||||
|   }; | ||||
|   //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = | ||||
|   //  0> operator RealD    () const { return(real(TensorRemove(_internal))); } | ||||
|   template <class U = vtype, class V = scalar_type, IfReal<V> = 0, | ||||
|             IfNotSimd<U> = 0> | ||||
|   template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> | ||||
|   operator RealD() const { | ||||
|     return TensorRemove(_internal); | ||||
|   } | ||||
|   template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, | ||||
|             IfNotSimd<U> = 0> | ||||
|   template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> | ||||
|   operator Integer() const { | ||||
|     return Integer(TensorRemove(_internal)); | ||||
|   } | ||||
|  | ||||
|   // convert from a something to a scalar via constructor of something arg | ||||
|   template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type | ||||
|                          * = nullptr> | ||||
|   strong_inline iScalar<vtype> operator=(T arg) { | ||||
|   template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr> | ||||
|     strong_inline iScalar<vtype> operator=(T arg) { | ||||
|     _internal = arg; | ||||
|     return *this; | ||||
|   } | ||||
|   | ||||
| @@ -252,7 +252,8 @@ namespace Grid { | ||||
|   template<typename T> | ||||
|   class isSIMDvectorized{ | ||||
|     template<typename U> | ||||
|     static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *); | ||||
|     static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,    | ||||
|       typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *); | ||||
|  | ||||
|     template<typename U> | ||||
|     static double test(...); | ||||
|   | ||||
| @@ -51,7 +51,7 @@ int main (int argc, char ** argv) | ||||
|   typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;  | ||||
|   typename ImprovedStaggeredFermion5DR::ImplParams params;  | ||||
|  | ||||
|   const int Ls=8; | ||||
|   const int Ls=4; | ||||
|  | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
| @@ -76,24 +76,44 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   RealD mass=0.01; | ||||
|   ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); | ||||
|  | ||||
|   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds); | ||||
|  | ||||
|   ConjugateGradient<FermionField> CG(1.0e-8,10000); | ||||
|   BlockConjugateGradient<FermionField> BCG(1.0e-8,10000); | ||||
|   MultiRHSConjugateGradient<FermionField> mCG(1.0e-8,10000); | ||||
|  | ||||
|   std::cout << GridLogMessage << " Calling CG "<<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass); | ||||
|   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d); | ||||
|   FermionField src4d(UGrid); random(pRNG,src4d); | ||||
|   FermionField result4d(UGrid); result4d=zero; | ||||
|   CG(HermOp4d,src4d,result4d); | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|  | ||||
|  | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   result=zero; | ||||
|   CG(HermOp,src,result); | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|  | ||||
|   std::cout << GridLogMessage << " Calling multiRHS CG "<<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   result=zero; | ||||
|   mCG(HermOp,src,result); | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|  | ||||
|   std::cout << GridLogMessage << " Calling Block CG "<<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl; | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|   result=zero; | ||||
|   BCG(HermOp,src,result); | ||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||
|  | ||||
|  | ||||
|   Grid_finalize(); | ||||
| } | ||||
|   | ||||
| @@ -76,7 +76,6 @@ int main (int argc, char ** argv) | ||||
|   ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); | ||||
|  | ||||
|   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds); | ||||
|   ConjugateGradient<FermionField> CG(1.0e-8,10000); | ||||
|   CG(HermOp,src,result); | ||||
|  | ||||
|   Grid_finalize(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user