mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-25 18:19:34 +01:00 
			
		
		
		
	Much faster coarsening
This commit is contained in:
		| @@ -1,3 +1,14 @@ | |||||||
|  |     // blockZaxpy in bockPromote - 3s, 5% | ||||||
|  |     // noncoalesced linalg in Preconditionoer ~ 3s 5% | ||||||
|  |     // Lancos tuning or replace 10-20s ~ 25%, open ended | ||||||
|  |     // setup tuning   5s  ~  8% | ||||||
|  |     //    -- e.g. ordermin, orderstep tunables. | ||||||
|  |     // MdagM path without norm in LinOp code.     few seconds | ||||||
|  |  | ||||||
|  |     // Mdir calc blocking kernels | ||||||
|  |     // Fuse kernels in blockMaskedInnerProduct | ||||||
|  |     // preallocate Vectors in Cayley 5D ~ few percent few seconds | ||||||
|  |  | ||||||
| /************************************************************************************* | /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
| @@ -34,6 +45,34 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  | template<class vobj,class CComplex> | ||||||
|  | inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner1, | ||||||
|  | 				    Lattice<CComplex> &CoarseInner2, | ||||||
|  | 				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask1, | ||||||
|  | 				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask2, | ||||||
|  | 				    const Lattice<vobj> &fineX, | ||||||
|  | 				    const Lattice<vobj> &fineY) | ||||||
|  | { | ||||||
|  |   typedef decltype(innerProduct(vobj(),vobj())) dotp; | ||||||
|  |  | ||||||
|  |   GridBase *coarse(CoarseInner1.Grid()); | ||||||
|  |   GridBase *fine  (fineX.Grid()); | ||||||
|  |  | ||||||
|  |   Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); | ||||||
|  |   Lattice<dotp> fine_inner_msk(fine); | ||||||
|  |  | ||||||
|  |   // Multiply could be fused with innerProduct | ||||||
|  |   // Single block sum kernel could do both masks. | ||||||
|  |   fine_inner = localInnerProduct(fineX,fineY); | ||||||
|  |  | ||||||
|  |   mult(fine_inner_msk, fine_inner,FineMask1); | ||||||
|  |   blockSum(CoarseInner1,fine_inner_msk); | ||||||
|  |  | ||||||
|  |   mult(fine_inner_msk, fine_inner,FineMask2); | ||||||
|  |   blockSum(CoarseInner2,fine_inner_msk); | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
| class Geometry { | class Geometry { | ||||||
| public: | public: | ||||||
|   int npoint; |   int npoint; | ||||||
| @@ -51,10 +90,10 @@ public: | |||||||
|     directions.resize(npoint); |     directions.resize(npoint); | ||||||
|     displacements.resize(npoint); |     displacements.resize(npoint); | ||||||
|     for(int d=0;d<_d;d++){ |     for(int d=0;d<_d;d++){ | ||||||
|       directions[2*d  ] = d+base; |       directions[d   ] = d+base; | ||||||
|       directions[2*d+1] = d+base; |       directions[d+_d] = d+base; | ||||||
|       displacements[2*d  ] = +1; |       displacements[d  ] = +1; | ||||||
|       displacements[2*d+1] = -1; |       displacements[d+_d]= -1; | ||||||
|     } |     } | ||||||
|     directions   [2*_d]=0; |     directions   [2*_d]=0; | ||||||
|     displacements[2*_d]=0; |     displacements[2*_d]=0; | ||||||
| @@ -136,20 +175,15 @@ public: | |||||||
|     std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl; |     std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl; | ||||||
|   } |   } | ||||||
|   void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ |   void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ | ||||||
|     //    std::cout << GridLogMessage<< "BlockPromote"<<std::endl; |  | ||||||
|     blockProject(CoarseVec,FineVec,subspace); |     blockProject(CoarseVec,FineVec,subspace); | ||||||
|     //    std::cout << GridLogMessage<< "BlockPromote"<<std::endl; |  | ||||||
|   } |   } | ||||||
|   void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ |   void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ | ||||||
|     FineVec.Checkerboard() = subspace[0].Checkerboard(); |     FineVec.Checkerboard() = subspace[0].Checkerboard(); | ||||||
|     //    std::cout << GridLogMessage<< "BlockPromote"<<std::endl; |  | ||||||
|     blockPromote(CoarseVec,FineVec,subspace); |     blockPromote(CoarseVec,FineVec,subspace); | ||||||
|     //    std::cout << GridLogMessage<< "BlockPromote done"<<std::endl; |  | ||||||
|   } |   } | ||||||
|   void CreateSubspaceRandom(GridParallelRNG &RNG){ |   void CreateSubspaceRandom(GridParallelRNG &RNG){ | ||||||
|     for(int i=0;i<nbasis;i++){ |     for(int i=0;i<nbasis;i++){ | ||||||
|       random(RNG,subspace[i]); |       random(RNG,subspace[i]); | ||||||
|       //      std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl; |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -190,12 +224,15 @@ public: | |||||||
|   // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) |   // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) | ||||||
|   // and this is the best I found |   // and this is the best I found | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #if 1 | ||||||
|   virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop, |   virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop, | ||||||
| 				       int nn, | 				       int nn, | ||||||
| 				       double hi, | 				       double hi, | ||||||
| 				       double lo, | 				       double lo, | ||||||
| 				       int order, | 				       int orderfilter, | ||||||
| 				       int orderstep | 				       int ordermin, | ||||||
|  | 				       int orderstep, | ||||||
|  | 				       double filterlo | ||||||
| 				       ) { | 				       ) { | ||||||
|  |  | ||||||
|     RealD scale; |     RealD scale; | ||||||
| @@ -215,7 +252,7 @@ public: | |||||||
|     int b =0; |     int b =0; | ||||||
|     { |     { | ||||||
|       // Filter |       // Filter | ||||||
|       Chebyshev<FineField> Cheb(lo,hi,order); |       Chebyshev<FineField> Cheb(lo,hi,orderfilter); | ||||||
|       Cheb(hermop,noise,Mn); |       Cheb(hermop,noise,Mn); | ||||||
|       // normalise |       // normalise | ||||||
|       scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale; |       scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale; | ||||||
| @@ -227,7 +264,7 @@ public: | |||||||
|  |  | ||||||
|     // Generate a full sequence of Chebyshevs |     // Generate a full sequence of Chebyshevs | ||||||
|     { |     { | ||||||
|       lo=0; |       lo=filterlo; | ||||||
|       noise=Mn; |       noise=Mn; | ||||||
|  |  | ||||||
|       FineField T0(FineGrid); T0 = noise;   |       FineField T0(FineGrid); T0 = noise;   | ||||||
| @@ -245,7 +282,7 @@ public: | |||||||
|       hermop.HermOp(T0,y); |       hermop.HermOp(T0,y); | ||||||
|       T1=y*xscale+noise*mscale; |       T1=y*xscale+noise*mscale; | ||||||
|  |  | ||||||
|       for(int n=2;n<=orderstep*(nn-1);n++){ |       for(int n=2;n<=ordermin+orderstep*(nn-2);n++){ | ||||||
| 	 | 	 | ||||||
| 	hermop.HermOp(*Tn,y); | 	hermop.HermOp(*Tn,y); | ||||||
|  |  | ||||||
| @@ -253,6 +290,7 @@ public: | |||||||
| 	auto Tn_v = Tn->View(); | 	auto Tn_v = Tn->View(); | ||||||
| 	auto Tnp_v = Tnp->View(); | 	auto Tnp_v = Tnp->View(); | ||||||
| 	auto Tnm_v = Tnm->View(); | 	auto Tnm_v = Tnm->View(); | ||||||
|  | 	const int Nsimd = CComplex::Nsimd(); | ||||||
| 	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { | 	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { | ||||||
| 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); | 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); | ||||||
| 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); | 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); | ||||||
| @@ -260,12 +298,14 @@ public: | |||||||
|  |  | ||||||
| 	// Possible more fine grained control is needed than a linear sweep, | 	// Possible more fine grained control is needed than a linear sweep, | ||||||
| 	// but huge productivity gain if this is simple algorithm and not a tunable | 	// but huge productivity gain if this is simple algorithm and not a tunable | ||||||
| 	if ( (n%orderstep)==0 ) {  | 	int m =1; | ||||||
|  | 	if ( n>=ordermin ) m=n-ordermin; | ||||||
|  | 	if ( (m%orderstep)==0 ) {  | ||||||
| 	  Mn=*Tnp; | 	  Mn=*Tnp; | ||||||
| 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale; | 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale; | ||||||
| 	  subspace[b] = Mn; | 	  subspace[b] = Mn; | ||||||
| 	  hermop.Op(Mn,tmp);  | 	  hermop.Op(Mn,tmp);  | ||||||
| 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; | 	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; | ||||||
| 	  b++; | 	  b++; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| @@ -279,6 +319,202 @@ public: | |||||||
|     } |     } | ||||||
|     assert(b==nn); |     assert(b==nn); | ||||||
|   } |   } | ||||||
|  | #endif | ||||||
|  | #if 0 | ||||||
|  |   virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop, | ||||||
|  | 				       int nn, | ||||||
|  | 				       double hi, | ||||||
|  | 				       double lo, | ||||||
|  | 				       int orderfilter, | ||||||
|  | 				       int ordermin, | ||||||
|  | 				       int orderstep, | ||||||
|  | 				       double filterlo | ||||||
|  | 				       ) { | ||||||
|  |  | ||||||
|  |     RealD scale; | ||||||
|  |  | ||||||
|  |     FineField noise(FineGrid); | ||||||
|  |     FineField Mn(FineGrid); | ||||||
|  |     FineField tmp(FineGrid); | ||||||
|  |     FineField combined(FineGrid); | ||||||
|  |  | ||||||
|  |     // New normalised noise | ||||||
|  |     gaussian(RNG,noise); | ||||||
|  |     scale = std::pow(norm2(noise),-0.5);  | ||||||
|  |     noise=noise*scale; | ||||||
|  |  | ||||||
|  |     // Initial matrix element | ||||||
|  |     hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl; | ||||||
|  |  | ||||||
|  |     int b =0; | ||||||
|  | #define FILTERb(llo,hhi,oorder)						\ | ||||||
|  |     {									\ | ||||||
|  |       Chebyshev<FineField> Cheb(llo,hhi,oorder);			\ | ||||||
|  |       Cheb(hermop,noise,Mn);						\ | ||||||
|  |       scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\ | ||||||
|  |       subspace[b]   = Mn;						\ | ||||||
|  |       hermop.Op(Mn,tmp);						\ | ||||||
|  |       std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \ | ||||||
|  |       b++;								\ | ||||||
|  |     }									 | ||||||
|  |  | ||||||
|  |     //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\ | ||||||
|  |  | ||||||
|  |     RealD alpha=-0.8; | ||||||
|  |     RealD beta =-0.8; | ||||||
|  | #define FILTER(llo,hhi,oorder)						\ | ||||||
|  |     {									\ | ||||||
|  |       Chebyshev<FineField> Cheb(llo,hhi,oorder);			\ | ||||||
|  |       /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\ | ||||||
|  |       Cheb(hermop,noise,Mn);						\ | ||||||
|  |       scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\ | ||||||
|  |       subspace[b]   = Mn;						\ | ||||||
|  |       hermop.Op(Mn,tmp);						\ | ||||||
|  |       std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \ | ||||||
|  |       b++;								\ | ||||||
|  |     }									 | ||||||
|  |      | ||||||
|  | #define FILTERc(llo,hhi,oorder)				\ | ||||||
|  |     {							\ | ||||||
|  |       Chebyshev<FineField> Cheb(llo,hhi,oorder);	\ | ||||||
|  |       Cheb(hermop,noise,combined);			\ | ||||||
|  |     }									 | ||||||
|  |  | ||||||
|  |     double node = 0.000; | ||||||
|  |     FILTERb(lo,hi,orderfilter);// 0 | ||||||
|  |     //    FILTERc(node,hi,51);// 0 | ||||||
|  |     noise = Mn; | ||||||
|  |     int base = 0; | ||||||
|  |     int mult = 100; | ||||||
|  |     FILTER(node,hi,base+1*mult); | ||||||
|  |     FILTER(node,hi,base+2*mult); | ||||||
|  |     FILTER(node,hi,base+3*mult); | ||||||
|  |     FILTER(node,hi,base+4*mult); | ||||||
|  |     FILTER(node,hi,base+5*mult); | ||||||
|  |     FILTER(node,hi,base+6*mult); | ||||||
|  |     FILTER(node,hi,base+7*mult); | ||||||
|  |     FILTER(node,hi,base+8*mult); | ||||||
|  |     FILTER(node,hi,base+9*mult); | ||||||
|  |     FILTER(node,hi,base+10*mult); | ||||||
|  |     FILTER(node,hi,base+11*mult); | ||||||
|  |     FILTER(node,hi,base+12*mult); | ||||||
|  |     FILTER(node,hi,base+13*mult); | ||||||
|  |     FILTER(node,hi,base+14*mult); | ||||||
|  |     FILTER(node,hi,base+15*mult); | ||||||
|  |     assert(b==nn); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #if 0 | ||||||
|  |   virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop, | ||||||
|  | 				       int nn, | ||||||
|  | 				       double hi, | ||||||
|  | 				       double lo, | ||||||
|  | 				       int orderfilter, | ||||||
|  | 				       int ordermin, | ||||||
|  | 				       int orderstep, | ||||||
|  | 				       double filterlo | ||||||
|  | 				       ) { | ||||||
|  |  | ||||||
|  |     RealD scale; | ||||||
|  |  | ||||||
|  |     FineField noise(FineGrid); | ||||||
|  |     FineField Mn(FineGrid); | ||||||
|  |     FineField tmp(FineGrid); | ||||||
|  |     FineField combined(FineGrid); | ||||||
|  |  | ||||||
|  |     // New normalised noise | ||||||
|  |     gaussian(RNG,noise); | ||||||
|  |     scale = std::pow(norm2(noise),-0.5);  | ||||||
|  |     noise=noise*scale; | ||||||
|  |  | ||||||
|  |     // Initial matrix element | ||||||
|  |     hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl; | ||||||
|  |  | ||||||
|  |     int b =0; | ||||||
|  |     {						 | ||||||
|  |       Chebyshev<FineField> JacobiPoly(0.005,60.,1500); | ||||||
|  |       //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5); | ||||||
|  |       //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5); | ||||||
|  |       //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5); | ||||||
|  |       JacobiPoly(hermop,noise,Mn); | ||||||
|  |       scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; | ||||||
|  |       subspace[b]   = Mn; | ||||||
|  |       hermop.Op(Mn,tmp); | ||||||
|  |       std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;  | ||||||
|  |       b++; | ||||||
|  |       //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale; | ||||||
|  |       //      subspace[b]   = tmp;      b++; | ||||||
|  |       //    }									 | ||||||
|  |     }									 | ||||||
|  |  | ||||||
|  | #define FILTER(lambda)						\ | ||||||
|  |     {								\ | ||||||
|  |       hermop.HermOp(subspace[0],tmp);				\ | ||||||
|  |       tmp = tmp - lambda *subspace[0];				\ | ||||||
|  |       scale = std::pow(norm2(tmp),-0.5);			\ | ||||||
|  |       tmp=tmp*scale;							\ | ||||||
|  |       subspace[b]   = tmp;						\ | ||||||
|  |       hermop.Op(subspace[b],tmp);					\ | ||||||
|  |       std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \ | ||||||
|  |       b++;								\ | ||||||
|  |     }									 | ||||||
|  |     //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale; | ||||||
|  |     //      subspace[b]   = tmp;      b++; | ||||||
|  |     //    }									 | ||||||
|  |  | ||||||
|  |     FILTER(2.0e-5); | ||||||
|  |     FILTER(2.0e-4); | ||||||
|  |     FILTER(4.0e-4); | ||||||
|  |     FILTER(8.0e-4); | ||||||
|  |     FILTER(8.0e-4); | ||||||
|  |  | ||||||
|  |     FILTER(2.0e-3); | ||||||
|  |     FILTER(3.0e-3); | ||||||
|  |     FILTER(4.0e-3); | ||||||
|  |     FILTER(5.0e-3); | ||||||
|  |     FILTER(6.0e-3); | ||||||
|  |  | ||||||
|  |     FILTER(2.5e-3); | ||||||
|  |     FILTER(3.5e-3); | ||||||
|  |     FILTER(4.5e-3); | ||||||
|  |     FILTER(5.5e-3); | ||||||
|  |     FILTER(6.5e-3); | ||||||
|  |  | ||||||
|  |     //    FILTER(6.0e-5);//6 | ||||||
|  |     //    FILTER(7.0e-5);//8 | ||||||
|  |     //    FILTER(8.0e-5);//9 | ||||||
|  |     //    FILTER(9.0e-5);//3 | ||||||
|  |  | ||||||
|  |     /* | ||||||
|  |     //    FILTER(1.0e-4);//10 | ||||||
|  |     FILTER(2.0e-4);//11 | ||||||
|  |     //   FILTER(3.0e-4);//12 | ||||||
|  |     //    FILTER(4.0e-4);//13 | ||||||
|  |     FILTER(5.0e-4);//14 | ||||||
|  |  | ||||||
|  |     FILTER(6.0e-3);//4 | ||||||
|  |     FILTER(7.0e-4);//1 | ||||||
|  |     FILTER(8.0e-4);//7 | ||||||
|  |     FILTER(9.0e-4);//15 | ||||||
|  |     FILTER(1.0e-3);//2 | ||||||
|  |  | ||||||
|  |     FILTER(2.0e-3);//2 | ||||||
|  |     FILTER(3.0e-3);//2 | ||||||
|  |     FILTER(4.0e-3);//2 | ||||||
|  |     FILTER(5.0e-3);//2 | ||||||
|  |     FILTER(6.0e-3);//2 | ||||||
|  |  | ||||||
|  |     FILTER(7.0e-3);//2 | ||||||
|  |     FILTER(8.0e-3);//2 | ||||||
|  |     FILTER(1.0e-2);//2 | ||||||
|  |     */ | ||||||
|  |     std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl; | ||||||
|  |     assert(b==nn); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| // Fine Object == (per site) type of fine field | // Fine Object == (per site) type of fine field | ||||||
| @@ -287,7 +523,8 @@ template<class Fobj,class CComplex,int nbasis> | |||||||
| class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  { | class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  { | ||||||
| public: | public: | ||||||
|      |      | ||||||
|   typedef iVector<CComplex,nbasis >             siteVector; |   typedef iVector<CComplex,nbasis >           siteVector; | ||||||
|  |   typedef Lattice<CComplex >                  CoarseComplexField; | ||||||
|   typedef Lattice<siteVector>                 CoarseVector; |   typedef Lattice<siteVector>                 CoarseVector; | ||||||
|   typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix; |   typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix; | ||||||
|   typedef iMatrix<CComplex,nbasis >  Cobj; |   typedef iMatrix<CComplex,nbasis >  Cobj; | ||||||
| @@ -305,7 +542,6 @@ public: | |||||||
|  |  | ||||||
|   std::vector<CoarseMatrix> A; |   std::vector<CoarseMatrix> A; | ||||||
|        |        | ||||||
|        |  | ||||||
|   /////////////////////// |   /////////////////////// | ||||||
|   // Interface |   // Interface | ||||||
|   /////////////////////// |   /////////////////////// | ||||||
| @@ -316,7 +552,6 @@ public: | |||||||
|     conformable(_grid,in.Grid()); |     conformable(_grid,in.Grid()); | ||||||
|     conformable(in.Grid(),out.Grid()); |     conformable(in.Grid(),out.Grid()); | ||||||
|  |  | ||||||
|  |  | ||||||
|     //    RealD Nin = norm2(in); |     //    RealD Nin = norm2(in); | ||||||
|     SimpleCompressor<siteVector> compressor; |     SimpleCompressor<siteVector> compressor; | ||||||
|  |  | ||||||
| @@ -333,16 +568,14 @@ public: | |||||||
|     Aview *Aview_p = & AcceleratorViewContainer[0]; |     Aview *Aview_p = & AcceleratorViewContainer[0]; | ||||||
|  |  | ||||||
|     const int Nsimd = CComplex::Nsimd(); |     const int Nsimd = CComplex::Nsimd(); | ||||||
|  |  | ||||||
|     typedef decltype(coalescedRead(in_v[0])) calcVector; |     typedef decltype(coalescedRead(in_v[0])) calcVector; | ||||||
|     typedef decltype(coalescedRead(in_v[0](0))) calcComplex; |     typedef decltype(coalescedRead(in_v[0](0))) calcComplex; | ||||||
|  |  | ||||||
|     GridStopWatch ArithmeticTimer; |     GridStopWatch ArithmeticTimer; | ||||||
|     int osites=Grid()->oSites(); |     int osites=Grid()->oSites(); | ||||||
|     double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; |     //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; | ||||||
|     double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); |     //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); | ||||||
|     double usecs =-usecond(); |     double usecs =-usecond(); | ||||||
|  |  | ||||||
|     // assert(geom.npoint==9); |     // assert(geom.npoint==9); | ||||||
|  |  | ||||||
|     accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { |     accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { | ||||||
| @@ -418,7 +651,37 @@ public: | |||||||
|  |  | ||||||
|     auto out_v = out.View(); |     auto out_v = out.View(); | ||||||
|     auto in_v  = in.View(); |     auto in_v  = in.View(); | ||||||
|  |  | ||||||
|  |     const int Nsimd = CComplex::Nsimd(); | ||||||
|  |     typedef decltype(coalescedRead(in_v[0])) calcVector; | ||||||
|  |     typedef decltype(coalescedRead(in_v[0](0))) calcComplex; | ||||||
|  |  | ||||||
|  |     accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { | ||||||
|  |       int ss = sss/nbasis; | ||||||
|  |       int b  = sss%nbasis; | ||||||
|  |       calcComplex res = Zero(); | ||||||
|  |       calcVector nbr; | ||||||
|  |       int ptype; | ||||||
|  |       StencilEntry *SE; | ||||||
|  |  | ||||||
|  |       int lane=SIMTlane(Nsimd); | ||||||
|  |       SE=Stencil.GetEntry(ptype,point,ss); | ||||||
|  | 	   | ||||||
|  |       if(SE->_is_local) {  | ||||||
|  | 	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); | ||||||
|  |       } else { | ||||||
|  | 	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); | ||||||
|  |       } | ||||||
|  |       synchronise(); | ||||||
|  |  | ||||||
|  |       for(int bb=0;bb<nbasis;bb++) { | ||||||
|  | 	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb); | ||||||
|  |       } | ||||||
|  |       coalescedWrite(out_v[ss](b),res,lane); | ||||||
|  |     }); | ||||||
|  | #if 0 | ||||||
|     accelerator_for(ss,Grid()->oSites(),1,{ |     accelerator_for(ss,Grid()->oSites(),1,{ | ||||||
|  |  | ||||||
|       siteVector res = Zero(); |       siteVector res = Zero(); | ||||||
|       siteVector nbr; |       siteVector nbr; | ||||||
|       int ptype; |       int ptype; | ||||||
| @@ -433,18 +696,23 @@ public: | |||||||
|       } else { |       } else { | ||||||
| 	nbr = Stencil.CommBuf()[SE->_offset]; | 	nbr = Stencil.CommBuf()[SE->_offset]; | ||||||
|       } |       } | ||||||
|  |       synchronise(); | ||||||
|  |  | ||||||
|       res = res + Aview_p[point][ss]*nbr; |       res = res + Aview_p[point][ss]*nbr; | ||||||
|        |        | ||||||
|       out_v[ss]=res; |       out_v[ss]=res; | ||||||
|     }); |     }); | ||||||
|  | #endif | ||||||
|   } |   } | ||||||
|   void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) |   void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) | ||||||
|   { |   { | ||||||
|     this->MdirComms(in); |     this->MdirComms(in); | ||||||
|     int ndir=geom.npoint-1; |     int ndir=geom.npoint-1; | ||||||
|     assert(out.size()==ndir); |     if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {  | ||||||
|  |       std::cout <<"MdirAll out size "<< out.size()<<std::endl; | ||||||
|  |       std::cout <<"MdirAll ndir "<< ndir<<std::endl; | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|     for(int p=0;p<ndir;p++){ |     for(int p=0;p<ndir;p++){ | ||||||
|       MdirCalc(in,out[p],p); |       MdirCalc(in,out[p],p); | ||||||
|     } |     } | ||||||
| @@ -496,28 +764,45 @@ public: | |||||||
|     geom(CoarseGrid._ndimension), |     geom(CoarseGrid._ndimension), | ||||||
|     hermitian(hermitian_), |     hermitian(hermitian_), | ||||||
|     Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), |     Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), | ||||||
|     A(geom.npoint,&CoarseGrid) |       A(geom.npoint,&CoarseGrid) | ||||||
|   { |   { | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, |   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, | ||||||
| 		       Aggregation<Fobj,CComplex,nbasis> & Subspace){ | 		       Aggregation<Fobj,CComplex,nbasis> & Subspace) | ||||||
|  |   { | ||||||
|  |     typedef Lattice<typename Fobj::tensor_reduced> FineComplexField; | ||||||
|  |     typedef typename Fobj::scalar_type scalar_type; | ||||||
|  |  | ||||||
|     FineField iblock(FineGrid); // contributions from within this block |     FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); | ||||||
|     FineField oblock(FineGrid); // contributions from outwith this block |     FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); | ||||||
|  |  | ||||||
|  |     std::vector<FineComplexField> masks(geom.npoint,FineGrid); | ||||||
|  |     FineComplexField imask(FineGrid); // contributions from within this block | ||||||
|  |     FineComplexField omask(FineGrid); // contributions from outwith this block | ||||||
|  |  | ||||||
|  |     FineComplexField evenmask(FineGrid); | ||||||
|  |     FineComplexField oddmask(FineGrid);  | ||||||
|  |  | ||||||
|     FineField     phi(FineGrid); |     FineField     phi(FineGrid); | ||||||
|     FineField     tmp(FineGrid); |     FineField     tmp(FineGrid); | ||||||
|     FineField     zz(FineGrid); zz=Zero(); |     FineField     zz(FineGrid); zz=Zero(); | ||||||
|     FineField    Mphi(FineGrid); |     FineField    Mphi(FineGrid); | ||||||
|  |     FineField    Mphie(FineGrid); | ||||||
|  |     FineField    Mphio(FineGrid); | ||||||
|     std::vector<FineField>     Mphi_p(geom.npoint,FineGrid); |     std::vector<FineField>     Mphi_p(geom.npoint,FineGrid); | ||||||
|  |  | ||||||
|     Lattice<iScalar<vInteger> > coor(FineGrid); |     Lattice<iScalar<vInteger> > coor (FineGrid); | ||||||
|  |     Lattice<iScalar<vInteger> > bcoor(FineGrid); | ||||||
|  |     Lattice<iScalar<vInteger> > bcb  (FineGrid); | ||||||
|  |  | ||||||
|     CoarseVector iProj(Grid());  |     CoarseVector iProj(Grid());  | ||||||
|     CoarseVector oProj(Grid());  |     CoarseVector oProj(Grid());  | ||||||
|     CoarseScalar InnerProd(Grid());  |     CoarseVector SelfProj(Grid());  | ||||||
|  |     CoarseComplexField iZProj(Grid());  | ||||||
|  |     CoarseComplexField oZProj(Grid());  | ||||||
|  |  | ||||||
|  |     CoarseScalar InnerProd(Grid());  | ||||||
|  |  | ||||||
|     // Orthogonalise the subblocks over the basis |     // Orthogonalise the subblocks over the basis | ||||||
|     blockOrthogonalise(InnerProd,Subspace.subspace); |     blockOrthogonalise(InnerProd,Subspace.subspace); | ||||||
| @@ -525,22 +810,46 @@ public: | |||||||
|     // Compute the matrix elements of linop between this orthonormal |     // Compute the matrix elements of linop between this orthonormal | ||||||
|     // set of vectors. |     // set of vectors. | ||||||
|     int self_stencil=-1; |     int self_stencil=-1; | ||||||
|     for(int p=0;p<geom.npoint;p++){  |     for(int p=0;p<geom.npoint;p++) | ||||||
|  |     {  | ||||||
|  |       int dir   = geom.directions[p]; | ||||||
|  |       int disp  = geom.displacements[p]; | ||||||
|       A[p]=Zero(); |       A[p]=Zero(); | ||||||
|       if( geom.displacements[p]==0){ |       if( geom.displacements[p]==0){ | ||||||
| 	self_stencil=p; | 	self_stencil=p; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|  |       Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]); | ||||||
|  |  | ||||||
|  |       LatticeCoordinate(coor,dir); | ||||||
|  |  | ||||||
|  |       /////////////////////////////////////////////////////// | ||||||
|  |       // Work out even and odd block checkerboarding for fast diagonal term | ||||||
|  |       /////////////////////////////////////////////////////// | ||||||
|  |       if ( disp==1 ) { | ||||||
|  | 	bcb   = bcb + div(coor,block); | ||||||
|  |       } | ||||||
|  | 	 | ||||||
|  |       if ( disp==0 ) { | ||||||
|  | 	  masks[p]= Zero(); | ||||||
|  |       } else if ( disp==1 ) { | ||||||
|  | 	masks[p] = where(mod(coor,block)==(block-1),one,zero); | ||||||
|  |       } else if ( disp==-1 ) { | ||||||
|  | 	masks[p] = where(mod(coor,block)==(Integer)0,one,zero); | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|  |     evenmask = where(mod(bcb,2)==(Integer)0,one,zero); | ||||||
|  |     oddmask  = one-evenmask; | ||||||
|  |  | ||||||
|     assert(self_stencil!=-1); |     assert(self_stencil!=-1); | ||||||
|  |  | ||||||
|     for(int i=0;i<nbasis;i++){ |     for(int i=0;i<nbasis;i++){ | ||||||
|  |  | ||||||
|       phi=Subspace.subspace[i]; |       phi=Subspace.subspace[i]; | ||||||
|  |  | ||||||
|       std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir " << std::endl; |       //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl; | ||||||
|       linop.OpDirAll(phi,Mphi_p); |       linop.OpDirAll(phi,Mphi_p); | ||||||
|       std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir calculated" << std::endl; |  | ||||||
|       linop.OpDiag  (phi,Mphi_p[geom.npoint-1]); |       linop.OpDiag  (phi,Mphi_p[geom.npoint-1]); | ||||||
|       std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDiag calculated" << std::endl; |  | ||||||
|  |  | ||||||
|       for(int p=0;p<geom.npoint;p++){  |       for(int p=0;p<geom.npoint;p++){  | ||||||
|  |  | ||||||
| @@ -549,54 +858,66 @@ public: | |||||||
| 	int dir   = geom.directions[p]; | 	int dir   = geom.directions[p]; | ||||||
| 	int disp  = geom.displacements[p]; | 	int disp  = geom.displacements[p]; | ||||||
|  |  | ||||||
| 	Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]); | 	if ( (disp==-1) || (!hermitian ) ) { | ||||||
|  |  | ||||||
| 	LatticeCoordinate(coor,dir); | 	  //////////////////////////////////////////////////////////////////////// | ||||||
|  | 	  // Pick out contributions coming from this cell and neighbour cell | ||||||
|  | 	  //////////////////////////////////////////////////////////////////////// | ||||||
|  | 	  omask = masks[p]; | ||||||
|  | 	  imask = one-omask; | ||||||
| 	 | 	 | ||||||
|  | 	  for(int j=0;j<nbasis;j++){ | ||||||
| 	     | 	     | ||||||
| 	//////////////////////////////////////////////////////////////////////// | 	    blockMaskedInnerProduct(iZProj,oZProj,imask,omask,Subspace.subspace[j],Mphi); | ||||||
| 	// Pick out contributions coming from this cell and neighbour cell | 	     | ||||||
| 	//////////////////////////////////////////////////////////////////////// | 	    auto iZProj_v = iZProj.View() ; | ||||||
| 	if ( disp==0 ) { | 	    auto oZProj_v = oZProj.View() ; | ||||||
| 	  iblock = Mphi; | 	    auto A_p     =  A[p].View(); | ||||||
| 	  oblock = Zero(); | 	    auto A_self  = A[self_stencil].View(); | ||||||
| 	} else if ( disp==1 ) { |  | ||||||
| 	  oblock = where(mod(coor,block)==(block-1),Mphi,zz); | 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); | ||||||
| 	  iblock = where(mod(coor,block)!=(block-1),Mphi,zz); |  | ||||||
| 	} else if ( disp==-1 ) { | 	  } | ||||||
| 	  oblock = where(mod(coor,block)==(Integer)0,Mphi,zz); | 	} | ||||||
| 	  iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz); |       } | ||||||
| 	} else { |  | ||||||
| 	  assert(0); |       /////////////////////////////////////////// | ||||||
|  |       // Faster alternate self coupling.. use hermiticity to save 2x | ||||||
|  |       /////////////////////////////////////////// | ||||||
|  |       { | ||||||
|  | 	mult(tmp,phi,evenmask);  linop.Op(tmp,Mphie); | ||||||
|  | 	mult(tmp,phi,oddmask );   linop.Op(tmp,Mphio); | ||||||
|  |  | ||||||
|  | 	//	tmp = Mphie*evenmask + Mphio*oddmask; | ||||||
|  | 	{ | ||||||
|  | 	  auto tmp_      = tmp.View(); | ||||||
|  | 	  auto evenmask_ = evenmask.View(); | ||||||
|  | 	  auto oddmask_  =  oddmask.View(); | ||||||
|  | 	  auto Mphie_    =  Mphie.View(); | ||||||
|  | 	  auto Mphio_    =  Mphio.View(); | ||||||
|  | 	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{  | ||||||
|  | 	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); | ||||||
|  | 	    }); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Could do local inner products, | 	blockProject(SelfProj,tmp,Subspace.subspace); | ||||||
| 	// and then block pick the IP's. |  | ||||||
| 	// Ideally write a routine to do two masked block sums at once |  | ||||||
| 	std::cout << GridLogMessage<< "CoarsenMatrix picked "<<p<< std::endl; |  | ||||||
| 	Subspace.ProjectToSubspace(iProj,iblock); |  | ||||||
| 	Subspace.ProjectToSubspace(oProj,oblock); |  | ||||||
| 	std::cout << GridLogMessage<< "CoarsenMatrix projected"<<p<< std::endl; |  | ||||||
|  |  | ||||||
| 	// 4x gain possible in this loop. Profile and identify time loss. | 	auto SelfProj_ = SelfProj.View(); | ||||||
| 	// i)  Assume Hermiticity, upper diagonal only (2x) |  | ||||||
| 	// ii) Local inner product, then pick the local inners and sum. (2x) |  | ||||||
|  |  | ||||||
| 	auto iProj_v = iProj.View() ; |  | ||||||
| 	auto oProj_v = oProj.View() ; |  | ||||||
| 	auto A_p     =  A[p].View(); |  | ||||||
| 	auto A_self  = A[self_stencil].View(); | 	auto A_self  = A[self_stencil].View(); | ||||||
| 	accelerator_for(ss, Grid()->oSites(),1,{ | 	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ | ||||||
| 	  for(int j=0;j<nbasis;j++){ | 	  for(int j=0;j<nbasis;j++){ | ||||||
| 	    if( disp!= 0 ) { | 	    coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j)); | ||||||
| 	      A_p[ss](j,i) = oProj_v[ss](j); |  | ||||||
| 	    } |  | ||||||
| 	    A_self[ss](j,i) =	A_self[ss](j,i) + iProj_v[ss](j); |  | ||||||
| 	  } | 	  } | ||||||
| 	}); | 	}); | ||||||
|  |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     if(hermitian) { | ||||||
|  |       std::cout << GridLogMessage << " ForceHermitian "<<std::endl; | ||||||
|  |       ForceHermitian(); | ||||||
|  |     } | ||||||
|  |       // AssertHermitian(); | ||||||
|  |       // ForceDiagonal(); | ||||||
|  |   } | ||||||
|  |  | ||||||
| #if 0 | #if 0 | ||||||
|     /////////////////////////// |     /////////////////////////// | ||||||
| @@ -619,25 +940,26 @@ public: | |||||||
|     std::cout<<GridLogMessage<< iProj <<std::endl; |     std::cout<<GridLogMessage<< iProj <<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl; |     std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl; | ||||||
| #endif | #endif | ||||||
|     /* |  | ||||||
|     if(hermitian) { |  | ||||||
|       std::cout << GridLogMessage << " ForceHermitian "<<std::endl; |  | ||||||
|       ForceHermitian(); |  | ||||||
|     } |  | ||||||
|     for(int p=0;p<geom.npoint;p++){ |  | ||||||
|       std::cout << GridLogMessage<< " dir "<< norm2(A[p]) <<std::endl; |  | ||||||
|     } |  | ||||||
|     */ |  | ||||||
|       // AssertHermitian(); |  | ||||||
|       // ForceDiagonal(); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   void ForceHermitian(void) { |   void ForceHermitian(void) { | ||||||
|     for(int d=0;d<4;d++){ |     CoarseMatrix Diff  (Grid()); | ||||||
|       int dd=d+1; |     for(int p=0;p<geom.npoint;p++){ | ||||||
|       A[2*d] = adj(Cshift(A[2*d+1],dd,1)); |       int dir   = geom.directions[p]; | ||||||
|  |       int disp  = geom.displacements[p]; | ||||||
|  |       if(disp==-1) { | ||||||
|  | 	// Find the opposite link | ||||||
|  | 	for(int pp=0;pp<geom.npoint;pp++){ | ||||||
|  | 	  int dirp   = geom.directions[pp]; | ||||||
|  | 	  int dispp  = geom.displacements[pp]; | ||||||
|  | 	  if ( (dirp==dir) && (dispp==1) ){ | ||||||
|  | 	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp];  | ||||||
|  | 	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl; | ||||||
|  | 	    A[pp] = adj(Cshift(A[p],dir,1)); | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|     //      A[8] = 0.5*(A[8] + adj(A[8])); |  | ||||||
|   } |   } | ||||||
|   void AssertHermitian(void) { |   void AssertHermitian(void) { | ||||||
|     CoarseMatrix AA    (Grid()); |     CoarseMatrix AA    (Grid()); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user