From f2a4f1311113505a280cc8cdb67db22bbf3f7cf2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Dec 2019 19:32:12 -0500 Subject: [PATCH 01/43] Must offload the Coarsened matrix if Stencil buffers are device resident --- Grid/algorithms/CoarsenedMatrix.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 913f5c0c..45d5e8f7 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -128,7 +128,7 @@ public: for(int i=0;ioSites(),{ + accelerator_for(ss, CoarseGrid->oSites(),1,{ eProj[ss](i)=CComplex(1.0); }); eProj=eProj - iProj; @@ -307,10 +307,12 @@ public: RealD Nin = norm2(in); SimpleCompressor compressor; + Stencil.HaloExchange(in,compressor); + auto in_v = in.View(); auto out_v = out.View(); - thread_for(ss,Grid()->oSites(),{ + accelerator_for(ss,Grid()->oSites(),1,{ siteVector res = Zero(); siteVector nbr; int ptype; @@ -331,6 +333,7 @@ public: } vstream(out_v[ss],res); }); + RealD Nout= norm2(out); return Nout; }; @@ -356,6 +359,7 @@ public: conformable(in.Grid(),out.Grid()); SimpleCompressor compressor; + Stencil.HaloExchange(in,compressor); auto point = [dir, disp](){ @@ -367,7 +371,7 @@ public: auto out_v = out.View(); auto in_v = in.View(); - thread_for(ss,Grid()->oSites(),{ + accelerator_for(ss,Grid()->oSites(),1,{ siteVector res = Zero(); siteVector nbr; int ptype; From 710fee5d2601962419b7c3521423cbc4bc62462c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Dec 2019 21:48:42 -0500 Subject: [PATCH 02/43] Subspace setup testing code and timing verbose --- Grid/algorithms/CoarsenedMatrix.h | 70 +++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 45d5e8f7..c19bef19 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -115,9 +115,9 @@ public: void Orthogonalise(void){ CoarseScalar InnerProd(CoarseGrid); - std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"< &hermop,int nn=nbasis) { RealD scale; + const int dependent=4; - Chebyshev Cheb(0.1,64.0,900); + Chebyshev ChebDependent(1.0,64.0,100); + Chebyshev ChebFilt (0.1,64.0,900); FineField noise(FineGrid); FineField Mn(FineGrid); - for(int b=0;b "< "< "< "< "<oSites(),{ + accelerator_for(ss, Grid()->oSites(),1,{ for(int j=0;j Date: Tue, 10 Dec 2019 21:49:12 -0500 Subject: [PATCH 03/43] Some MPI (summit) create sigusr2, so trap that --- Grid/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 472013f4..570f4234 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -628,6 +628,7 @@ void Grid_debug_handler_init(void) sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); + sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); From d73f0b8618e1c4c84a5b0dc76d6b3e8e6dace17f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Dec 2019 21:50:06 -0500 Subject: [PATCH 04/43] Verbose for temporary debug --- Grid/threads/Pragmas.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h index d05f8ee9..4d713258 100644 --- a/Grid/threads/Pragmas.h +++ b/Grid/threads/Pragmas.h @@ -43,6 +43,7 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP #include +#warning "Grid is using OpenMP for host loops" #endif #ifdef GRID_OMP From 736b19485e1d7306f9894dcd42f5854102912780 Mon Sep 17 00:00:00 
2001 From: Peter Boyle Date: Fri, 13 Dec 2019 21:30:48 -0500 Subject: [PATCH 05/43] Faster set up and some dead code ifdef'ed out --- Grid/algorithms/CoarsenedMatrix.h | 58 +++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index c19bef19..7f729bbc 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -240,7 +240,7 @@ public: // // World of possibilities here. // Experiments - // i) Use inverse iteration method equivaleent with Chebyshve + // i) Use inverse iteration method equivaleent with Chebyshev // ii) Multiply by Fourier phases // iii) Multiply by Fourier phases and refilter // @@ -248,10 +248,31 @@ public: RealD scale; - const int dependent=4; + const int dependent=16; - Chebyshev ChebDependent(1.0,64.0,100); - Chebyshev ChebFilt (0.1,64.0,900); + Chebyshev ChebFilt (0.03,64.0,500); + Chebyshev ChebDependent(0.01,64.0,200); + +#if 0 + auto latt_size = FineGrid->GlobalDimensions(); + Coordinate Fourier[dependent] = { + Coordinate({0, 0,0,0,0}), + Coordinate({0, 1,0,0,0}), + Coordinate({0,-1,0,0,0}), + Coordinate({0,0, 1,0,0}), + Coordinate({0,0,-1,0,0}), + Coordinate({0,0,0, 1,0}), + Coordinate({0,0,0,-1,0}), + Coordinate({0,0,0,0, 1}), + Coordinate({0,0,0,0,-1}) + }; + + ComplexD ci(0.0,1.0); + Lattice C(FineGrid); + Lattice coor(FineGrid); + FineField save(FineGrid); + FineField tmp (FineGrid); +#endif FineField noise(FineGrid); FineField Mn(FineGrid); @@ -262,16 +283,29 @@ public: gaussian(RNG,noise); scale = std::pow(norm2(noise),-0.5); noise=noise*scale; + // save=noise; // Initial matrix element hermop.Op(noise,Mn); std::cout< "< "< "< Date: Fri, 13 Dec 2019 22:08:11 -0500 Subject: [PATCH 06/43] offload more of mgrid to GPU --- Grid/algorithms/CoarsenedMatrix.h | 32 ------------------------------- Grid/lattice/Lattice_transfer.h | 20 +++++++++++-------- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 7f729bbc..e47137f9 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -253,27 +253,6 @@ public: Chebyshev ChebFilt (0.03,64.0,500); Chebyshev ChebDependent(0.01,64.0,200); -#if 0 - auto latt_size = FineGrid->GlobalDimensions(); - Coordinate Fourier[dependent] = { - Coordinate({0, 0,0,0,0}), - Coordinate({0, 1,0,0,0}), - Coordinate({0,-1,0,0,0}), - Coordinate({0,0, 1,0,0}), - Coordinate({0,0,-1,0,0}), - Coordinate({0,0,0, 1,0}), - Coordinate({0,0,0,-1,0}), - Coordinate({0,0,0,0, 1}), - Coordinate({0,0,0,0,-1}) - }; - - ComplexD ci(0.0,1.0); - Lattice C(FineGrid); - Lattice coor(FineGrid); - FineField save(FineGrid); - FineField tmp (FineGrid); -#endif - FineField noise(FineGrid); FineField Mn(FineGrid); @@ -295,17 +274,6 @@ public: if(b==bb) { ChebFilt(hermop,noise,Mn); } else { -#if 0 - C=Zero(); - for(int mu=0;mu<5;mu++){ - RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; - LatticeCoordinate(coor,mu); - C = C + (TwoPiL * Fourier[dep][mu]) * coor; - } - C = exp(C*ci); // Fourier phase - noise=C*save; - hermop.Op(noise,Mn); std::cout< "< > &coarseData, auto fineData_ = fineData.View(); auto coarseData_ = coarseData.View(); - // Loop over coars parallel, and then loop over fine associated with coarse. + //////////////////////////////////////////////////////////////////////////////////////////////////////// + // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
+ // Otherwise do finee inner product per site, and make the update atomic + //////////////////////////////////////////////////////////////////////////////////////////////////////// thread_for( sf, fine->oSites(), { int sc; Coordinate coor_c(_ndimension); @@ -120,10 +123,11 @@ inline void blockProject(Lattice > &coarseData, for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - thread_critical { - for(int i=0;i &fineZ, auto fineY_ = fineY.View(); auto coarseA_= coarseA.View(); - thread_for(sf, fine->oSites(), { + accelerator_for(sf, fine->oSites(), 1, { int sc; Coordinate coor_c(_ndimension); @@ -196,7 +200,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - thread_for(ss, coarse->oSites(),{ + accelerator_for(ss, coarse->oSites(), 1, { CoarseInner_[ss] = coarse_inner_[ss]; }); } @@ -321,7 +325,7 @@ inline void blockPromote(const Lattice > &coarseData, auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - thread_for(sf,fine->oSites(),{ + acceelerator_for(sf,fine->oSites(),1,{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); From 152b525a4dc8b8211b8de001fe0333c9db11ac4d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Dec 2019 22:44:42 -0500 Subject: [PATCH 07/43] Typo fix --- Grid/lattice/Lattice_transfer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 02a9e91b..c1c3b542 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -325,7 +325,7 @@ inline void blockPromote(const Lattice > &coarseData, auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - acceelerator_for(sf,fine->oSites(),1,{ + accelerator_for(sf,fine->oSites(),1,{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); From 9e154749998eb0cfe4c5ada95d79ddaca1e3c4e3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 14 Dec 2019 05:28:16 -0500 Subject: [PATCH 08/43] Accelerator loop attempt at speed up --- Grid/lattice/Lattice_transfer.h | 57 ++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c1c3b542..9e4003b0 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -106,6 +106,7 @@ inline void blockProject(Lattice > &coarseData, block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); } + int blockVol = fine->oSites()/coarse->oSites(); coarseData=Zero(); @@ -113,20 +114,26 @@ inline void blockProject(Lattice > &coarseData, auto coarseData_ = coarseData.View(); //////////////////////////////////////////////////////////////////////////////////////////////////////// // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
- // Otherwise do finee inner product per site, and make the update atomic + // Otherwise do fine inner product per site, and make the update atomic //////////////////////////////////////////////////////////////////////////////////////////////////////// - thread_for( sf, fine->oSites(), { - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + accelerator_for( sc, coarse->oSites(), { - for(int i=0;i_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); + + for(int sb=0;sb_rdimensions); + + for(int i=0;i &coarseData,const Lattice &fineData) for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + int blockVol = fine->oSites()/coarse->oSites(); // Turn this around to loop threaded over sc and interior loop // over sf would thread better - coarseData=Zero(); auto coarseData_ = coarseData.View(); auto fineData_ = fineData.View(); - thread_for(sf,fine->oSites(),{ - int sc; + accelerator_for(sc,coarse->oSites(),1,{ + + // One thread per sub block Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - thread_critical { + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); + + for(int sb=0;sb_rdimensions); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; } From 9aafd204683487795733c0fbe1456e9b84f50179 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 Dec 2019 05:01:39 -0500 Subject: [PATCH 09/43] Simple block project promote runs faster on GPU --- Grid/lattice/Lattice_transfer.h | 72 +++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 9e4003b0..0041f47a 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -1,5 +1,4 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/Lattice_transfer.h @@ -83,12 +82,35 @@ template inline void setCheckerboard(Lattice &full,const Latti }); } - template inline void blockProject(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + Lattice ip(coarse); + + // auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); + auto ip_ = ip.View(); + for(int v=0;voSites(), vobj::Nsimd(), { + coalescedWrite(coarseData_[sc](v),ip_(sc)); + }); + } +} + +template +inline void blockProject1(Lattice > &coarseData, const Lattice &fineData, const std::vector > &Basis) { + typedef iVector coarseSiteData; + coarseSiteData elide; + typedef decltype(coalescedRead(elide)) ScalarComplex; GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); int _ndimension = coarse->_ndimension; @@ -116,11 +138,17 @@ inline void blockProject(Lattice > &coarseData, // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
// Otherwise do fine inner product per site, and make the update atomic //////////////////////////////////////////////////////////////////////////////////////////////////////// - accelerator_for( sc, coarse->oSites(), { + accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { + + auto sc=sci/nbasis; + auto i=sci%nbasis; + auto Basis_ = Basis[i].View(); Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + + int sf; + decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); for(int sb=0;sb > &coarseData, Lexicographic::CoorFromIndex(coor_b,sb,block_r); for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d]; Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions); - - for(int i=0;i &ip,std::vector > } } +#if 0 template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -349,13 +375,35 @@ inline void blockPromote(const Lattice > &coarseData, for(int i=0;i +inline void blockPromote(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + fineData=Zero(); + for(int i=0;i > ip = PeekIndex<0>(coarseData,i); + Lattice cip(coarse); + auto cip_ = cip.View(); + auto ip_ = ip.View(); + accelerator_for(sc,coarse->oSites(),1,{ + cip_[sc] = ip_[sc](); + }); + blockZAXPY(fineData,cip,Basis[i],fineData); + } +} +#endif // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local From e4784042915d5f622fefa7e7f4189c11943ae11f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 Dec 2019 05:03:25 -0500 Subject: [PATCH 10/43] Tuned up significantly on GPU, but another 10x in coarse space required --- tests/solver/Test_dwf_hdcr.cc | 54 ++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 74adc417..3e603a26 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -33,7 +33,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; class myclass: Serializable { public: @@ -126,7 +125,7 @@ public: CoarseVector Csol(_CoarseOperator.Grid()); ConjugateGradient CG(1.0e-10,100000); - ConjugateGradient fCG(3.0e-2,1000); + ConjugateGradient fCG(1.0e-3,1000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -191,7 +190,7 @@ public: CoarseVector Csol(_CoarseOperator.Grid()); Csol=Zero(); ConjugateGradient CG(1.0e-10,100000); - ConjugateGradient fCG(3.0e-2,1000); + ConjugateGradient fCG(1.0e-3,1000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -279,8 +278,7 @@ public: Chebyshev Cheby (params.lo,params.hi,params.order,InverseApproximation); Chebyshev ChebyAccu(params.lo,params.hi,params.order,InverseApproximation); - _Aggregates.ProjectToSubspace (Csrc,in); - + // _Aggregates.ProjectToSubspace (Csrc,in); // _Aggregates.PromoteFromSubspace(Csrc,out); // std::cout< PosdefLdop(LDOp); - ConjugateGradient CG(1.0e-6,100000); + ConjugateGradient CG(1.0e-2,100000); CG(PosdefLdop,c_src,c_res); 
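// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The coarse-grid Lanczos above feeds
// the DeflatedGuesser used below. A minimal sketch of what such a guesser is
// assumed to do with the coarse eigenpairs (eval[i], evec[i]):
//
//   c_res = Zero();
//   for(int i=0;i<Nconv;i++){
//     ComplexD ip = innerProduct(evec[i],c_src);   // project source onto mode i
//     c_res = c_res + (ip/eval[i]) * evec[i];      // apply 1/lambda_i on that mode
//   }
//
// The subsequent coarse CG then only has to resolve the components of the
// residual outside the deflation space, which is what makes the loose
// coarse-solve tolerance used below affordable.
// ---------------------------------------------------------------------------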
+ /* std::cout< HermIndefLdop(LDOp); // ConjugateResidual MCR(1.0e-6,100000); // MCR(HermIndefLdop,c_src,c_res); @@ -489,10 +503,10 @@ int main (int argc, char ** argv) // HermIndefOpDD,DdwfDD); // TrivialPrecon simple; - std::cout< fCG(1.0e-8,100000); // fCG(HermDefOp,src,result); - std::cout< HermOpEO(Ddwf); - ConjugateGradient pCG(1.0e-8,10000); + // std::cout< HermOpEO(Ddwf); + // ConjugateGradient pCG(1.0e-8,10000); // pCG(HermOpEO,src_o,result_o); // std::cout< Date: Tue, 17 Dec 2019 05:24:45 -0500 Subject: [PATCH 11/43] Coarse grid on GPU, not fast enough yet. Need a 10x --- Grid/algorithms/CoarsenedMatrix.h | 89 +++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index e47137f9..5e5bcbfa 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -117,8 +117,8 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< "< "< siteVector; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; - + typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; @@ -336,36 +335,70 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - RealD Nin = norm2(in); + + // RealD Nin = norm2(in); SimpleCompressor compressor; + double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); + comms_usec += usecond(); auto in_v = in.View(); auto out_v = out.View(); - accelerator_for(ss,Grid()->oSites(),1,{ - siteVector res = Zero(); - siteVector nbr; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(); + double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + double usecs =-usecond(); + + assert(geom.npoint==9); + + accelerator_for(ss, Grid()->oSites(), Nsimd, { + + calcVector res = Zero(); + calcVector nbr; int ptype; StencilEntry *SE; - for(int point=0;point_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); } else { - nbr = Stencil.CommBuf()[SE->_offset]; + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); } - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; - } - vstream(out_v[ss],res); - }); + synchronise(); + auto A = coalescedRead(Aview_p[point][ss]); + res = res + A*nbr; + } + coalescedWrite(out_v[ss],res,lane); + }); + usecs +=usecond(); + + double nrm_usec=-usecond(); RealD Nout= norm2(out); + nrm_usec+=usecond(); + /* + std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" < Date: Sat, 28 Dec 2019 10:32:15 -0500 Subject: [PATCH 12/43] Improved Multigrid for DWF --- Grid/algorithms/CoarsenedMatrix.h | 90 ++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 5e5bcbfa..68c820ca 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -117,8 +117,8 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< &hermop,int nn=nbasis) { + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + std::vector &lo, + std::vector &order + ) { RealD scale; - const 
int dependent=16; - - Chebyshev ChebFilt (0.03,64.0,500); - Chebyshev ChebDependent(0.01,64.0,200); + const int dependent=lo.size(); FineField noise(FineGrid); FineField Mn(FineGrid); @@ -269,15 +271,12 @@ public: // Initial matrix element hermop.Op(noise,Mn); std::cout< "< Cheb(lo[bbb],hi,order[bbb]); bbb++; + Cheb(hermop,noise,Mn); // normalise scale = std::pow(norm2(Mn),-0.5); @@ -354,6 +353,7 @@ public: const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; GridStopWatch ArithmeticTimer; int osites=Grid()->oSites(); @@ -361,17 +361,18 @@ public: double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); double usecs =-usecond(); - assert(geom.npoint==9); + // assert(geom.npoint==9); - accelerator_for(ss, Grid()->oSites(), Nsimd, { - - calcVector res = Zero(); + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); calcVector nbr; int ptype; StencilEntry *SE; int lane=SIMTlane(Nsimd); - for(int point=0;point<9;point++){ + for(int point=0;point compressor; Stencil.HaloExchange(in,compressor); - - auto point = [dir, disp](){ + + int ndim = in.Grid()->Nd(); + + ////////////// + // 4D action like wilson + // 0+ => 0 + // 0- => 1 + // 1+ => 2 + // 1- => 3 + // etc.. + ////////////// + // 5D action like DWF + // 1+ => 0 + // 1- => 1 + // 2+ => 2 + // 2- => 3 + // etc.. + + auto point = [dir, disp, ndim](){ if(dir == 0 and disp == 0) return 8; - else + else if ( ndim==4 ) { return (4 * dir + 1 - disp) / 2; + } else { + return (4 * (dir-1) + 1 - disp) / 2; + } }(); + typedef LatticeView Aview; + Vector AcceleratorViewContainer; + for(int p=0;poSites(),1,{ @@ -451,11 +479,11 @@ public: nbr = Stencil.CommBuf()[SE->_offset]; } - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; + res = res + Aview_p[point][ss]*nbr; - vstream(out_v[ss],res); + out_v[ss]=res; }); + }; void Mdiag(const CoarseVector &in, CoarseVector &out){ @@ -493,7 +521,7 @@ public: std::cout << GridLogMessage<< "CoarsenMatrix" << std::endl; // Orthogonalise the subblocks over the basis // blockOrthogonalise(InnerProd,Subspace.subspace); - std::cout << GridLogMessage<< "CoarsenMatrix orthogonalised" << std::endl; + // std::cout << GridLogMessage<< "CoarsenMatrix orthogonalised" << std::endl; // Compute the matrix elements of linop between this orthonormal // set of vectors. 
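// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The CoarsenOperator code that
// follows builds the coarse links from the block-orthonormalised basis: for
// every stencil point p and every pair of basis vectors it computes, roughly,
//
//   A_p(x_c)_{ji} = sum_{x in block(x_c)} phi_j(x)^dag (D_p phi_i)(x)
//
// i.e. apply the directional piece of the fine operator to basis vector i and
// blockProject the result back onto the basis. The blockOrthogonalise call
// above is what makes this projection well defined. (phi_j, D_p here are
// schematic names for Subspace.subspace[j] and the point-p hopping term.)
// ---------------------------------------------------------------------------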
@@ -509,7 +537,7 @@ public: for(int i=0;i Date: Sat, 28 Dec 2019 10:32:35 -0500 Subject: [PATCH 13/43] Improved DWF multigrid --- tests/solver/Test_dwf_hdcr.cc | 290 +++++++++++++++++++++++----------- 1 file changed, 195 insertions(+), 95 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 3e603a26..817f51c7 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -50,13 +50,12 @@ public: myclass(){}; }; -myclass params; RealD InverseApproximation(RealD x){ return 1.0/x; } -template +template class MultiGridPreconditioner : public LinearFunction< Lattice > { public: @@ -76,17 +75,28 @@ public: FineOperator & _FineOperator; Matrix & _SmootherMatrix; FineOperator & _SmootherOperator; + Guesser & _Guess; + + double cheby_hi; + double cheby_lo; + int cheby_ord; + + myclass _params; // Constructor MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, FineOperator &Fine,Matrix &FineMatrix, - FineOperator &Smooth,Matrix &SmootherMatrix) + FineOperator &Smooth,Matrix &SmootherMatrix, + Guesser &Guess_, + myclass params_) : _Aggregates(Agg), _CoarseOperator(Coarse), _FineOperator(Fine), _FineMatrix(FineMatrix), _SmootherOperator(Smooth), - _SmootherMatrix(SmootherMatrix) + _SmootherMatrix(SmootherMatrix), + _Guess(Guess_), + _params(params_) { } @@ -98,7 +108,7 @@ public: MdagMLinearOperator fMdagMOp(_FineMatrix); p1=in; - for(int i=0;i<20;i++){ + for(int i=0;i<50;i++){ RealD absp1=std::sqrt(norm2(p1)); fMdagMOp.HermOp(p1,p2);// this is the G5 herm bit // _FineOperator.Op(p1,p2);// this is the G5 herm bit @@ -109,8 +119,9 @@ public: } } - void operator()(const FineField &in, FineField & out) { + void operator()(const FineField &in, FineField & out ) { operatorCheby(in,out); + //operatorADEF2(in,out); } //////////////////////////////////////////////////////////////////////// @@ -124,8 +135,8 @@ public: CoarseVector Ctmp(_CoarseOperator.Grid()); CoarseVector Csol(_CoarseOperator.Grid()); - ConjugateGradient CG(1.0e-10,100000); - ConjugateGradient fCG(1.0e-3,1000); + ConjugateGradient CG(1.0e-3,1000,false); + ConjugateGradient fCG(1.0e-3,15,false); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -152,9 +163,9 @@ public: _FineOperator.Op(Min,tmp); tmp = in - tmp; // in - A Min - Csol=Zero(); _Aggregates.ProjectToSubspace (Csrc,tmp); HermOp.AdjOp(Csrc,Ctmp);// Normal equations + Csol=Zero(); CG(MdagMOp,Ctmp,Csol); HermOp.Op(Csol,Ctmp); @@ -263,9 +274,9 @@ public: CoarseVector Csrc(_CoarseOperator.Grid()); CoarseVector Ctmp(_CoarseOperator.Grid()); - CoarseVector Csol(_CoarseOperator.Grid()); Csol=Zero(); - - ConjugateGradient CG(3.0e-3,100000); + CoarseVector Csol(_CoarseOperator.Grid()); + + ConjugateGradient CG(3.0e-2,100000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -275,8 +286,8 @@ public: FineField vec1(in.Grid()); FineField vec2(in.Grid()); - Chebyshev Cheby (params.lo,params.hi,params.order,InverseApproximation); - Chebyshev ChebyAccu(params.lo,params.hi,params.order,InverseApproximation); + Chebyshev Cheby (_params.lo,_params.hi,_params.order,InverseApproximation); + Chebyshev ChebyAccu(_params.lo,_params.hi,_params.order,InverseApproximation); // _Aggregates.ProjectToSubspace (Csrc,in); // _Aggregates.PromoteFromSubspace(Csrc,out); @@ -313,6 +324,8 @@ public: std::cout< block ({2,2,2,2}); const int nbasis= 32; - auto clatt = GridDefaultLatt(); - std::cout << GridLogMessage << " Coarse lattice is "; for(int d=0;d 
Subspace; typedef CoarsenedMatrix CoarseOperator; typedef CoarseOperator::CoarseVector CoarseVector; - + typedef CoarseOperator::siteVector siteVector; std::cout< HermDefOp(Ddwf); + Subspace Aggregates(Coarse5d,FGrid,0); - // Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis); + assert ( (nbasis & 0x1)==0); int nb=nbasis/2; std::cout< Level1Op; + typedef CoarsenedMatrix,nbasis> Level2Op; + + auto cclatt = clatt; + for(int d=0;d,nbasis> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + double c_first = 0.2; + double c_div = 1.2; + std::vector c_lo(nb); + c_lo[0] = c_first; + for(int b=1;b c_ord(nb,200); + c_ord[0]=500; + +#define RECURSIVE_MULTIGRID +#ifdef RECURSIVE_MULTIGRID std::cout< PosdefLdop(LDOp); - ConjugateGradient CG(1.0e-2,100000); - CG(PosdefLdop,c_src,c_res); + // CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nbasis,14.0,c_lo,c_ord); + // CoarseAggregates.CreateSubspaceRandom(CRNG); - /* - std::cout< HermIndefLdop(LDOp); -// ConjugateResidual MCR(1.0e-6,100000); -// MCR(HermIndefLdop,c_src,c_res); + // Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + // HermitianLinearOperator L1LinOp(LDOp); + // L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); +#endif - std::cout< Precon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf); - - // MultiGridPreconditioner PreconDD(Aggregates, LDOp, - // HermIndefOp,Ddwf, - // HermIndefOpDD,DdwfDD); - // TrivialPrecon simple; - - // std::cout< fCG(1.0e-8,100000); // fCG(HermDefOp,src,result); - // std::cout< HermOpEO(Ddwf); - // ConjugateGradient pCG(1.0e-8,10000); - // pCG(HermOpEO,src_o,result_o); - - // std::cout< UPGCR(1.0e-8,100000,simple,8,128); - // UPGCR(HermIndefOp,src,result); - + std::cout< HermOpEO(Ddwf); + ConjugateGradient pCG(1.0e-8,10000); + // pCG(HermOpEO,src_o,result_o); - /// Get themax eval std::cout< IRLHermOp(LDOp); + Chebyshev IRLCheby(0.01,14,161); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + + int Nstop=32; + int Nk=32; + int Nm=48; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-4,20); + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); - // std::cout< PGCRDD(1.0e-8,100000,PreconDD,8,128); - // result=Zero(); - // std::cout< CG(3.0e-3,100000); + // CG(PosdefLdop,c_src,c_res); + + std::cout< DeflCoarseGuesser(evec,eval); + DeflCoarseGuesser(c_src,c_res); + // CG(PosdefLdop,c_src,c_res); + + std::cout< CoarseZeroGuesser; + + MultiGridPreconditioner > + Precon (Aggregates, LDOp, + HermIndefOp,Ddwf, + HermIndefOp,Ddwf, + CoarseZeroGuesser, + params); + + // Precon.PowerMethod(src); + + /* + std::cout<,nbasis,Level1Op,ZeroGuesser > + CoarsePrecon (CoarseAggregates, + L2Op, + L1LinOp,LDOp, + L1LinOp,LDOp, + CoarseZeroGuesser, + cparams); + + CoarsePrecon.PowerMethod(c_src); + */ + + /* std::cout< > + DeflatedPrecon (Aggregates, LDOp, + HermIndefOp,Ddwf, + HermIndefOp,Ddwf, + DeflCoarseGuesser, + params); + + PrecGeneralisedConjugateResidual deflPGCR(1.0e-8,100000,DeflatedPrecon,16,16); + + std::cout< CPGCR(1.0e-3,10000,CoarsePrecon,8,8); + std::cout< Date: Fri, 3 Jan 2020 05:29:09 -0500 Subject: [PATCH 14/43] Alternate low pass filter option --- Grid/algorithms/approx/Chebyshev.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 97e0e807..74789ead 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -94,6 +94,24 @@ 
public: Coeffs.assign(0.,order); Coeffs[order-1] = 1.; }; + + // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. + // Similar kick effect below the threshold as Lanczos filter approach + void InitLowPass(RealD _lo,RealD _hi,int _order) + { + lo=_lo; + hi=_hi; + order=_order; + + if(order < 2) exit(-1); + Coeffs.resize(order); + for(int j=0;j Date: Sat, 4 Jan 2020 03:11:19 -0500 Subject: [PATCH 15/43] Nearing well optimised state --- tests/solver/Test_dwf_hdcr.cc | 77 ++++++++++++++++------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 817f51c7..5a131a57 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -135,8 +135,8 @@ public: CoarseVector Ctmp(_CoarseOperator.Grid()); CoarseVector Csol(_CoarseOperator.Grid()); - ConjugateGradient CG(1.0e-3,1000,false); - ConjugateGradient fCG(1.0e-3,15,false); + ConjugateGradient CG(1.0e-3,100,false); + ConjugateGradient fCG(1.0e-3,10,false); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -165,7 +165,7 @@ public: _Aggregates.ProjectToSubspace (Csrc,tmp); HermOp.AdjOp(Csrc,Ctmp);// Normal equations - Csol=Zero(); + _Guess(Ctmp,Csol); CG(MdagMOp,Ctmp,Csol); HermOp.Op(Csol,Ctmp); @@ -274,9 +274,10 @@ public: CoarseVector Csrc(_CoarseOperator.Grid()); CoarseVector Ctmp(_CoarseOperator.Grid()); + CoarseVector Ctmp1(_CoarseOperator.Grid()); CoarseVector Csol(_CoarseOperator.Grid()); - ConjugateGradient CG(3.0e-2,100000); + ConjugateGradient CG(5.0e-2,100000); HermitianLinearOperator HermOp(_CoarseOperator); MdagMLinearOperator MdagMOp(_CoarseOperator); @@ -323,10 +324,20 @@ public: _Aggregates.ProjectToSubspace (Csrc,vec1); std::cout< HermIndefOpDD(DdwfDD); CoarsenedMatrix LDOp(*Coarse5d,1); // Hermitian matrix LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + exit(0); CoarseVector c_src (Coarse5d); CoarseVector c_res (Coarse5d); gaussian(CRNG,c_src); + result=Zero(); c_res=Zero(); ////////////////////////////////////////////////// @@ -515,12 +509,12 @@ int main (int argc, char ** argv) double c_first = 0.2; double c_div = 1.2; - std::vector c_lo(nb); + std::vector c_lo(nbasis/2); c_lo[0] = c_first; - for(int b=1;b c_ord(nb,200); + std::vector c_ord(nbasis/2,200); c_ord[0]=500; #define RECURSIVE_MULTIGRID @@ -562,14 +556,15 @@ int main (int argc, char ** argv) std::cout< IRLHermOp(LDOp); - Chebyshev IRLCheby(0.01,14,161); + Chebyshev IRLCheby(0.005,16.0,51); + // IRLCheby.InitLowPass(0.01,18.0,51); FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); PlainHermOp IRLOp (IRLHermOp); - int Nstop=32; - int Nk=32; + int Nstop=24; + int Nk=24; int Nm=48; - ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-4,20); + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); int Nconv; std::vector eval(Nm); std::vector evec(Nm,Coarse5d); From f7e4bd1f6d5f5aa65a8151c0faee2f845d50894d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:11:53 -0500 Subject: [PATCH 16/43] Getting more optimised --- Grid/algorithms/CoarsenedMatrix.h | 214 ++++++++++++++---------------- 1 file changed, 100 insertions(+), 114 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 68c820ca..450b76df 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -63,7 +63,7 @@ public: //// report back std::cout< &hermop,int nn=nbasis) - { - // Run a Lanczos with 
sloppy convergence - const int Nstop = nn; - const int Nk = nn+20; - const int Np = nn+20; - const int Nm = Nk+Np; - const int MaxIt= 10000; - RealD resid = 1.0e-3; - - Chebyshev Cheb(0.5,64.0,21); - ImplicitlyRestartedLanczos IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt); - // IRL.lock = 1; - - FineField noise(FineGrid); gaussian(RNG,noise); - FineField tmp(FineGrid); - std::vector eval(Nm); - std::vector evec(Nm,FineGrid); - - int Nconv; - IRL.calc(eval,evec, - noise, - Nconv); - - // pull back nn vectors - for(int b=0;b "< "< "< Cheb(lo[bbb],hi,order[bbb]); bbb++; - Cheb(hermop,noise,Mn); - - // normalise - scale = std::pow(norm2(Mn),-0.5); - Mn=Mn*scale; - - // set this new vector - subspace[b] = Mn; - - // new matrix element - hermop.Op(Mn,noise); std::cout< "< Cheb(lo,hi,order); + Cheb(hermop,noise,Mn); + // normalise + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< @@ -511,6 +485,7 @@ public: FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + std::vector Mphi_p(geom.npoint,FineGrid); Lattice > coor(FineGrid); @@ -518,10 +493,9 @@ public: CoarseVector oProj(Grid()); CoarseScalar InnerProd(Grid()); - std::cout << GridLogMessage<< "CoarsenMatrix" << std::endl; + // Orthogonalise the subblocks over the basis - // blockOrthogonalise(InnerProd,Subspace.subspace); - // std::cout << GridLogMessage<< "CoarsenMatrix orthogonalised" << std::endl; + blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix elements of linop between this orthonormal // set of vectors. @@ -536,12 +510,21 @@ public: for(int i=0;i Date: Sat, 4 Jan 2020 03:12:17 -0500 Subject: [PATCH 17/43] Make the force term and coarsening multigrid more optimised --- Grid/qcd/action/fermion/WilsonKernels.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 35715097..bf06d000 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -102,6 +102,15 @@ private: static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); + + static accelerator void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator void DhopDirTm(StencilView 
&st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, From 205ea4bbb2d86f33c6fd2e2dc4e193a0d386a951 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:13:40 -0500 Subject: [PATCH 18/43] More verboose Lanczos --- .../iterative/ImplicitlyRestartedLanczos.h | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index e5573c8e..8e059048 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -43,6 +43,11 @@ NAMESPACE_BEGIN(Grid); template void basisOrthogonalize(std::vector &basis,Field &w,int k) { + // If assume basis[j] are already orthonormal, + // can take all inner products in parallel saving 2x bandwidth + // Save 3x bandwidth on the second line of loop. + // perhaps 2.5x speed up. + // 2x overall in Multigrid Lanczos for(int j=0; j& lmd, @@ -589,6 +594,7 @@ until convergence std::vector& evec, Field& w,int Nm,int k) { + std::cout<0) w -= lme[k-1] * evec[k-1]; - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) + ComplexD zalph = innerProduct(evec_k,w); RealD alph = real(zalph); - w = w - alph * evec_k;// 5. wk:=wk−αkvk + w = w - alph * evec_k; - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 + RealD beta = normalise(w); lmd[k] = alph; lme[k] = beta; - if (k>0 && k % orth_period == 0) { + if ( (k>0) && ( (k % orth_period) == 0 )) { + std::cout<& lmd, std::vector& lme, From 3c3d6a94f3ad1df6a38fb3549f016f8c7dc979e6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 4 Jan 2020 03:16:23 -0500 Subject: [PATCH 19/43] OPtimising the force term a bit --- .../WilsonKernelsImplementation.h | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index a787fa79..f13bfdde 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); -#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ - if (gamma == Dir) { \ +#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ int perm= SE->_permute; \ auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ @@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); \ - synchronise(); \ + Recon(result, Uchi); + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \ } + //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// @@ -284,7 +287,36 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField } }; -template +#define DhopDirMacro(Dir,spProj,spRecon) \ + template \ + void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView 
&U,SiteHalfSpinor *buf, int sF, \ + int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ + { \ + typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \ + typedef decltype(coalescedRead(in[0])) calcSpinor; \ + calcHalfSpinor chi; \ + calcSpinor result; \ + calcHalfSpinor Uchi; \ + StencilEntry *SE; \ + int ptype; \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=SIMTlane(Nsimd); \ + \ + SE = st.GetEntry(ptype, dir, sF); \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ + coalescedWrite(out[sF], result,lane); \ + } + +DhopDirMacro(Xp,spProjXp,spReconXp); +DhopDirMacro(Yp,spProjYp,spReconYp); +DhopDirMacro(Zp,spProjZp,spReconZp); +DhopDirMacro(Tp,spProjTp,spReconTp); +DhopDirMacro(Xm,spProjXm,spReconXm); +DhopDirMacro(Ym,spProjYm,spReconYm); +DhopDirMacro(Zm,spProjZm,spReconZm); +DhopDirMacro(Tm,spProjTm,spReconTm); + +template void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { @@ -299,18 +331,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si const int lane=SIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); - if (gamma == Xp) { - if (SE->_is_local ) { - int perm= SE->_permute; - auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); - spProjXp(chi,tmp); - } else { - chi = coalescedRead(buf[SE->_offset],lane); - } - Impl::multLink(Uchi, U[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); @@ -332,13 +353,28 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto in_v = in.View(); auto out_v = out.View(); auto st_v = st.View(); - accelerator_for(ss,Nsite,Simd::Nsimd(),{ - for(int s=0;s Date: Mon, 6 Jan 2020 11:43:59 -0500 Subject: [PATCH 20/43] Change to interface to minise comms in evaluating coarse space operator --- Grid/algorithms/CoarsenedMatrix.h | 129 ++++++++++-------- Grid/algorithms/LinearOperator.h | 16 +++ Grid/algorithms/SparseMatrix.h | 13 +- Grid/qcd/action/fermion/CayleyFermion5D.h | 3 +- .../fermion/ContinuedFractionFermion5D.h | 15 +- Grid/qcd/action/fermion/FermionOperator.h | 1 + .../action/fermion/ImprovedStaggeredFermion.h | 1 + .../fermion/ImprovedStaggeredFermion5D.h | 3 +- .../action/fermion/PartialFractionFermion5D.h | 11 +- Grid/qcd/action/fermion/WilsonFermion.h | 5 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 10 +- Grid/qcd/action/fermion/g5HermitianLinop.h | 14 ++ .../CayleyFermion5DImplementation.h | 8 ++ ...ContinuedFractionFermion5DImplementation.h | 19 +++ ...ImprovedStaggeredFermion5DImplementation.h | 8 +- .../ImprovedStaggeredFermionImplementation.h | 12 +- .../PartialFractionFermion5DImplementation.h | 21 ++- .../WilsonFermion5DImplementation.h | 24 ++++ .../WilsonFermionImplementation.h | 40 ++++-- Grid/qcd/utils/CovariantLaplacian.h | 1 + 20 files changed, 262 insertions(+), 92 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 450b76df..ba71faf8 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -35,14 +35,13 @@ Author: paboyle NAMESPACE_BEGIN(Grid); class Geometry { - // int dimension; public: int npoint; std::vector directions ; std::vector displacements; Geometry(int _d) { - + int base = (_d==5) ? 
1:0; // make coarse grid stencil for 4d , not 5d @@ -187,9 +186,10 @@ public: } } - // - // World of possibilities here. - // + //////////////////////////////////////////////////////////////////////////////////////////////// + // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) + // and this is the best I found + //////////////////////////////////////////////////////////////////////////////////////////////// virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, int nn, double hi, @@ -249,11 +249,18 @@ public: hermop.HermOp(*Tn,y); - y=xscale*y+mscale*(*Tn); - - *Tnp=2.0*y-(*Tnm); + auto y_v = y.View(); + auto Tn_v = Tn->View(); + auto Tnp_v = Tnp->View(); + auto Tnm_v = Tnm->View(); + accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); + coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); + }); - if ( (n%orderstep)==0 ) { + // Possible more fine grained control is needed than a linear sweep, + // but huge productivity gain if this is simple algorithm and not a tunable + if ( (n%orderstep)==0 ) { Mn=*Tnp; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; subspace[b] = Mn; @@ -270,6 +277,7 @@ public: } } + assert(b==nn); } }; @@ -393,42 +401,15 @@ public: return norm2(out); } }; - - void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){ - - conformable(_grid,in.Grid()); - conformable(in.Grid(),out.Grid()); - + void MdirComms(const CoarseVector &in) + { SimpleCompressor compressor; - Stencil.HaloExchange(in,compressor); - - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 - // etc.. - - auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; - } else { - return (4 * (dir-1) + 1 - disp) / 2; - } - }(); + } + void MdirCalc(const CoarseVector &in, CoarseVector &out, int point) + { + conformable(_grid,in.Grid()); + conformable(_grid,out.Grid()); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -458,10 +439,54 @@ public: out_v[ss]=res; }); + } + void MdirAll(const CoarseVector &in,std::vector &out) + { + this->MdirComms(in); + int ndir=geom.npoint-1; + assert(out.size()==ndir); + for(int p=0;pMdirComms(in); + + int ndim = in.Grid()->Nd(); + + ////////////// + // 4D action like wilson + // 0+ => 0 + // 0- => 1 + // 1+ => 2 + // 1- => 3 + // etc.. + ////////////// + // 5D action like DWF + // 1+ => 0 + // 1- => 1 + // 2+ => 2 + // 2- => 3 + // etc.. 
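// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] Worked examples of the point
// indexing implemented by the lambda just below, using the convention listed
// above:
//   4D: dir=0,disp=+1 -> (4*0+1-1)/2 = 0 ;  dir=0,disp=-1 -> (4*0+1+1)/2 = 1 ;
//       dir=1,disp=+1 -> (4*1+1-1)/2 = 2
//   5D: dir=1,disp=+1 -> (4*(1-1)+1-1)/2 = 0 ;  dir=1,disp=-1 -> 1
//   dir=0,disp=0 is the self-coupling and maps to point 8.
// ---------------------------------------------------------------------------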
+ auto point = [dir, disp, ndim](){ + if(dir == 0 and disp == 0) + return 8; + else if ( ndim==4 ) { + return (4 * dir + 1 - disp) / 2; + } else { + return (4 * (dir-1) + 1 - disp) / 2; + } + }(); + + MdirCalc(in,out,point); + }; - void Mdiag(const CoarseVector &in, CoarseVector &out){ - Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil + void Mdiag(const CoarseVector &in, CoarseVector &out) + { + int point=geom.npoint-1; + MdirCalc(in, out, point); // No comms }; @@ -511,16 +536,12 @@ public: for(int i=0;i &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base @@ -83,6 +84,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -116,6 +120,9 @@ public: _Mat.Mdir(in,out,dir,disp); assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); assert(0); @@ -154,6 +161,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -183,6 +193,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -234,6 +247,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; }; template class SchurDiagMooeeOperator : public SchurOperatorBase { diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index ffed7527..fd713e9f 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -47,6 +47,7 @@ public: } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; + virtual void MdirAll (const Field &in, std::vector &out)=0; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -56,12 +57,12 @@ template class CheckerBoardedSparseMatrixBase : public SparseMatrix public: virtual GridBase *RedBlackGrid(void)=0; - ////////////////////////////////////////////////////////////////////// - // Query the even even properties to make algorithmic decisions - ////////////////////////////////////////////////////////////////////// - virtual RealD Mass(void) { return 0.0; }; - virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden - virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better + ////////////////////////////////////////////////////////////////////// + // Query the even even properties to make algorithmic decisions + ////////////////////////////////////////////////////////////////////// + virtual RealD Mass(void) { return 0.0; }; + virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden + virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better // half checkerboard operaions virtual void Meooe (const Field &in, Field &out)=0; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h 
b/Grid/qcd/action/fermion/CayleyFermion5D.h index 333ba49b..c2ccb98b 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -101,7 +101,8 @@ public: virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); // Efficient support for multigrid coarsening - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); void Meooe5D (const FermionField &in, FermionField &out); void MeooeDag5D (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 379c5f8f..5aa7bfbd 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -62,14 +62,15 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case - // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case + // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors ContinuedFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index c60a2e84..cbc6ca63 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -89,6 +89,7 @@ public: virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index b4d8d60b..5cb95ca6 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -103,6 +103,7 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const 
FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index b10c0356..8e3d4be5 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -86,7 +86,8 @@ public: void MooeeDag (const FermionField &in, FermionField &out); void MooeeInvDag (const FermionField &in, FermionField &out); - void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); // These can be overridden by fancy 5d chiral action diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index d61515f0..928abd3f 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -67,12 +67,13 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors PartialFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 3a712435..a3f5d2d7 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -115,9 +115,10 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); - void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp, - int gamma, int dag); + void DhopDirAll(const FermionField &in, std::vector &out); + void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag); /////////////////////////////////////////////////////////////// // Extra methods added by derived diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 8f1073db..58b54421 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -111,15 +111,16 @@ public: virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual 
void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac // These can be overridden by fancy 5d chiral action virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; // Implement hopping term non-hermitian hopping term; half cb or both // Implement s-diagonal DW @@ -131,6 +132,9 @@ public: // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + void DhopDirAll(const FermionField &in,std::vector &out); + void DhopDirComms(const FermionField &in); + void DhopDirCalc(const FermionField &in, FermionField &out,int point); /////////////////////////////////////////////////////////////// // New methods added diff --git a/Grid/qcd/action/fermion/g5HermitianLinop.h b/Grid/qcd/action/fermion/g5HermitianLinop.h index 2e417ceb..90bb6d59 100644 --- a/Grid/qcd/action/fermion/g5HermitianLinop.h +++ b/Grid/qcd/action/fermion/g5HermitianLinop.h @@ -54,6 +54,14 @@ public: _Mat.Mdir(in,tmp,dir,disp); G5R5(out,tmp); } + void OpDirAll(const Field &in, std::vector &out) { + Field tmp(in.Grid()); + _Mat.MdirAll(in,out); + for(int p=0;p &out) { + _Mat.MdirAll(in,out); + for(int p=0;p::Mdir (const FermionField &psi, FermionField &chi,in Meo5D(psi,tmp); this->DhopDir(tmp,chi,dir,disp); } +template +void CayleyFermion5D::MdirAll(const FermionField &psi, std::vector &out) +{ + FermionField tmp(psi.Grid()); + Meo5D(psi,tmp); + this->DhopDirAll(tmp,out); +} + // force terms; five routines; default to Dhop on diagonal template void CayleyFermion5D::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index 7af02263..beeb3e00 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -143,6 +143,25 @@ void ContinuedFractionFermion5D::Mdir (const FermionField &psi, FermionFi } } template +void ContinuedFractionFermion5D::MdirAll (const FermionField &psi, std::vector &chi) +{ + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); // Dslash on diagonal. 
g5 Dslash is hermitian + + for(int p=0;p void ContinuedFractionFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls = this->Ls; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index c42e896f..fdaa2f71 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) // Implement the general interface. Here we use SAME mass on all slices ///////////////////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } template +void ImprovedStaggeredFermion5D::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); +} +template RealD ImprovedStaggeredFermion5D::M(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index e2605d81..b4359879 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -362,12 +362,19 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField } template -void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void ImprovedStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} template -void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ Compressor compressor; Stencil.HaloExchange(in, compressor); @@ -380,6 +387,7 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel }); }; + template void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index 9f8f91ad..edc674cc 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -31,7 +31,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template + template void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ // this does both dag and undag but is trivial; make a common helper routing int Ls = this->Ls; @@ -45,8 +45,25 @@ void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionFiel ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); } ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1); - } +template +void PartialFractionFermion5D::MdirAll (const FermionField &psi, 
std::vector &chi){ + // this does both dag and undag but is trivial; make a common helper routing + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); + + for(int point=0;point void PartialFractionFermion5D::Meooe_internal(const FermionField &psi, FermionField &chi,int dag) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 1bdc9a64..a2092960 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -241,6 +241,30 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma); }; +template +void WilsonFermion5D::DhopDirAll(const FermionField &in, std::vector &out) +{ + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in,compressor); + + assert( (out.size()==8)||(out.size()==9)); + for(int dir5=1;dir5<=4;dir5++) { + for(int disp=-1;disp<=1;disp+=2){ + int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil + // we drop off the innermost fifth dimension + assert( (disp==1)||(disp==-1) ); + assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t; + + int skip = (disp==1) ? 0 : 1; + int dirdisp = dir+skip*4; + int gamma = dir+(1-skip)*4; + + uint64_t Nsite = Umu.Grid()->oSites(); + Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out[dirdisp],dirdisp,gamma); + } + } +}; + template void WilsonFermion5D::DerivInternal(StencilImpl & st, diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 756bdbf4..76b904e9 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -319,28 +319,51 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +{ + DhopDirAll(in, out); +} template void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in, compressor); + int skip = (disp == 1) ? 
0 : 1; int dirdisp = dir + skip * 4; int gamma = dir + (1 - skip) * 4; - DhopDirDisp(in, out, dirdisp, gamma, DaggerNo); + DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; - template -void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { - Compressor compressor(dag); - + Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); + + assert((out.size()==8)||(out.size()==9)); + for(int dir=0;dir +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +{ int Ls=1; - int Nsite=in.oSites(); + uint64_t Nsite=in.oSites(); Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma); }; @@ -348,7 +371,8 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); diff --git a/Grid/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h index 0e0620a7..94322507 100644 --- a/Grid/qcd/utils/CovariantLaplacian.h +++ b/Grid/qcd/utils/CovariantLaplacian.h @@ -92,6 +92,7 @@ public: }; void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);} + void MdirAll(const GaugeField&, std::vector &){ assert(0);} void Mdiag(const GaugeField&, GaugeField&){ assert(0);} void ImportGauge(const GaugeField& _U) { From 03da4040e21d41388b61d6fd9ddd6531ef6b5180 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 6 Jan 2020 11:47:48 -0500 Subject: [PATCH 21/43] Make summit happy --- configure.ac | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f4a5e503..9a16989e 100644 --- a/configure.ac +++ b/configure.ac @@ -280,7 +280,8 @@ case ${CXX} in # CXXLD="nvcc -v -link" CXX="nvcc -x cu " CXXLD="nvcc -link" - CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" +# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" + CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi From 55cdb17691fb2adefbd63c0b8cf2934dfca7927a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:27:45 -0500 Subject: [PATCH 22/43] Integer divide for blocking --- Grid/simd/Grid_gpu_vec.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 471fbccc..4584fb36 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -403,6 +403,10 @@ namespace Optimization { accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){ return a/b; } + accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){ + return a/b; + } + // Danger -- element wise divide fro complex, not complex div. 
// See Grid_vector_types.h lines around 735, applied after "toReal" accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){ From 48008e4d8b3958191f754240791bb229daba7b8f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:28:16 -0500 Subject: [PATCH 23/43] Thread coordinate creation loop --- Grid/lattice/Lattice_coordinate.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index 16f3641b..a1abe58d 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,19 +37,18 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - Coordinate gcoor; - ExtractBuffer mergebuf(Nsimd); - - vector_type vI; auto l_v = l.View(); - for(int o=0;ooSites();o++){ + thread_for( o, grid->oSites(), { + vector_type vI; + Coordinate gcoor; + ExtractBuffer mergebuf(Nsimd); for(int i=0;iiSites();i++){ grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); mergebuf[i]=(Integer)gcoor[mu]; } merge(vI,mergebuf); l_v[o]=vI; - } + }); }; // LatticeCoordinate(); From fa856c9669eda8793a58769d47c366bb2ce3a7ec Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:28:46 -0500 Subject: [PATCH 24/43] Disable information message --- Grid/threads/Pragmas.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h index 4d713258..d05f8ee9 100644 --- a/Grid/threads/Pragmas.h +++ b/Grid/threads/Pragmas.h @@ -43,7 +43,6 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP #include -#warning "Grid is using OpenMP for host loops" #endif #ifdef GRID_OMP From 1bd87c35d7af979ae7181aced485658a432a3adb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:29:56 -0500 Subject: [PATCH 25/43] Read coalescing on Nvidia --- Grid/lattice/Lattice_transfer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 0041f47a..abe42733 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -196,7 +196,7 @@ inline void blockZAXPY(Lattice &fineZ, auto fineY_ = fineY.View(); auto coarseA_= coarseA.View(); - accelerator_for(sf, fine->oSites(), 1, { + accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { int sc; Coordinate coor_c(_ndimension); @@ -207,7 +207,7 @@ inline void blockZAXPY(Lattice &fineZ, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y - fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf]; + coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); }); @@ -397,8 +397,8 @@ inline void blockPromote(const Lattice > &coarseData, Lattice cip(coarse); auto cip_ = cip.View(); auto ip_ = ip.View(); - accelerator_for(sc,coarse->oSites(),1,{ - cip_[sc] = ip_[sc](); + accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ + coalescedWrite(cip_[sc], ip_(sc)()); }); blockZAXPY(fineData,cip,Basis[i],fineData); } From d8b97420924b0b997b5674610af4b86f8524cc3f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:34:54 -0500 Subject: [PATCH 26/43] DhopDirAll for faster matrix elements of little Dirac operator --- Grid/qcd/action/fermion/WilsonKernels.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index bf06d000..7348a463 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ 
b/Grid/qcd/action/fermion/WilsonKernels.h @@ -60,6 +60,9 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) ; + static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma); @@ -100,17 +103,17 @@ public: private: - static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, + static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); - static accelerator void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); - static accelerator void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTm(StencilView 
&st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, From 8016a465aeed007baa936ff8ed97acfbb8746dab Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:35:37 -0500 Subject: [PATCH 27/43] Remove extraneous variable --- .../implementation/ImprovedStaggeredFermionImplementation.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index b4359879..0b723c47 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -412,7 +412,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st #ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); - const int LLs = 1; DhopTotalTime -= usecond(); From e5d1c0966577a037456ead69515b758bbe0ea776 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:38:54 -0500 Subject: [PATCH 28/43] Faster DhopDirAll for little dirac operator coarsening --- .../WilsonKernelsImplementation.h | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index f13bfdde..9e032b04 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -342,6 +342,38 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si coalescedWrite(out[sF], result,lane); } +template +void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) +{ + auto U_v = U.View(); + auto in_v = in.View(); + auto st_v = st.View(); + + auto out_Xm = out[0].View(); + auto out_Ym = out[1].View(); + auto out_Zm = out[2].View(); + auto out_Tm = out[3].View(); + auto out_Xp = out[4].View(); + auto out_Yp = out[5].View(); + auto out_Zp = out[6].View(); + auto out_Tp = out[7].View(); + + accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ + int sU=sss/Ls; + int sF =sss; + DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); + DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); + DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); + DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); + DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); + DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); + DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); + DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); + }); +} + + template void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) @@ -354,7 +386,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto out_v = out.View(); auto st_v = st.View(); #define LoopBody(Dir) \ - if (gamma==Dir) { \ + case Dir : \ accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \ for(int s=0;s::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\ } \ }); \ - } + break; + 
switch(gamma){ LoopBody(Xp); LoopBody(Yp); LoopBody(Zp); @@ -373,7 +406,10 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S LoopBody(Ym); LoopBody(Zm); LoopBody(Tm); - + default: + assert(0); + break; + } #undef LoopBody } From 7c061e20c95a01868546deece1989bb84d147a04 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:40:13 -0500 Subject: [PATCH 29/43] All directions of dirac operator for fastt coarsening --- .../WilsonFermion5DImplementation.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index a2092960..613eaa7b 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -246,23 +246,8 @@ void WilsonFermion5D::DhopDirAll(const FermionField &in, std::vector=0)&&(dir<4) ); //must do x,y,z or t; - - int skip = (disp==1) ? 0 : 1; - int dirdisp = dir+skip*4; - int gamma = dir+(1-skip)*4; - - uint64_t Nsite = Umu.Grid()->oSites(); - Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out[dirdisp],dirdisp,gamma); - } - } + uint64_t Nsite = Umu.Grid()->oSites(); + Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out); }; From afc7426f390687778a673d7a3073b224c2f388a7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:41:16 -0500 Subject: [PATCH 30/43] Much bigger pointer cache in case of Nvidia due to cost of setting up UVM allocations --- Grid/allocator/AlignedAllocator.cc | 10 ++++++++-- Grid/allocator/AlignedAllocator.h | 11 ++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index ed27a8bd..d53c4dc2 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; +#ifdef GRID_NVCC +#define SMALL_LIMIT (0) +#else +#define SMALL_LIMIT (4096) +#endif + #ifdef POINTER_CACHE int PointerCache::victim; @@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; void *PointerCache::Insert(void *ptr,size_t bytes) { - if (bytes < 4096 ) return ptr; + if (bytes < SMALL_LIMIT ) return ptr; #ifdef GRID_OMP assert(omp_in_parallel()==0); @@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { void *PointerCache::Lookup(size_t bytes) { - if (bytes < 4096 ) return NULL; + if (bytes < SMALL_LIMIT ) return NULL; #ifdef GRID_OMP assert(omp_in_parallel()==0); diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 2aa7d82b..8c189be8 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid); #ifdef POINTER_CACHE class PointerCache { private: - +/*Pinning pages is costly*/ +/*Could maintain separate large and small allocation caches*/ +#ifdef GRID_NVCC + static const int Ncache=128; +#else static const int Ncache=8; +#endif static int victim; typedef struct { @@ -63,7 +68,6 @@ private: public: - static void *Insert(void *ptr,size_t bytes) ; static void *Lookup(size_t bytes) ; @@ -170,13 +174,14 @@ public: // Unified (managed) memory //////////////////////////////////// if ( ptr == (_Tp *) NULL ) { + // printf(" alignedAllocater cache miss %ld bytes 
",bytes); BACKTRACEFP(stdout); auto err = cudaMallocManaged((void **)&ptr,bytes); if( err != cudaSuccess ) { ptr = (_Tp *) NULL; std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " < Date: Mon, 27 Jan 2020 12:41:59 -0500 Subject: [PATCH 31/43] Less sloppy convergence test on PowerMethod --- Grid/algorithms/iterative/PowerMethod.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 8a238ea6..6aa8e923 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -30,12 +30,12 @@ template class PowerMethod RealD vden = norm2(src_n); RealD na = vnum/vden; - if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { + if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { evalMaxApprox = na; + std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; return evalMaxApprox; } evalMaxApprox = na; - std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; src_n = tmp; } assert(0); From b2736ec80b92d99abc5a7ff7539156021a19e50f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:42:48 -0500 Subject: [PATCH 32/43] Make PrecGCR recursive - it can precondition itself --- .../PrecGeneralisedConjugateResidual.h | 70 +++++++------------ 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h index d3188ecb..a61b62e0 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h @@ -38,10 +38,11 @@ Author: Peter Boyle /////////////////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); +#define GCRLogLevel std::cout << GridLogMessage < -class PrecGeneralisedConjugateResidual : public OperatorFunction { +class PrecGeneralisedConjugateResidual : public LinearFunction { public: - using OperatorFunction::operator(); RealD Tolerance; Integer MaxIterations; @@ -49,23 +50,29 @@ public: int mmax; int nstep; int steps; + int level; GridStopWatch PrecTimer; GridStopWatch MatTimer; GridStopWatch LinalgTimer; - LinearFunction &Preconditioner; + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; - PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction &Prec,int _mmax,int _nstep) : + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : Tolerance(tol), MaxIterations(maxit), + Linop(_Linop), Preconditioner(Prec), mmax(_mmax), nstep(_nstep) { + level=1; verbose=1; }; - void operator() (LinearOperatorBase &Linop,const Field &src, Field &psi){ + void operator() (const Field &src, Field &psi){ psi=Zero(); RealD cp, ssq,rsq; @@ -84,9 +91,9 @@ public: steps=0; for(int k=0;k &Linop,const Field &src, Field &psi,RealD rsq){ + RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD cp; RealD a, b; @@ -134,9 +143,7 @@ public: std::vector p(mmax,grid); std::vector qq(mmax); - std::cout< Date: Mon, 27 Jan 2020 12:43:29 -0500 Subject: [PATCH 33/43] Normal Equations can be used in HDCR now --- Grid/algorithms/iterative/NormalEquations.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git 
a/Grid/algorithms/iterative/NormalEquations.h b/Grid/algorithms/iterative/NormalEquations.h index 270b0115..df82b6dc 100644 --- a/Grid/algorithms/iterative/NormalEquations.h +++ b/Grid/algorithms/iterative/NormalEquations.h @@ -33,26 +33,30 @@ NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////////// // Take a matrix and form an NE solver calling a Herm solver /////////////////////////////////////////////////////////////////////////////////////////////////////// -template class NormalEquations : public OperatorFunction{ +template class NormalEquations { private: SparseMatrixBase & _Matrix; OperatorFunction & _HermitianSolver; - + LinearFunction & _Guess; public: ///////////////////////////////////////////////////// // Wrap the usual normal equations trick ///////////////////////////////////////////////////// - NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver) - : _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; + NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver, + LinearFunction &Guess) + : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; void operator() (const Field &in, Field &out){ Field src(in.Grid()); + Field tmp(in.Grid()); + MdagMLinearOperator,Field> MdagMOp(_Matrix); _Matrix.Mdag(in,src); - _HermitianSolver(src,out); // Mdag M out = Mdag in - + _Guess(src,out); + _HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in + } }; From 8cec294ec99a5a63436849ce1de0f6aebc35bf25 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 12:44:04 -0500 Subject: [PATCH 34/43] Make CG a bit less verbose as gettign annoying in nested algorithms. Can use Iterative logging if you want to see more --- Grid/algorithms/iterative/ConjugateGradient.h | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 398f578f..32ba4035 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -71,7 +71,6 @@ public: // Initial residual computation & set up RealD guess = norm2(psi); assert(std::isnan(guess) == 0); - Linop.HermOpAndNorm(psi, mmp, d, b); @@ -154,18 +153,18 @@ public: RealD resnorm = std::sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)< Date: Mon, 27 Jan 2020 12:44:51 -0500 Subject: [PATCH 35/43] Use explicit linalg calls to get coalesce optimisations on GPU --- Grid/algorithms/approx/Chebyshev.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 74789ead..133db2b4 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -252,20 +252,20 @@ public: RealD xscale = 2.0/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo); Linop.HermOp(T0,y); - T1=y*xscale+in*mscale; + axpby(T1,xscale,mscale,y,in); // sum = .5 c[0] T0 + c[1] T1 - out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1; + // out = ()*T0 + Coeffs[1]*T1; + axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1); for(int n=2;n Date: Mon, 27 Jan 2020 13:42:51 -0500 Subject: [PATCH 36/43] Optional MdagM without norms --- Grid/algorithms/SparseMatrix.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index fd713e9f..b959f53c 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -45,6 +45,10 @@ public: ni=M(in,tmp); no=Mdag(tmp,out); } + virtual void MdagM(const Field &in, Field &out) { + RealD ni, no; + MdagM(in,out,ni,no); + } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void MdirAll (const Field &in, std::vector &out)=0; From 76c823781e6b54b88657b695a8f77e2e13db0026 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 13:43:19 -0500 Subject: [PATCH 37/43] Much faster coarsening --- Grid/algorithms/CoarsenedMatrix.h | 496 ++++++++++++++++++++++++------ 1 file changed, 409 insertions(+), 87 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba71faf8..01b0da4d 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,3 +1,14 @@ + // blockZaxpy in bockPromote - 3s, 5% + // noncoalesced linalg in Preconditionoer ~ 3s 5% + // Lancos tuning or replace 10-20s ~ 25%, open ended + // setup tuning 5s ~ 8% + // -- e.g. ordermin, orderstep tunables. + // MdagM path without norm in LinOp code. few seconds + + // Mdir calc blocking kernels + // Fuse kernels in blockMaskedInnerProduct + // preallocate Vectors in Cayley 5D ~ few percent few seconds + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -34,6 +45,34 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +template +inline void blockMaskedInnerProduct(Lattice &CoarseInner1, + Lattice &CoarseInner2, + const Lattice &FineMask1, + const Lattice &FineMask2, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef decltype(innerProduct(vobj(),vobj())) dotp; + + GridBase *coarse(CoarseInner1.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice fine_inner_msk(fine); + + // Multiply could be fused with innerProduct + // Single block sum kernel could do both masks. 
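  // A minimal sketch of the fusion the two comments above suggest, assuming a
  // hypothetical helper blockMaskedSum2 that is NOT part of Grid: a single
  // kernel could read localInnerProduct(fineX,fineY) once per fine site and
  // accumulate it into both coarse sums, weighted by the two masks,
  //
  //   blockMaskedSum2(CoarseInner1,CoarseInner2, FineMask1,FineMask2, fineX,fineY);
  //
  // The patch keeps the multiply and the two blockSum calls separate: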
+ fine_inner = localInnerProduct(fineX,fineY); + + mult(fine_inner_msk, fine_inner,FineMask1); + blockSum(CoarseInner1,fine_inner_msk); + + mult(fine_inner_msk, fine_inner,FineMask2); + blockSum(CoarseInner2,fine_inner_msk); +} + + class Geometry { public: int npoint; @@ -51,10 +90,10 @@ public: directions.resize(npoint); displacements.resize(npoint); for(int d=0;d<_d;d++){ - directions[2*d ] = d+base; - directions[2*d+1] = d+base; - displacements[2*d ] = +1; - displacements[2*d+1] = -1; + directions[d ] = d+base; + directions[d+_d] = d+base; + displacements[d ] = +1; + displacements[d+_d]= -1; } directions [2*_d]=0; displacements[2*_d]=0; @@ -136,20 +175,15 @@ public: std::cout< &hermop, int nn, double hi, double lo, - int order, - int orderstep + int orderfilter, + int ordermin, + int orderstep, + double filterlo ) { RealD scale; @@ -215,7 +252,7 @@ public: int b =0; { // Filter - Chebyshev Cheb(lo,hi,order); + Chebyshev Cheb(lo,hi,orderfilter); Cheb(hermop,noise,Mn); // normalise scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; @@ -227,7 +264,7 @@ public: // Generate a full sequence of Chebyshevs { - lo=0; + lo=filterlo; noise=Mn; FineField T0(FineGrid); T0 = noise; @@ -245,7 +282,7 @@ public: hermop.HermOp(T0,y); T1=y*xscale+noise*mscale; - for(int n=2;n<=orderstep*(nn-1);n++){ + for(int n=2;n<=ordermin+orderstep*(nn-2);n++){ hermop.HermOp(*Tn,y); @@ -253,6 +290,7 @@ public: auto Tn_v = Tn->View(); auto Tnp_v = Tnp->View(); auto Tnm_v = Tnm->View(); + const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); @@ -260,12 +298,14 @@ public: // Possible more fine grained control is needed than a linear sweep, // but huge productivity gain if this is simple algorithm and not a tunable - if ( (n%orderstep)==0 ) { + int m =1; + if ( n>=ordermin ) m=n-ordermin; + if ( (m%orderstep)==0 ) { Mn=*Tnp; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; subspace[b] = Mn; hermop.Op(Mn,tmp); - std::cout< "< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ + + RealD alpha=-0.8; + RealD beta =-0.8; +#define FILTER(llo,hhi,oorder) \ + { \ + Chebyshev Cheb(llo,hhi,oorder); \ + /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,combined); \ + } + + double node = 0.000; + FILTERb(lo,hi,orderfilter);// 0 + // FILTERc(node,hi,51);// 0 + noise = Mn; + int base = 0; + int mult = 100; + FILTER(node,hi,base+1*mult); + FILTER(node,hi,base+2*mult); + FILTER(node,hi,base+3*mult); + FILTER(node,hi,base+4*mult); + FILTER(node,hi,base+5*mult); + FILTER(node,hi,base+6*mult); + FILTER(node,hi,base+7*mult); + FILTER(node,hi,base+8*mult); + FILTER(node,hi,base+9*mult); + FILTER(node,hi,base+10*mult); + FILTER(node,hi,base+11*mult); + 
FILTER(node,hi,base+12*mult); + FILTER(node,hi,base+13*mult); + FILTER(node,hi,base+14*mult); + FILTER(node,hi,base+15*mult); + assert(b==nn); + } +#endif + +#if 0 + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); + // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); + //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); + // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); + JacobiPoly(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< class CoarsenedMatrix : public SparseMatrixBase > > { public: - typedef iVector siteVector; + typedef iVector siteVector; + typedef Lattice CoarseComplexField; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; typedef iMatrix Cobj; @@ -304,7 +541,6 @@ public: CartesianStencil Stencil; std::vector A; - /////////////////////// // Interface @@ -316,7 +552,6 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - // RealD Nin = norm2(in); SimpleCompressor compressor; @@ -333,16 +568,14 @@ public: Aview *Aview_p = & AcceleratorViewContainer[0]; const int Nsimd = CComplex::Nsimd(); - typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0](0))) calcComplex; GridStopWatch ArithmeticTimer; int osites=Grid()->oSites(); - double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; - double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + // double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + // double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); double usecs =-usecond(); - // assert(geom.npoint==9); accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { @@ -418,7 +651,37 @@ public: auto out_v = out.View(); auto in_v = in.View(); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + int lane=SIMTlane(Nsimd); + SE=Stencil.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + } else { + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bboSites(),1,{ + siteVector res = Zero(); siteVector nbr; int ptype; @@ -433,18 +696,23 @@ public: } else { nbr = Stencil.CommBuf()[SE->_offset]; } + synchronise(); res = res + Aview_p[point][ss]*nbr; out_v[ss]=res; }); - +#endif } void MdirAll(const CoarseVector &in,std::vector &out) { this->MdirComms(in); int ndir=geom.npoint-1; - assert(out.size()==ndir); + if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { + std::cout <<"MdirAll out size "<< out.size()< > &linop, - Aggregation & Subspace){ + Aggregation & Subspace) + { + typedef Lattice FineComplexField; + typedef typename Fobj::scalar_type scalar_type; - FineField iblock(FineGrid); // contributions from within this block 
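      // The iblock/oblock fields being removed here were rebuilt with a where()
      // conditional for every basis vector and stencil point; the replacement
      // below precomputes one boundary mask per stencil point (names as in the
      // new code):
      //
      //   masks[p] = where(mod(coor,block)==(block-1), one, zero);   // disp == +1
      //   masks[p] = where(mod(coor,block)==(Integer)0, one, zero);  // disp == -1
      //   imask    = one - omask;                                    // interior
      //
      // so each Dhop application is split into intra-block and inter-block
      // contributions via blockMaskedInnerProduct instead of repeated where() calls.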
- FineField oblock(FineGrid); // contributions from outwith this block + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); + FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); + + std::vector masks(geom.npoint,FineGrid); + FineComplexField imask(FineGrid); // contributions from within this block + FineComplexField omask(FineGrid); // contributions from outwith this block + + FineComplexField evenmask(FineGrid); + FineComplexField oddmask(FineGrid); FineField phi(FineGrid); FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + FineField Mphie(FineGrid); + FineField Mphio(FineGrid); std::vector Mphi_p(geom.npoint,FineGrid); - Lattice > coor(FineGrid); + Lattice > coor (FineGrid); + Lattice > bcoor(FineGrid); + Lattice > bcb (FineGrid); CoarseVector iProj(Grid()); CoarseVector oProj(Grid()); - CoarseScalar InnerProd(Grid()); + CoarseVector SelfProj(Grid()); + CoarseComplexField iZProj(Grid()); + CoarseComplexField oZProj(Grid()); + CoarseScalar InnerProd(Grid()); // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); @@ -525,22 +810,46 @@ public: // Compute the matrix elements of linop between this orthonormal // set of vectors. int self_stencil=-1; - for(int p=0;p_rdimensions[dir])/(Grid()->_rdimensions[dir]); + + LatticeCoordinate(coor,dir); + + /////////////////////////////////////////////////////// + // Work out even and odd block checkerboarding for fast diagonal term + /////////////////////////////////////////////////////// + if ( disp==1 ) { + bcb = bcb + div(coor,block); + } + + if ( disp==0 ) { + masks[p]= Zero(); + } else if ( disp==1 ) { + masks[p] = where(mod(coor,block)==(block-1),one,zero); + } else if ( disp==-1 ) { + masks[p] = where(mod(coor,block)==(Integer)0,one,zero); + } } + evenmask = where(mod(bcb,2)==(Integer)0,one,zero); + oddmask = one-evenmask; + assert(self_stencil!=-1); for(int i=0;i_rdimensions[dir])/(Grid()->_rdimensions[dir]); + if ( (disp==-1) || (!hermitian ) ) { - LatticeCoordinate(coor,dir); + //////////////////////////////////////////////////////////////////////// + // Pick out contributions coming from this cell and neighbour cell + //////////////////////////////////////////////////////////////////////// + omask = masks[p]; + imask = one-omask; + + for(int j=0;joSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); - //////////////////////////////////////////////////////////////////////// - // Pick out contributions coming from this cell and neighbour cell - //////////////////////////////////////////////////////////////////////// - if ( disp==0 ) { - iblock = Mphi; - oblock = Zero(); - } else if ( disp==1 ) { - oblock = where(mod(coor,block)==(block-1),Mphi,zz); - iblock = where(mod(coor,block)!=(block-1),Mphi,zz); - } else if ( disp==-1 ) { - oblock = where(mod(coor,block)==(Integer)0,Mphi,zz); - iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz); - } else { - assert(0); + } + } + } + + /////////////////////////////////////////// + // Faster alternate self coupling.. 
use hermiticity to save 2x + /////////////////////////////////////////// + { + mult(tmp,phi,evenmask); linop.Op(tmp,Mphie); + mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); + + // tmp = Mphie*evenmask + Mphio*oddmask; + { + auto tmp_ = tmp.View(); + auto evenmask_ = evenmask.View(); + auto oddmask_ = oddmask.View(); + auto Mphie_ = Mphie.View(); + auto Mphio_ = Mphio.View(); + accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ + coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); + }); } - // Could do local inner products, - // and then block pick the IP's. - // Ideally write a routine to do two masked block sums at once - std::cout << GridLogMessage<< "CoarsenMatrix picked "<oSites(),1,{ + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j Date: Mon, 27 Jan 2020 13:43:49 -0500 Subject: [PATCH 38/43] Add Jacobi polynomials --- Grid/algorithms/Algorithms.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index f1ac1c81..97ab4dc1 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -35,6 +35,7 @@ Author: Peter Boyle #include #include +#include #include #include #include From 2b5de5bba5c178eb072d753866cba42436545082 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 13:44:30 -0500 Subject: [PATCH 39/43] MdagM operator without norm option --- Grid/algorithms/LinearOperator.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index 4ea8ca8b..26f22ad2 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -97,8 +97,7 @@ public: _Mat.MdagM(in,out,n1,n2); } void HermOp(const Field &in, Field &out){ - RealD n1,n2; - HermOpAndNorm(in,out,n1,n2); + _Mat.MdagM(in,out); } }; @@ -172,7 +171,6 @@ public: } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ _Mat.M(in,out); - ComplexD dot= innerProduct(in,out); n1=real(dot); n2=norm2(out); } From 852fc1b00174b255e41d00591363ce494de8b8f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jan 2020 13:45:10 -0500 Subject: [PATCH 40/43] True Hierachical multigrid for DWF --- tests/solver/Test_dwf_hdcr.cc | 715 +++++++++++----------------------- 1 file changed, 224 insertions(+), 491 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 5a131a57..873530ff 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,3 +1,5 @@ + + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -29,357 +31,174 @@ Author: paboyle /* END LEGAL */ #include #include -//#include using namespace std; using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 -class myclass: Serializable { -public: - - GRID_SERIALIZABLE_CLASS_MEMBERS(myclass, - int, domaindecompose, - int, domainsize, - int, order, - int, Ls, - double, mq, - double, lo, - double, hi, - int, steps); - - myclass(){}; - -}; + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ RealD InverseApproximation(RealD x){ return 1.0/x; } -template +template 
class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template class MultiGridPreconditioner : public LinearFunction< Lattice > { public: typedef Aggregation Aggregates; typedef CoarsenedMatrix CoarseOperator; - - typedef typename Aggregation::siteVector siteVector; - typedef typename Aggregation::CoarseScalar CoarseScalar; typedef typename Aggregation::CoarseVector CoarseVector; typedef typename Aggregation::CoarseMatrix CoarseMatrix; typedef typename Aggregation::FineField FineField; typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; Aggregates & _Aggregates; CoarseOperator & _CoarseOperator; Matrix & _FineMatrix; FineOperator & _FineOperator; - Matrix & _SmootherMatrix; - FineOperator & _SmootherOperator; Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; - double cheby_hi; - double cheby_lo; - int cheby_ord; + int level; void Level(int lv) {level = lv; }; - myclass _params; +#define GridLogLevel std::cout << GridLogMessage < fMdagMOp(_FineMatrix); - - p1=in; - for(int i=0;i<50;i++){ - RealD absp1=std::sqrt(norm2(p1)); - fMdagMOp.HermOp(p1,p2);// this is the G5 herm bit - // _FineOperator.Op(p1,p2);// this is the G5 herm bit - RealD absp2=std::sqrt(norm2(p2)); - if(i%10==9) - std::cout< CG(1.0e-3,100,false); - ConjugateGradient fCG(1.0e-3,10,false); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - MdagMLinearOperator fMdagMOp(_FineMatrix); - - FineField tmp(in.Grid()); - FineField res(in.Grid()); - FineField Min(in.Grid()); - - // Monitor completeness of low mode space - _Aggregates.ProjectToSubspace (Csrc,in); - _Aggregates.PromoteFromSubspace(Csrc,out); - std::cout< CG(1.0e-10,100000); - ConjugateGradient fCG(1.0e-3,1000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - ShiftedMdagMLinearOperator fMdagMOp(_FineMatrix,0.1); - - FineField tmp(in.Grid()); - FineField res(in.Grid()); - FineField Qin(in.Grid()); - - // Monitor completeness of low mode space - // _Aggregates.ProjectToSubspace (Csrc,in); - // _Aggregates.PromoteFromSubspace(Csrc,out); - // std::cout< fMdagMOp(_FineMatrix); - ShiftedMdagMLinearOperator fMdagMOp(_SmootherMatrix,0.0); - - 
RealD Ni,r; - - Ni = norm2(in); - - for(int ilo=0;ilo<3;ilo++){ - for(int ord=5;ord<50;ord*=2){ - - std::cout << " lo "< Cheby (lo[ilo],70.0,ord,InverseApproximation); - Cheby(fMdagMOp,vec1,vec2); // solves MdagM = g5 M g5M - - _FineOperator.Op(vec2,vec1);// this is the G5 herm bit - vec1 = in - vec1; // tmp = in - A Min - r=norm2(vec1); - std::cout< CG(5.0e-2,100000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - // MdagMLinearOperator fMdagMOp(_FineMatrix); - ShiftedMdagMLinearOperator fMdagMOp(_SmootherMatrix,0.0); - FineField vec1(in.Grid()); FineField vec2(in.Grid()); - Chebyshev Cheby (_params.lo,_params.hi,_params.order,InverseApproximation); - Chebyshev ChebyAccu(_params.lo,_params.hi,_params.order,InverseApproximation); + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); const int nbasis= 32; + const int nbasisc= 32; auto clatt = GridDefaultLatt(); for(int d=0;d seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -407,49 +233,20 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); - - Gamma g5(Gamma::Algebra::Gamma5); - LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; - LatticeFermion result(FGrid); result=Zero(); - LatticeFermion ref(FGrid); ref=Zero(); - LatticeFermion tmp(FGrid); - LatticeFermion err(FGrid); + LatticeFermion result(FGrid); LatticeGaugeField Umu(UGrid); - LatticeGaugeField UmuDD(UGrid); - LatticeColourMatrix U(UGrid); - LatticeColourMatrix zz(UGrid); FieldMetaData header; std::string file("./ckpoint_lat.4000"); NerscIO::readConfiguration(Umu,header,file); - - if ( params.domaindecompose ) { - Lattice > coor(UGrid); - zz=Zero(); - for(int mu=0;mu(Umu,mu); - U = where(mod(coor,params.domainsize)==(Integer)0,zz,U); - PokeIndex(UmuDD,U,mu); - } - } else { - UmuDD = Umu; - } - // SU3::ColdConfiguration(RNG4,Umu); - // SU3::TepidConfiguration(RNG4,Umu); - // SU3::HotConfiguration(RNG4,Umu); - // Umu=Zero(); - - RealD mass=params.mq; - RealD M5=1.8; - std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -463,204 +260,140 @@ int main (int argc, char ** argv) Subspace Aggregates(Coarse5d,FGrid,0); assert ( (nbasis & 0x1)==0); - { int nb=nbasis/2; - std::cout< HermIndefOp(Ddwf); - Gamma5R5HermitianLinearOperator HermIndefOpDD(DdwfDD); - CoarsenedMatrix LDOp(*Coarse5d,1); // Hermitian matrix - LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - exit(0); + typedef CoarsenedMatrix Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - CoarseVector c_src (Coarse5d); - CoarseVector c_res (Coarse5d); - gaussian(CRNG,c_src); - result=Zero(); - c_res=Zero(); ////////////////////////////////////////////////// // Deflate the course space. Recursive multigrid? 
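  //
  // A rough sketch of the recursion assembled below (class and variable names
  // as defined in this file; indicative only, not a literal call graph):
  //
  //   l1PGCR (fine outer solver, tol 1e-8)
  //    `- ThreeLevelPrecon = FineSmoother (Chebyshev of HermIndefOp/Ddwf)
  //                          + coarse correction solved by l2PGCR
  //        `- l2PGCR (coarse solver, tol 0.1)
  //            `- Level2Precon = CoarseSmoother (Chebyshev of L1LinOp/LDOp)
  //                              + coarse-coarse correction via CG on L2Op,
  //                                deflated with the IRL (Lanczos) eigenvectors
  //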
////////////////////////////////////////////////// - - typedef CoarsenedMatrix Level1Op; - typedef CoarsenedMatrix,nbasis> Level2Op; - - auto cclatt = clatt; - for(int d=0;d,nbasis> CoarseSubspace; + typedef Aggregation,nbasisc> CoarseSubspace; CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); - double c_first = 0.2; - double c_div = 1.2; - std::vector c_lo(nbasis/2); - c_lo[0] = c_first; - for(int b=1;b c_ord(nbasis/2,200); - c_ord[0]=500; - -#define RECURSIVE_MULTIGRID -#ifdef RECURSIVE_MULTIGRID std::cout< PosdefLdop(LDOp); - // CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nbasis,14.0,c_lo,c_ord); - // CoarseAggregates.CreateSubspaceRandom(CRNG); + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } - // Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix - // HermitianLinearOperator L1LinOp(LDOp); - // L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); -#endif + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + typedef Level2Op::CoarseVector CoarseCoarseVector; + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - // std::cout< simple; - // ConjugateGradient fCG(1.0e-8,100000); - // fCG(HermDefOp,src,result); + MdagMLinearOperator IRLHermOpL2(L2Op); + Chebyshev IRLChebyL2(0.001,4.2,71); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); - std::cout< HermOpEO(Ddwf); - ConjugateGradient pCG(1.0e-8,10000); - // pCG(HermOpEO,src_o,result_o); + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,12.0,3,L1LinOp,LDOp); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + + // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); - std::cout< IRLHermOp(LDOp); - Chebyshev IRLCheby(0.005,16.0,51); - // IRLCheby.InitLowPass(0.01,18.0,51); - FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); - PlainHermOp IRLOp (IRLHermOp); - - int Nstop=24; - int Nk=24; - int Nm=48; - ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); - int Nconv; - std::vector eval(Nm); - std::vector evec(Nm,Coarse5d); - IRL.calc(eval,evec,c_src,Nconv); - - - std::cout< CG(3.0e-3,100000); - // CG(PosdefLdop,c_src,c_res); - - 
std::cout< DeflCoarseGuesser(evec,eval); - DeflCoarseGuesser(c_src,c_res); - // CG(PosdefLdop,c_src,c_res); - - std::cout< CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); - MultiGridPreconditioner > - Precon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf, - CoarseZeroGuesser, - params); + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); - // Precon.PowerMethod(src); - - /* std::cout<,nbasis,Level1Op,ZeroGuesser > - CoarsePrecon (CoarseAggregates, - L2Op, - L1LinOp,LDOp, - L1LinOp,LDOp, - CoarseZeroGuesser, - cparams); - - CoarsePrecon.PowerMethod(c_src); - */ - - /* - std::cout< PGCR(1.0e-8,100000,Precon,8,8); - std::cout< > - DeflatedPrecon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf, - DeflCoarseGuesser, - params); - - PrecGeneralisedConjugateResidual deflPGCR(1.0e-8,100000,DeflatedPrecon,16,16); - - std::cout< CPGCR(1.0e-3,10000,CoarsePrecon,8,8); - std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); std::cout< Date: Fri, 3 Apr 2020 19:52:15 +0100 Subject: [PATCH 41/43] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4b0a86f8..9f690ce0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) +# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) **Data parallel C++ mathematical object library.** From 3b0e07882f3f4f808c69cd5922a5aacc50315eee Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 10 Apr 2020 11:28:33 -0400 Subject: [PATCH 42/43] Adding another form of polynomial --- Grid/algorithms/approx/JacobiPolynomial.h | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 Grid/algorithms/approx/JacobiPolynomial.h diff --git a/Grid/algorithms/approx/JacobiPolynomial.h b/Grid/algorithms/approx/JacobiPolynomial.h new file mode 100644 index 00000000..e68d1dd7 --- /dev/null +++ b/Grid/algorithms/approx/JacobiPolynomial.h @@ -0,0 +1,129 @@ +#ifndef GRID_JACOBIPOLYNOMIAL_H +#define GRID_JACOBIPOLYNOMIAL_H + +#include + +NAMESPACE_BEGIN(Grid); + +template +class JacobiPolynomial : public OperatorFunction { + private: + using OperatorFunction::operator(); + + int order; + RealD hi; + RealD lo; + RealD alpha; + RealD beta; + + public: + void csv(std::ostream &out){ + csv(out,lo,hi); + } + void csv(std::ostream &out,RealD llo,RealD hhi){ + RealD diff = hhi-llo; + RealD delta = diff*1.0e-5; + for (RealD x=llo-delta; x<=hhi; x+=delta) { + RealD f = approx(x); + out<< x<<" "< &Linop, const Field &in, Field &out) { + GridBase *grid=in.Grid(); + + int 
vol=grid->gSites();
+
+    Field T0(grid);
+    Field T1(grid);
+    Field T2(grid);
+    Field y(grid);
+
+
+    Field *Tnm = &T0;
+    Field *Tn  = &T1;
+    Field *Tnp = &T2;
+
+    //    RealD T0=1.0;
+    T0=in;
+
+    //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+    //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
+    //    i.e. y = xscale*M*in + mscale*in maps the spectrum [lo,hi] onto [-1,1]
+    RealD xscale = 2.0/(hi-lo);
+    RealD mscale = -(hi+lo)/(hi-lo);
+    Linop.HermOp(T0,y);
+    y=y*xscale+in*mscale;
+
+    //    RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
+    RealD halfAmB  = (alpha-beta)*0.5;
+    RealD halfApBp2= (alpha+beta+2.0)*0.5;
+    T1 = halfAmB * in + halfApBp2*y;
+
+    for(int n=2;n<=order;n++){
+
+      Linop.HermOp(*Tn,y);
+      y=xscale*y+mscale*(*Tn);
+
+      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
+      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
+      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
+      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
+
+      // Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
+      // normalise each recurrence coefficient by cnp exactly once
+      cny=cny/cnp;
+      cn1=cn1/cnp;
+      cnm=cnm/cnp;
+
+      *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
+
+      // Cycle pointers to avoid copies
+      Field *swizzle = Tnm;
+      Tnm    =Tn;
+      Tn     =Tnp;
+      Tnp    =swizzle;
+    }
+    out=*Tn; // after the final pointer cycle *Tn holds the highest-order term
+
+  }
+};
+NAMESPACE_END(Grid);
+#endif

From 014dbfa4645687770741012d894734258ad7320c Mon Sep 17 00:00:00 2001
From: Peter Boyle 
Date: Fri, 10 Apr 2020 11:57:09 -0400
Subject: [PATCH 43/43] Compile fix with OpDirAll

---
 Hadrons/Modules/MDistil/LapEvec.hpp     | 1 +
 tests/lanczos/Test_synthetic_lanczos.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Hadrons/Modules/MDistil/LapEvec.hpp b/Hadrons/Modules/MDistil/LapEvec.hpp
index 3c1122ca..4576ffe3 100644
--- a/Hadrons/Modules/MDistil/LapEvec.hpp
+++ b/Hadrons/Modules/MDistil/LapEvec.hpp
@@ -198,6 +198,7 @@ public:
     }
   }
 
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ HADRONS_ERROR(Definition, "OpDirAll() undefined"); };
   void OpDiag (const Field &in, Field &out) { HADRONS_ERROR(Definition, "OpDiag() undefined"); };
   void OpDir  (const Field &in, Field &out,int dir,int disp) { HADRONS_ERROR(Definition, "OpDir() undefined"); };
   void Op     (const Field &in, Field &out) { HADRONS_ERROR(Definition, "Op() undefined"); };
diff --git a/tests/lanczos/Test_synthetic_lanczos.cc b/tests/lanczos/Test_synthetic_lanczos.cc
index 7a3591dd..a1e0e672 100644
--- a/tests/lanczos/Test_synthetic_lanczos.cc
+++ b/tests/lanczos/Test_synthetic_lanczos.cc
@@ -73,6 +73,7 @@ public:
   }
 
   // Support for coarsening to a multigrid
+  void OpDirAll  (const Field &in, std::vector<Field> &out){};
   void OpDiag (const Field &in, Field &out) {};
   void OpDir  (const Field &in, Field &out,int dir,int disp){};
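
PATCH 42/43 adds the JacobiPolynomial approximation without a test or usage example. The sketch below shows how such a filter might be driven through the same OperatorFunction interface as the Chebyshev filters used in the subspace-setup patches earlier in this series. It is a minimal illustration, not part of the patches: the constructor argument order (lo, hi, order, alpha, beta) is inferred from the private members declared in JacobiPolynomial.h, and JacobiFilterExample, the spectral window, the polynomial order and the Jacobi weights are invented for the example.

#include <Grid/Grid.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>

using namespace Grid;

// Apply an order-N Jacobi polynomial in a Hermitian operator to a source field,
// mirroring the way the Chebyshev filters are applied during subspace setup.
template<class Field>
void JacobiFilterExample(LinearOperatorBase<Field> &HermOp,
                         const Field &src, Field &filtered)
{
  // Illustrative parameters only: a window [lo,hi] bracketing the operator
  // spectrum, the polynomial order, and the Jacobi weights (alpha=beta=0
  // would reduce to Legendre polynomials).
  RealD lo = 0.1, hi = 64.0;
  int   order = 128;
  RealD alpha = 2.0, beta = 2.0;

  // Assumed constructor argument order, matching the private members
  // lo, hi, order, alpha, beta of the class.
  JacobiPolynomial<Field> Jacobi(lo, hi, order, alpha, beta);

  // Same calling convention as Chebyshev<Field>: filtered = P_order(M) src
  Jacobi(HermOp, src, filtered);

  // The csv() helper dumps the scalar polynomial for inspection, e.g.
  //   std::ofstream of("jacobi.csv"); Jacobi.csv(of);
}

In a CreateSubspaceChebyshev-style setup, a filter of this form could be substituted for the Chebyshev filter applied to each Gaussian noise vector; whether the extra (alpha, beta) freedom improves the low-mode coverage of the subspace is an experiment, not a claim made by these patches.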