diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index 8bee43cc..3d0a2a75 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -37,211 +37,6 @@ Author: Christoph Lehner NAMESPACE_BEGIN(Grid); - //////////////////////////////////////////////////////// - // Move following 100 LOC to lattice/Lattice_basis.h - //////////////////////////////////////////////////////// -template -void basisOrthogonalize(std::vector &basis,Field &w,int k) -{ - // If assume basis[j] are already orthonormal, - // can take all inner products in parallel saving 2x bandwidth - // Save 3x bandwidth on the second line of loop. - // perhaps 2.5x speed up. - // 2x overall in Multigrid Lanczos - for(int j=0; j -void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) -{ - typedef decltype(basis[0].View()) View; - auto tmp_v = basis[0].View(); - Vector basis_v(basis.size(),tmp_v); - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0].Grid(); - - for(int k=0;k > Bt(thread_max() * Nm); // Thread private - thread_region - { - vobj* B = Bt.data() + Nm * thread_num(); - - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); - uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - - // printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock); - - Vector Bt(siteBlock * nrot); - auto Bp=&Bt[0]; - - // GPU readable copy of Eigen matrix - Vector Qt_jv(Nm*Nm); - double *Qt_p = & Qt_jv[0]; - for(int k=0;k -void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) -{ - typedef decltype(basis[0].View()) View; - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0].Grid(); - - result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.View(); - Vector basis_v(basis.size(),result_v); - for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; - for(int k=0;koSites(),vobj::Nsimd(),{ - auto B=coalescedRead(zz); - for(int k=k0; k -void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) -{ - int vlen = idx.size(); - - assert(vlen>=1); - assert(vlen<=sort_vals.size()); - assert(vlen<=_v.size()); - - for (size_t i=0;ii for which _vnew[j] = _vold[i], - // track the move idx[j] => idx[i] - // track the move idx[i] => i - ////////////////////////////////////// - size_t j; - for (j=i;j i); assert(j!=idx.size()); assert(idx[j]==i); - - swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy - std::swap(sort_vals[i],sort_vals[idx[i]]); - - idx[j] = idx[i]; - idx[i] = i; - } - } -} - -inline std::vector basisSortGetIndex(std::vector& sort_vals) -{ - std::vector idx(sort_vals.size()); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { - return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); - }); - return idx; -} - -template -void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) -{ - std::vector idx = basisSortGetIndex(sort_vals); - if (reverse) - std::reverse(idx.begin(), idx.end()); - - basisReorderInPlace(_v,sort_vals,idx); -} - -// PAB: faster to compute the inner products first then fuse loops. -// If performance critical can improve. -template -void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { - result = Zero(); - assert(_v.size()==eval.size()); - int N = (int)_v.size(); - for (int i=0;i #include #include #include - +#include diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 0b03dea0..74525cc1 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -116,7 +116,6 @@ public: int target; cudaGetDevice(&target); cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); - //std::cout<< GridLogMessage << "To Device " << target << std::endl; #endif #endif }; @@ -125,7 +124,6 @@ public: #ifdef GRID_NVCC #ifndef __CUDA_ARCH__ // only on host cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); - //std::cout<< GridLogMessage << "To Host" << std::endl; #endif #endif }; @@ -425,7 +423,6 @@ public: // copy constructor /////////////////////////////////////////// Lattice(const Lattice& r){ - // std::cout << "Lattice constructor(const Lattice &) "<_grid = r.Grid(); resize(this->_grid->oSites()); *this = r; @@ -448,7 +445,6 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - //std::cout << GridLogMessage << "Copy other" << std::endl; auto me = AcceleratorView(ViewWrite); auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ @@ -463,7 +459,6 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - //std::cout << GridLogMessage << "Copy same" << std::endl; auto me = AcceleratorView(ViewWrite); auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c80e7db2..c23ddcdc 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -6,6 +6,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -63,6 +64,7 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } }); } + template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.Checkerboard(); auto half_v = half.View(); @@ -81,25 +83,130 @@ template inline void setCheckerboard(Lattice &full,const Latti } }); } - -template + +//////////////////////////////////////////////////////////////////////////////////////////// +// Flexible Type Conversion for internal promotion to double as well as graceful +// treatment of scalar-compatible types +//////////////////////////////////////////////////////////////////////////////////////////// +accelerator_inline void convertType(ComplexD & out, const std::complex & in) { + out = in; +} + +accelerator_inline void convertType(ComplexF & out, const std::complex & in) { + out = in; +} + +#ifdef __CUDA_ARCH__ +accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { + ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; +} +accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { + ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; +} +accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { + ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; +} +#endif + +accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { + out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v); +} + +accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { + Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); +} + +template + accelerator_inline void convertType(iMatrix & out, const iMatrix & in); +template + accelerator_inline void convertType(iVector & out, const iVector & in); + +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const iScalar & in) { + convertType(out,in._internal); +} + +template +accelerator_inline void convertType(iScalar & out, const T2 & in) { + convertType(out._internal,in); +} + +template +accelerator_inline void convertType(iMatrix & out, const iMatrix & in) { + for (int i=0;i +accelerator_inline void convertType(iVector & out, const iVector & in) { + for (int i=0;i::value, T>::type* = nullptr> +accelerator_inline void convertType(T & out, const T & in) { + out = in; +} + +template +accelerator_inline void convertType(Lattice & out, const Lattice & in) { + auto out_v = out.AcceleratorView(ViewWrite); + auto in_v = in.AcceleratorView(ViewRead); + + accelerator_for(ss,out_v.size(),T1::Nsimd(),{ + convertType(out_v[ss],in_v(ss)); + }); +} + +//////////////////////////////////////////////////////////////////////////////////////////// +// precision-promoted local inner product +//////////////////////////////////////////////////////////////////////////////////////////// +template +inline auto localInnerProductD(const Lattice &lhs,const Lattice &rhs) +-> Lattice> +{ + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); + + typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; + Lattice> ret(lhs.Grid()); + auto ret_v = ret.AcceleratorView(ViewWrite); + + accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ + convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); + }); + + return ret; +} + +//////////////////////////////////////////////////////////////////////////////////////////// +// block routines +//////////////////////////////////////////////////////////////////////////////////////////// +template inline void blockProject(Lattice > &coarseData, - const Lattice &fineData, - const std::vector > &Basis) + const Lattice &fineData, + const VLattice &Basis) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); - Lattice ip(coarse); + Lattice> ip(coarse); + Lattice fineDataRed = fineData; // auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); - auto ip_ = ip.View(); + auto coarseData_ = coarseData.AcceleratorView(ViewWrite); + auto ip_ = ip.AcceleratorView(ViewReadWrite); for(int v=0;v accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { - coalescedWrite(coarseData_[sc](v),ip_(sc)); + convertType(coarseData_[sc](v),ip_[sc]); }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } @@ -166,11 +273,11 @@ inline void blockProject1(Lattice > &coarseData, return; } -template -inline void blockZAXPY(Lattice &fineZ, - const Lattice &coarseA, - const Lattice &fineX, - const Lattice &fineY) +template + inline void blockZAXPY(Lattice &fineZ, + const Lattice &coarseA, + const Lattice &fineX, + const Lattice &fineY) { GridBase * fine = fineZ.Grid(); GridBase * coarse= coarseA.Grid(); @@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice &fineZ, conformable(fineX,fineZ); int _ndimension = coarse->_ndimension; - + Coordinate block_r (_ndimension); // FIXME merge with subdivide checking routine as this is redundant @@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice &fineZ, assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ = fineZ.View(); - auto fineX_ = fineX.View(); - auto fineY_ = fineY.View(); - auto coarseA_= coarseA.View(); + auto fineZ_ = fineZ.AcceleratorView(ViewWrite); + auto fineX_ = fineX.AcceleratorView(ViewRead); + auto fineY_ = fineY.AcceleratorView(ViewRead); + auto coarseA_= coarseA.AcceleratorView(ViewRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { - - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + int sc; + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); - // z = A x + y - coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - }); + // z = A x + y +#ifdef __CUDA_ARCH__ + typename vobj2::tensor_reduced::scalar_object cA; + typename vobj::scalar_object cAx; +#else + typename vobj2::tensor_reduced cA; + vobj cAx; +#endif + convertType(cA,TensorRemove(coarseA_(sc))); + auto prod = cA*fineX_(sf); + convertType(cAx,prod); + coalescedWrite(fineZ_[sf],cAx+fineY_(sf)); + + }); return; } + template + inline void blockInnerProductD(Lattice &CoarseInner, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef iScalar dotp; + + GridBase *coarse(CoarseInner.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice coarse_inner(coarse); + + auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); + auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); + + // Precision promotion + fine_inner = localInnerProductD(fineX,fineY); + blockSum(coarse_inner,fine_inner); + accelerator_for(ss, coarse->oSites(), 1, { + convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); + }); + +} + +template // deprecate inline void blockInnerProduct(Lattice &CoarseInner, const Lattice &fineX, const Lattice &fineY) @@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? - auto CoarseInner_ = CoarseInner.View(); - auto coarse_inner_ = coarse_inner.View(); + auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); + auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); @@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, CoarseInner_[ss] = coarse_inner_[ss]; }); } + template inline void blockNormalise(Lattice &ip,Lattice &fineX) { @@ -248,7 +392,7 @@ inline void blockNormalise(Lattice &ip,Lattice &fineX) // useful in multigrid project; // Generic name : Coarsen? template -inline void blockSum(Lattice &coarseData,const Lattice &fineData) +inline void blockSum(Lattice &coarseData,const Lattice &fineData) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); @@ -256,42 +400,41 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) subdivides(coarse,fine); // require they map int _ndimension = coarse->_ndimension; - + Coordinate block_r (_ndimension); - + for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } int blockVol = fine->oSites()/coarse->oSites(); - // Turn this around to loop threaded over sc and interior loop - // over sf would thread better - auto coarseData_ = coarseData.View(); - auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); + auto fineData_ = fineData.AcceleratorView(ViewRead); accelerator_for(sc,coarse->oSites(),1,{ - // One thread per sub block - Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + // One thread per sub block + Coordinate coor_c(_ndimension); + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); - for(int sb=0;sb_rdimensions); + for(int sb=0;sb_rdimensions); - }); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; + } + + }); return; } + template inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice &picked,Coordinate coor) { @@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice -inline void blockOrthogonalise(Lattice &ip,std::vector > &Basis) +template +inline void blockOrthonormalize(Lattice &ip,VLattice &Basis) { GridBase *coarse = ip.Grid(); GridBase *fine = Basis[0].Grid(); @@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice &ip,std::vector > int nbasis = Basis.size() ; // checks - subdivides(coarse,fine); + subdivides(coarse,fine); for(int i=0;i (Basis[v],ip,Basis[u],Basis[v]); + blockZAXPY(Basis[v],ip,Basis[u],Basis[v]); } blockNormalise(ip,Basis[v]); } } +template +inline void blockOrthogonalise(Lattice &ip,std::vector > &Basis) // deprecated inaccurate naming +{ + blockOrthonormalize(ip,Basis); +} + #if 0 +// TODO: CPU optimized version here template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -383,24 +533,18 @@ inline void blockPromote(const Lattice > &coarseData, } #else -template +template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, - const std::vector > &Basis) + const VLattice &Basis) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); - fineData=Zero(); for(int i=0;i > ip = PeekIndex<0>(coarseData,i); - Lattice cip(coarse); - auto cip_ = cip.View(); - auto ip_ = ip.View(); - accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ - coalescedWrite(cip_[sc], ip_(sc)()); - }); - blockZAXPY(fineData,cip,Basis[i],fineData); + auto ip_ = ip.AcceleratorView(ViewRead); + blockZAXPY(fineData,ip,Basis[i],fineData); } } #endif @@ -470,8 +614,8 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.View(); - auto f_v = From.View(); + auto t_v = To.AcceleratorView(ViewWrite); + auto f_v = From.AcceleratorView(ViewRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd);