diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h new file mode 100644 index 00000000..22b7725e --- /dev/null +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -0,0 +1,241 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_PREC_GCR_NON_HERM_H +#define GRID_PREC_GCR_NON_HERM_H + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +//VPGCR Abe and Zhang, 2005. +//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING +//Computing and Information Volume 2, Number 2, Pages 147-161 +//NB. Likely not original reference since they are focussing on a preconditioner variant. +// but VPGCR was nicely written up in their paper +/////////////////////////////////////////////////////////////////////////////////////////////////////// +NAMESPACE_BEGIN(Grid); + +#define GCRLogLevel std::cout << GridLogMessage < +class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction { +public: + + RealD Tolerance; + Integer MaxIterations; + int verbose; + int mmax; + int nstep; + int steps; + int level; + GridStopWatch PrecTimer; + GridStopWatch MatTimer; + GridStopWatch LinalgTimer; + + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; + + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : + Tolerance(tol), + MaxIterations(maxit), + Linop(_Linop), + Preconditioner(Prec), + mmax(_mmax), + nstep(_nstep) + { + level=1; + verbose=1; + }; + + void operator() (const Field &src, Field &psi){ + + psi=Zero(); + RealD cp, ssq,rsq; + ssq=norm2(src); + rsq=Tolerance*Tolerance*ssq; + + Field r(src.Grid()); + + PrecTimer.Reset(); + MatTimer.Reset(); + LinalgTimer.Reset(); + + GridStopWatch SolverTimer; + SolverTimer.Start(); + + steps=0; + for(int k=0;k q(mmax,grid); + std::vector p(mmax,grid); + std::vector qq(mmax); + + GCRLogLevel<< "PGCR nStep("<(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. 
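// (annotation, not part of the patch) The loop below A-orthogonalises the new
// search direction against up to mmax stored previous ones, using q[i] = A p[i]:
//   b = -Re<q[back], A z> / <q[back], q[back]>,  p[kp] += b p[back],  q[kp] += b q[back].
// With the truncated history this is GCR(mmax), restarted every nstep steps,
// as in the Abe & Zhang VPGCR write-up cited in the file header.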
+ for(int back=0;back=0); + + b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; + p[peri_kp]=p[peri_kp]+b*p[peri_back]; + q[peri_kp]=q[peri_kp]+b*q[peri_back]; + + } + qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm + LinalgTimer.Stop(); + } + assert(0); // never reached + return cp; + } +}; +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 6c6dd7d8..ebb3162b 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -52,41 +52,79 @@ public: pointer allocate(size_type __n, const void* _p= 0) { size_type bytes = __n*sizeof(_Tp); - profilerAllocate(bytes); - _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); - assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); - return ptr; } void deallocate(pointer __p, size_type __n) { size_type bytes = __n * sizeof(_Tp); - profilerFree(bytes); - MemoryManager::CpuFree((void *)__p,bytes); } + // FIXME: hack for the copy constructor, eventually it must be avoided + //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; + void construct(pointer __p, const _Tp& __val) { assert(0);}; + void construct(pointer __p) { }; + void destroy(pointer __p) { }; +}; +template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } +template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } + +template +class uvmAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef uvmAllocator<_Tp1> other; }; + uvmAllocator() throw() { } + uvmAllocator(const uvmAllocator&) throw() { } + template uvmAllocator(const uvmAllocator<_Tp1>&) throw() { } + ~uvmAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::SharedFree((void *)__p,bytes); + } + // FIXME: hack for the copy constructor, eventually it must be avoided void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; //void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; -template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } -template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } +template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = alignedAllocator; -template using Vector = std::vector >; -template using commVector = std::vector >; -template using Matrix = std::vector > >; +template 
using commAllocator = uvmAllocator; +template using Vector = std::vector >; +template using commVector = std::vector >; +//template using Matrix = std::vector > >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index 6d638b60..e11ce948 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -7,13 +7,24 @@ NAMESPACE_BEGIN(Grid); #define CpuSmall (1) #define Acc (2) #define AccSmall (3) +#define Shared (4) +#define SharedSmall (5) +uint64_t total_shared; +uint64_t total_device; +uint64_t total_host;; +void MemoryManager::PrintBytes(void) +{ + std::cout << " MemoryManager : "<=0) && (Nc < NallocCacheMax)) { Ncache[Cpu]=Nc; Ncache[Acc]=Nc; + Ncache[Shared]=Nc; } } @@ -84,6 +133,7 @@ void MemoryManager::Init(void) if ( (Nc>=0) && (Nc < NallocCacheMax)) { Ncache[CpuSmall]=Nc; Ncache[AccSmall]=Nc; + Ncache[SharedSmall]=Nc; } } std::cout << GridLogMessage<< "MemoryManager::Init() setting up"< @@ -93,7 +92,9 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) ssum = ssum+sumarray[i]; } - return ssum; + typedef typename vobj::scalar_object ssobj; + ssobj ret = ssum; + return ret; } @@ -154,7 +155,7 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & const uint64_t sites = grid->oSites(); // Might make all code paths go this way. - typedef decltype(innerProduct(vobj(),vobj())) inner_t; + typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; @@ -163,16 +164,16 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & autoView( right_v,right, AcceleratorRead); // GPU - SIMT lane compliance... - accelerator_for( ss, sites, nsimd,{ - auto x_l = left_v(ss); - auto y_l = right_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); - }) + accelerator_for( ss, sites, 1,{ + auto x_l = left_v[ss]; + auto y_l = right_v[ss]; + inner_tmp_v[ss]=innerProductD(x_l,y_l); + }); } // This is in single precision and fails some tests - // Need a sumD that sums in double - nrm = TensorRemove(sumD(inner_tmp_v,sites)); + auto anrm = sum(inner_tmp_v,sites); + nrm = anrm; return nrm; } @@ -218,16 +219,16 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt autoView( y_v, y, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); - typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; + typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); + accelerator_for( ss, sites, 1,{ + auto tmp = a*x_v[ss]+b*y_v[ss]; + inner_tmp_v[ss]=innerProductD(tmp,tmp); + z_v[ss]=tmp; }); - nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); + nrm = real(TensorRemove(sum(inner_tmp_v,sites))); grid->GlobalSum(nrm); return nrm; } @@ -243,29 +244,28 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti GridBase *grid = left.Grid(); - const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); // GPU - typedef decltype(innerProduct(vobj(),vobj())) inner_t; - typedef decltype(innerProduct(vobj(),vobj())) norm_t; + typedef decltype(innerProductD(vobj(),vobj())) inner_t; + typedef decltype(innerProductD(vobj(),vobj())) norm_t; Vector inner_tmp(sites); - Vector norm_tmp(sites); + Vector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; { 
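// (annotation, not part of the patch) The extra braces scope the lattice views
// so they are released as soon as the site loop finishes, before the host-side
// sum over inner_tmp_v/norm_tmp_v and the GlobalSumVector call below.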
autoView(left_v,left, AcceleratorRead); autoView(right_v,right,AcceleratorRead); - accelerator_for( ss, sites, nsimd,{ - auto left_tmp = left_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); - coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); + accelerator_for( ss, sites, 1,{ + auto left_tmp = left_v[ss]; + inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]); + norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp); }); } - tmp[0] = TensorRemove(sumD(inner_tmp_v,sites)); - tmp[1] = TensorRemove(sumD(norm_tmp_v,sites)); + tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); + tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector ip = tmp[0]; diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index febb315e..14675b11 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -1,5 +1,13 @@ #pragma once +#define CPS_MD_TIME + +#ifdef CPS_MD_TIME +#define HMC_MOMENTUM_DENOMINATOR (2.0) +#else +#define HMC_MOMENTUM_DENOMINATOR (1.0) +#endif + NAMESPACE_BEGIN(Grid); template @@ -20,7 +28,9 @@ public: typedef Field PropagatorField; static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling gaussian(pRNG, P); + P *= scale; } static inline Field projectForce(Field& P){return P;} @@ -66,7 +76,7 @@ public: } static void FreePropagator(const Field &in, Field &out, - const Field &momKernel) + const Field &momKernel) { FFT fft((GridCartesian *)in.Grid()); Field inFT(in.Grid()); @@ -139,14 +149,17 @@ public: static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) { + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling #ifndef USE_FFT_ACCELERATION Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P); + #else Field Pgaussian(P.Grid()), Pp(P.Grid()); ComplexField p2(P.Grid()); p2 = zero; RealD M = FFT_MASS; - + + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian); FFT theFFT((GridCartesian*)P.Grid()); @@ -156,17 +169,17 @@ public: p2 = sqrt(p2); Pp *= p2; theFFT.FFT_all_dim(P, Pp, FFT::backward); - #endif //USE_FFT_ACCELERATION + P *= scale; } - static inline Field projectForce(Field& P) {return P;} + static inline Field projectForce(Field& P) {return Ta(P);} static inline void update_field(Field &P, Field &U, double ep) { #ifndef USE_FFT_ACCELERATION double t0=usecond(); - U += P*ep; + U += P*ep; double t1=usecond(); double total_time = (t1-t0)/1e6; std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl; diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index cfb19d69..b268b684 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -7,6 +7,7 @@ Copyright (C) 2019 Author: Felix Erben + Author: Raoul Hodgson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -58,9 +59,12 @@ public: const Gamma GammaA_right, const Gamma GammaB_right, const int parity, - const int * wick_contractions, + const bool * wick_contractions, robj &result); public: + static void Wick_Contractions(std::string qi, + std::string qf, + bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, @@ -68,8 +72,7 @@ public: const Gamma GammaB_left, const Gamma 
GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, ComplexField &baryon_corr); template @@ -80,10 +83,59 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, + const int nt, robj &result); + private: + template + static void Baryon_Gamma_3pt_Group1_Site( + const mobj &Dq1_ti, + const mobj2 &Dq2_spec, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + + template + static void Baryon_Gamma_3pt_Group2_Site( + const mobj2 &Dq1_spec, + const mobj &Dq2_ti, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + + template + static void Baryon_Gamma_3pt_Group3_Site( + const mobj2 &Dq1_spec, + const mobj2 &Dq2_spec, + const mobj &Dq3_ti, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + public: + template + static void Baryon_Gamma_3pt( + const PropagatorField &q_ti, + const mobj &Dq_spec1, + const mobj &Dq_spec2, + const PropagatorField &q_tf, + int group, + int wick_contraction, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + SpinMatrixField &stn_corr); private: template static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, @@ -166,111 +218,137 @@ const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; template template void BaryonUtils::baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const int parity, + const bool * wick_contraction, + robj &result) { Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (Real)parity * gD1b); - auto gD3 = GammaB_right * D3; - auto D2g = D2 * GammaB_left; - auto pD1g = pD1 * GammaB_left; - auto gD3g = gD3 * GammaB_left; + + auto D1_GAi = D1 * GammaA_i; + auto D1_GAi_g4 = D1_GAi * g4; + auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); + auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; + auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P; - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - Real ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = 
epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - for (int gamma_left=0; gamma_left +void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { + const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; + for (int ie=0; ie < 6 ; ie++) { + wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 + && qi[0] == qf[epsilon[ie][0]] + && qi[1] == qf[epsilon[ie][1]] + && qi[2] == qf[epsilon[ie][2]]); } } +/* The array wick_contractions must be of length 6. The order * + * corresponds to the to that shown in the Hadrons documentation * + * at https://aportelli.github.io/Hadrons-doc/#/mcontraction * + * This can be computed from the quark flavours using the * + * Wick_Contractions function above */ template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -279,8 +357,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, ComplexField &baryon_corr) { @@ -288,7 +365,6 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; @@ -297,11 +373,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); GridBase *grid = q1_left.Grid(); - - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - + autoView(vbaryon_corr, baryon_corr,CpuWrite); autoView( v1 , q1_left, CpuRead); autoView( v2 , q2_left, CpuRead); @@ -311,10 +383,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); for (int ie=0; ie < 6 ; ie++){ if(ie==0 or ie==3){ - bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contraction[ie]; + bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; } else{ - bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contraction[ie]; + bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; } } Real t=0.; @@ -325,7 +397,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites @@ -334,6 +406,12 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } + +/* The array wick_contractions must be of length 6. The order * + * corresponds to the to that shown in the Hadrons documentation * + * at https://aportelli.github.io/Hadrons-doc/#/mcontraction * + * This can also be computed from the quark flavours using the * + * Wick_Contractions function above */ template template void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, @@ -343,16 +421,15 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, + const int nt, robj &result) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; @@ -360,17 +437,347 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - - result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + for (int t=0; t +template +void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( + const mobj &Dq1_ti, + const mobj2 &Dq2_spec, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; + auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + auto Gf_D3 = GammaBf * Dq3_spec; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( + const mobj2 &Dq1_spec, + const mobj &Dq2_ti, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; + auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; + auto Gf_D1 = GammaBf * Dq1_spec; + auto Gf_D3 = GammaBf * Dq3_spec; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( + const mobj2 &Dq1_spec, + const mobj2 &Dq2_spec, + const mobj &Dq3_ti, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; + auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; + auto Gf_D1 = GammaBf * Dq1_spec; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt( + const PropagatorField &q_ti, + const mobj &Dq_spec1, + const mobj &Dq_spec2, + const PropagatorField &q_tf, + int group, + int wick_contraction, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + SpinMatrixField &stn_corr) +{ + GridBase *grid = q_tf.Grid(); + + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_ti , q_ti, CpuRead); + autoView( vq_tf , q_tf, CpuRead); + + if (group == 1) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + 
Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + vcorr[ss] += result; + });//end loop over lattice sites + } else if (group == 2) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + vcorr[ss] += result; + });//end loop over lattice sites + } else if (group == 3) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + + vcorr[ss] += result; + });//end loop over lattice sites + } +} + + +/*********************************************************************** + * End of BaryonGamma3pt-function code. * + * * * The following code is for Sigma -> N rare hypeon decays * **********************************************************************/ diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index dbcbae8d..36becc49 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -59,6 +59,20 @@ class GridTensorBase {}; using DoublePrecision2= typename Traits::DoublePrecision2; \ static constexpr int TensorLevel = Traits::TensorLevel +/////////////////////////////////////////////////////////// +// Allows to turn scalar>>> back to double. +/////////////////////////////////////////////////////////// +template +accelerator_inline typename std::enable_if::value, T>::type +TensorRemove(T arg) { + return arg; +} +template +accelerator_inline auto TensorRemove(iScalar arg) + -> decltype(TensorRemove(arg._internal)) { + return TensorRemove(arg._internal); +} + template class iScalar { public: @@ -135,9 +149,10 @@ public: operator ComplexD() const { return (TensorRemove(_internal)); } + // instantiation of "Grid::iScalar::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, =0, =0U]" template = 0,IfNotSimd = 0> accelerator_inline operator RealD() const { - return TensorRemove(_internal); + return (RealD) TensorRemove(_internal); } template = 0, IfNotSimd = 0> accelerator_inline operator Integer() const { @@ -169,20 +184,6 @@ public: strong_inline scalar_type * end() { return begin() + Traits::count; } }; -/////////////////////////////////////////////////////////// -// Allows to turn scalar>>> back to double. 
-/////////////////////////////////////////////////////////// -template -accelerator_inline typename std::enable_if::value, T>::type -TensorRemove(T arg) { - return arg; -} -template -accelerator_inline auto TensorRemove(iScalar arg) - -> decltype(TensorRemove(arg._internal)) { - return TensorRemove(arg._internal); -} - template class iVector { public: diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1cb6d637..74a3ea22 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -28,6 +28,8 @@ Author: paboyle /* END LEGAL */ #pragma once +#include + #ifdef HAVE_MALLOC_MALLOC_H #include #endif @@ -334,12 +336,11 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// +#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT -#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned - #define accelerator #define accelerator_inline strong_inline #define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); @@ -365,6 +366,14 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr);}; #endif // CPU target +#ifdef HAVE_MM_MALLOC_H +inline void *acceleratorAllocCpu(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; +inline void acceleratorFreeCpu (void *ptr){_mm_free(ptr);}; +#else +inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; +inline void acceleratorFreeCpu (void *ptr){free(ptr);}; +#endif + /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index e93f3046..656e29a9 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -318,6 +318,11 @@ void Grid_init(int *argc,char ***argv) Grid_debug_handler_init(); } + ////////////////////////////////////////////////////////// + // Memory manager + ////////////////////////////////////////////////////////// + MemoryManager::Init(); + ////////////////////////////////////////////////////////// // MPI initialisation ////////////////////////////////////////////////////////// @@ -357,11 +362,6 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "================================================ "< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + 
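// (annotation, not part of the patch) A minimal sketch of wiring the
// PVdagMLinearOperator above into the non-Hermitian PGCR added by this patch.
// TrivialPrecon (an identity preconditioner) and the Ddwf/Dpv/src/sol names are
// illustrative assumptions, not part of the patch:
//
//   PVdagMLinearOperator<DomainWallFermionR,LatticeFermion> PVdagM(Ddwf,Dpv);
//   TrivialPrecon<LatticeFermion> Simple;
//   PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion>
//       PGCR(1.0e-8, 100, PVdagM, Simple, /*mmax*/ 16, /*nstep*/ 16);
//   PGCR(src, sol);   // solves (Mpv^dag M) sol = src; the solver zeroes sol itself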
+#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, + FineOperator &Fine, + FineSmoother &Smoother, + Guesser &Guess_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _CoarseOperator(Coarse), + _FineOperator(Fine), + _Smoother(Smoother), + _Guess(Guess_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + CoarseVector Csrc(_CoarseOperator.Grid()); + CoarseVector Csol(_CoarseOperator.Grid()); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + 
Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,500,100,100,0.0); + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOpPV(Dpv); + + std::cout< CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + // GeneralisedMinimalResidual CoarseGMRES(tol,MaxIt,20); + + BiCGSTAB FineBiCGSTAB(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + // GeneralisedMinimalResidual FineGMRES(tol,MaxIt,20); + + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + PVdagMLinearOperator FinePVdagM(Ddwf,Dpv);// M_{pv}^\dag M + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + SchurDiagOneOperator FineDiagOne(Ddwf); // 1 - M_ee^{-1} Meo Moo^{-1} Moe e + + MdagMLinearOperator CoarseMdagM(LDOp); + PVdagMLinearOperator CoarsePVdagM(LDOp,LDOpPV); + + std::cout< IRLCheby(0.03,12.0,71); // 1 iter + FunctionHermOp IRLOpCheby(IRLCheby,CoarseMdagM); + PlainHermOp IRLOp (CoarseMdagM); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + + std::cout< DeflCoarseGuesser(evec,eval); + NormalEquations DeflCoarseCGNE (LDOp,CoarseCG,DeflCoarseGuesser); + c_res=Zero(); + DeflCoarseCGNE(c_src,c_res); + + + std::cout< CoarseMgridCG(0.001,1000); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + + typedef HDCRPreconditioner > TwoLevelHDCR; + TwoLevelHDCR TwoLevelPrecon(Aggregates, + HermIndefOp, + FineSmoother, + DeflCoarseCGNE); + TwoLevelPrecon.Level(1); + // PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + PrecGeneralisedConjugateResidualNonHermitian l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + + f_res=Zero(); + + CoarseCG.Tolerance=0.02; + l1PGCR(f_src,f_res); + + std::cout< CoarseMgridBiCGSTAB(0.01,1000); + BiCGSTAB FineMgridBiCGSTAB(0.0,24); + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser FineZeroGuesser; + + SolverWrapper FineBiCGSmoother( FinePVdagM, FineMgridBiCGSTAB, FineZeroGuesser); + SolverWrapper CoarsePVdagMSolver(CoarsePVdagM,CoarseMgridBiCGSTAB,CoarseZeroGuesser); + typedef HDCRPreconditioner > TwoLevelMG; + + TwoLevelMG _TwoLevelMG(Aggregates, + FinePVdagM, + FineBiCGSmoother, + CoarsePVdagMSolver); + _TwoLevelMG.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian pvPGCR(1.0e-8,100,FinePVdagM,_TwoLevelMG,16,16); + pvPGCR.Level(1); + + f_res=Zero(); + pvPGCR(f_src,f_res); + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +//#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; 
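// (annotation, not part of the patch) Design note on the two smoothers above:
// ChebyshevSmoother applies a fixed-order polynomial approximation to 1/x in
// MdagM (after one AdjOp), so each call costs a fixed number of operator
// applications; MirsSmoother instead runs a shifted CG, (MdagM + shift) out =
// Mdag in, to a loose tolerance. A sketch of constructing each, assuming a
// DomainWallFermionR action Ddwf and a Hermitian-indefinite wrapper
// HermIndefOp as in the two-level test above:
//
//   ChebyshevSmoother<DomainWallFermionR,LatticeFermion> ChebySmooth(0.5,60.0,10,HermIndefOp,Ddwf);
//   MirsSmoother<DomainWallFermionR,LatticeFermion>      CGSmooth(0.5,1.0e-2,20,HermIndefOp,Ddwf);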
+ +#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + const int nbasis= 8; + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< SubspaceOp(Dw); + + Subspace Aggregates4D(Coarse4d,UGrid,0); + Subspace Aggregates5D(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + std::cout< Level1Op; + + NonHermitianLinearOperator LinOpDwf(Ddwf); + + Level1Op LDOp (*Coarse5d,0); + + std::cout< CoarseMdagM(LDOp); + BiCGSTAB CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + + c_res=Zero(); + CoarseCG(CoarseMdagM,c_src,c_res); + + std::cout<
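// (annotation, not part of the patch) Usage sketch for the new Wick-contraction
// interface in the BaryonUtils.h hunk above: the old char* quarks_left /
// quarks_right arguments are replaced by a bool[6] mask that Wick_Contractions
// computes from the initial- and final-state flavour strings. The FImpl,
// flavour strings and field names here are illustrative assumptions:
//
//   bool wick[6];
//   BaryonUtils<WilsonImplR>::Wick_Contractions("udu","udu",wick);
//   BaryonUtils<WilsonImplR>::ContractBaryons(q1,q2,q3,
//                                             GammaA,GammaB,GammaA,GammaB,
//                                             wick, /*parity*/ +1, corr);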