From 3f1c4d878920ed0bd55ccf7d538db9df9de008f8 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Thu, 2 May 2019 10:24:36 +0100 Subject: [PATCH 01/18] fix comment hash --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index cee071d5..5b9e31dc 100644 --- a/configure.ac +++ b/configure.ac @@ -474,7 +474,7 @@ esac case ${ac_COMMS} in *-auto) LX_FIND_MPI -# if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi +## if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS" AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS" AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS" From b52fa38f8ce23e0ffcb566207473175c8f0b425a Mon Sep 17 00:00:00 2001 From: gfilaci Date: Tue, 12 Mar 2019 18:07:58 +0000 Subject: [PATCH 02/18] seed initialisation of RNG5 --- benchmarks/Benchmark_mooee.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index dfaaae30..f4c2b998 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -76,7 +76,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "< Date: Thu, 2 May 2019 10:53:37 +0100 Subject: [PATCH 03/18] allocator copy constructor (to be fixed) --- Grid/allocator/AlignedAllocator.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index ed1fbec2..644fc3c7 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -220,7 +220,10 @@ public: #endif #endif } 
- void construct(pointer __p, const _Tp& __val) { }; + + // FIXME: hack for the copy constructor, eventually it must be avoided + void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; + //void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; From 44e0360b97130ce08caf8a690145313a22df3a64 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 18 Mar 2019 15:49:29 +0000 Subject: [PATCH 04/18] replace std::vector with Vector --- Grid/qcd/action/fermion/CayleyFermion5D.cc | 48 +++++++++--------- Grid/qcd/action/fermion/CayleyFermion5D.h | 50 +++++++++---------- .../action/fermion/CayleyFermion5Dcache.cc | 12 ++--- Grid/qcd/action/fermion/CayleyFermion5Dssp.cc | 12 ++--- Grid/qcd/action/fermion/CayleyFermion5Dvec.cc | 12 ++--- .../fermion/ContinuedFractionFermion5D.h | 12 ++--- .../action/fermion/DomainWallEOFAFermion.cc | 26 +++++----- .../action/fermion/DomainWallEOFAFermion.h | 10 ++-- .../fermion/DomainWallEOFAFermioncache.cc | 14 +++--- .../fermion/DomainWallEOFAFermionssp.cc | 4 +- .../fermion/DomainWallEOFAFermionvec.cc | 4 +- Grid/qcd/action/fermion/MobiusEOFAFermion.cc | 32 ++++++------ Grid/qcd/action/fermion/MobiusEOFAFermion.h | 30 +++++------ .../action/fermion/MobiusEOFAFermioncache.cc | 12 ++--- .../action/fermion/MobiusEOFAFermionssp.cc | 12 ++--- .../action/fermion/MobiusEOFAFermionvec.cc | 12 ++--- .../action/fermion/PartialFractionFermion5D.h | 4 +- Grid/qcd/action/fermion/SchurDiagTwoKappa.h | 2 +- Grid/qcd/action/fermion/ZMobiusFermion.h | 4 +- 19 files changed, 156 insertions(+), 156 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.cc b/Grid/qcd/action/fermion/CayleyFermion5D.cc index 4a6c4c91..ad732825 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc @@ -197,18 +197,18 @@ template void CayleyFermion5D::M5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag 
(Ls,1.0); - std::vector upper(Ls,-1.0); upper[Ls-1]=mass; - std::vector lower(Ls,-1.0); lower[0] =mass; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; M5D(psi,chi,chi,lower,diag,upper); } template void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - std::vector diag = bs; - std::vector upper= cs; - std::vector lower= cs; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; upper[Ls-1]=-mass*upper[Ls-1]; lower[0] =-mass*lower[0]; M5D(psi,psi,Din,lower,diag,upper); @@ -217,9 +217,9 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag = beo; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = beo; + Vector upper(Ls); + Vector lower(Ls); for(int i=0;i void CayleyFermion5D::Mooee (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag = bee; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = bee; + Vector upper(Ls); + Vector lower(Ls); for(int i=0;i void CayleyFermion5D::MooeeDag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag = bee; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = bee; + Vector upper(Ls); + Vector lower(Ls); for (int s=0;s void CayleyFermion5D::M5Ddag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag(Ls,1.0); - std::vector upper(Ls,-1.0); - std::vector lower(Ls,-1.0); + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); upper[Ls-1]=-mass*upper[Ls-1]; lower[0] =-mass*lower[0]; M5Ddag(psi,chi,chi,lower,diag,upper); @@ -289,9 +289,9 @@ template void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - std::vector diag =bs; - std::vector upper=cs; - std::vector lower=cs; + Vector diag =bs; + Vector upper=cs; + 
Vector lower=cs; for (int s=0;s::MeoDeriv(GaugeField &mat,const FermionField &U,const template void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) { - std::vector gamma(this->Ls); + Vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(1.0,gamma,b,c); } @@ -436,13 +436,13 @@ void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re template void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) { - std::vector gamma(this->Ls); + Vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(zolo_hi,gamma,b,c); } //Zolo template -void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c) +void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) { int Ls=this->Ls; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index 6dbc630e..e4587308 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -108,16 +108,16 @@ public: void M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper); + Vector &lower, + Vector &diag, + Vector &upper); void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper); + Vector &lower, + Vector &diag, + Vector &upper); void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); void MooeeInternalCompute(int dag, int inv, Vector > & Matp, Vector > & Matm); @@ -149,29 +149,29 @@ public: RealD mass; // Save arguments to SetCoefficientsInternal - std::vector _gamma; + Vector _gamma; RealD _zolo_hi; RealD _b; RealD _c; // Cayley form Moebius (tanh and zolotarev) - std::vector omega; - std::vector bs; // S dependent 
coeffs - std::vector cs; - std::vector as; + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; // For preconditioning Cayley form - std::vector bee; - std::vector cee; - std::vector aee; - std::vector beo; - std::vector ceo; - std::vector aeo; + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; // LDU factorisation of the eeoo matrix - std::vector lee; - std::vector leem; - std::vector uee; - std::vector ueem; - std::vector dee; + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; // Matrices of 5d ee inverse params Vector > MatpInv; @@ -203,16 +203,16 @@ public: protected: virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); - virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c); + virtual void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c); }; NAMESPACE_END(Grid); #define INSTANTIATE_DPERP(A) \ template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi, \ - std::vector &lower,std::vector &diag,std::vector &upper); \ + Vector &lower,Vector &diag,Vector &upper); \ template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi, \ - std::vector &lower,std::vector &diag,std::vector &upper); \ + Vector &lower,Vector &diag,Vector &upper); \ template void CayleyFermion5D< A >::MooeeInv (const FermionField &psi, FermionField &chi); \ template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi); diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc index 560f5dcb..8964582c 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -41,9 +41,9 @@ template void 
CayleyFermion5D::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, - std::vector &diag, - std::vector &upper) + Vector &lower, + Vector &diag, + Vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); @@ -87,9 +87,9 @@ template void CayleyFermion5D::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, - std::vector &diag, - std::vector &upper) + Vector &lower, + Vector &diag, + Vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc b/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc index 80cc4688..650c391c 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc @@ -41,9 +41,9 @@ template void CayleyFermion5D::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + Vector &lower, + Vector &diag, + Vector &upper) { Coeff_t one(1.0); int Ls=this->Ls; @@ -64,9 +64,9 @@ template void CayleyFermion5D::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + Vector &lower, + Vector &diag, + Vector &upper) { Coeff_t one(1.0); int Ls=this->Ls; diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc index 95bd31bd..5482c384 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -54,9 +54,9 @@ template void CayleyFermion5D::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, - std::vector &diag, - std::vector &upper) + Vector &lower, + Vector &diag, + Vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); @@ -200,9 +200,9 @@ template void 
CayleyFermion5D::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, - std::vector &diag, - std::vector &upper) + Vector &lower, + Vector &diag, + Vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 0e0c1d75..379c5f8f 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -89,12 +89,12 @@ protected: RealD mass; RealD R; RealD ZoloHiInv; - std::vector Beta; - std::vector cc;; - std::vector cc_d;; - std::vector sqrt_cc; - std::vector See; - std::vector Aee; + Vector Beta; + Vector cc;; + Vector cc_d;; + Vector sqrt_cc; + Vector See; + Vector Aee; }; diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc index 6aa848cc..89de7315 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc @@ -131,9 +131,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, FermionField& chi else{ shiftm = -shift*(mq3-mq2); } } - std::vector diag(Ls,1.0); - std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; - std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; + Vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; #if(0) std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; @@ -168,9 +168,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, FermionField& else{ shiftm = -shift*(mq3-mq2); } } - std::vector diag(Ls,1.0); - std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; - std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; + Vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; #if(0) 
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl; @@ -194,9 +194,9 @@ void DomainWallEOFAFermion::Mooee(const FermionField& psi, FermionField& c { int Ls = this->Ls; - std::vector diag = this->bee; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = this->bee; + Vector upper(Ls); + Vector lower(Ls); for(int s=0; scee[s]; @@ -213,9 +213,9 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField { int Ls = this->Ls; - std::vector diag = this->bee; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = this->bee; + Vector upper(Ls); + Vector lower(Ls); for(int s=0; scee[s]; @@ -231,7 +231,7 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField //Zolo template -void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c) +void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c) { int Ls = this->Ls; int pm = this->pm; diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h index a3344ec4..eab56346 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h @@ -70,10 +70,10 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - std::vector& lower, std::vector& diag, std::vector& upper); + Vector& lower, Vector& diag, Vector& upper); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - std::vector& lower, std::vector& diag, std::vector& upper); + Vector& lower, Vector& diag, Vector& upper); void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv); @@ -94,16 +94,16 @@ public: RealD _M5, const ImplParams& p=ImplParams()); protected: - void SetCoefficientsInternal(RealD zolo_hi, 
std::vector& gamma, RealD b, RealD c); + void SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c); }; NAMESPACE_END(Grid); #define INSTANTIATE_DPERP_DWF_EOFA(A) \ template void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \ - std::vector& lower, std::vector& diag, std::vector& upper); \ + Vector& lower, Vector& diag, Vector& upper); \ template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \ - std::vector& lower, std::vector& diag, std::vector& upper); \ + Vector& lower, Vector& diag, Vector& upper); \ template void DomainWallEOFAFermion::MooeeInv(const FermionField& psi, FermionField& chi); \ template void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi, FermionField& chi); diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc index 58aee4ff..74e9a62f 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc @@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid); // Pplus backwards.. 
template void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper) + Vector& lower, Vector& diag, Vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; @@ -81,7 +81,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper) + Vector& lower, Vector& diag, Vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -180,11 +180,11 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion assert(psi.Checkerboard() == psi.Checkerboard()); - std::vector ueec(Ls); - std::vector deec(Ls+1); - std::vector leec(Ls); - std::vector ueemc(Ls); - std::vector leemc(Ls); + Vector ueec(Ls); + Vector deec(Ls+1); + Vector leec(Ls); + Vector ueemc(Ls); + Vector leemc(Ls); for(int s=0; suee[s]); diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc index 1825d07e..c9e638e5 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc @@ -40,7 +40,7 @@ NAMESPACE_BEGIN(Grid); // Pplus backwards template void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) + FermionField& chi, Vector& lower, Vector& diag, Vector& upper) { Coeff_t one(1.0); int Ls = this->Ls; @@ -60,7 +60,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionFiel template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) + FermionField& chi, Vector& lower, Vector& diag, 
Vector& upper) { Coeff_t one(1.0); int Ls = this->Ls; diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc index 43fa16ec..5ad8be27 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc @@ -53,7 +53,7 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi, FermionField template void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper) + Vector& lower, Vector& diag, Vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -201,7 +201,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper) + Vector& lower, Vector& diag, Vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc index d368e1da..86ce3e56 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermion.cc +++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc @@ -197,9 +197,9 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - std::vector diag(Ls,1.0); - std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - std::vector lower(Ls,-1.0); lower[0] = this->mq1; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + Vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } @@ -213,9 +213,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - std::vector diag(Ls,1.0); - std::vector upper(Ls,-1.0); upper[Ls-1] = 
this->mq1; - std::vector lower(Ls,-1.0); lower[0] = this->mq1; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + Vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -231,9 +231,9 @@ void MobiusEOFAFermion::Mooee(const FermionField& psi, FermionField& chi) int Ls = this->Ls; // coefficients of Mooee - std::vector diag = this->bee; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = this->bee; + Vector upper(Ls); + Vector lower(Ls); for(int s=0; scee[s]; lower[s] = -this->cee[s]; @@ -254,9 +254,9 @@ void MobiusEOFAFermion::MooeeDag(const FermionField& psi, FermionField& ch int Ls = this->Ls; // coefficients of MooeeDag - std::vector diag = this->bee; - std::vector upper(Ls); - std::vector lower(Ls); + Vector diag = this->bee; + Vector upper(Ls); + Vector lower(Ls); for(int s=0; scee[s+1]; @@ -315,10 +315,10 @@ void MobiusEOFAFermion::SetCoefficientsPrecondShiftOps() // Tridiagonal solve for MooeeInvDag_shift_lc { Coeff_t m(0.0); - std::vector d = Mooee_shift; - std::vector u(Ls,0.0); - std::vector y(Ls,0.0); - std::vector q(Ls,0.0); + Vector d = Mooee_shift; + Vector u(Ls,0.0); + Vector y(Ls,0.0); + Vector q(Ls,0.0); if(pm == 1){ u[0] = 1.0; } else{ u[Ls-1] = 1.0; } diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h index e30bfb47..e7e4df39 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h +++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h @@ -42,11 +42,11 @@ public: public: // Shift operator coefficients for red-black preconditioned Mobius EOFA - std::vector Mooee_shift; - std::vector MooeeInv_shift_lc; - std::vector MooeeInv_shift_norm; - std::vector MooeeInvDag_shift_lc; - std::vector MooeeInvDag_shift_norm; + Vector Mooee_shift; + Vector MooeeInv_shift_lc; + Vector MooeeInv_shift_norm; + Vector MooeeInvDag_shift_lc; + Vector MooeeInvDag_shift_norm; virtual void Instantiatable(void) 
{}; @@ -74,18 +74,18 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - std::vector& lower, std::vector& diag, std::vector& upper); + Vector& lower, Vector& diag, Vector& upper); void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs); + Vector& lower, Vector& diag, Vector& upper, + Vector& shift_coeffs); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - std::vector& lower, std::vector& diag, std::vector& upper); + Vector& lower, Vector& diag, Vector& upper); void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs); + Vector& lower, Vector& diag, Vector& upper, + Vector& shift_coeffs); void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv); @@ -113,13 +113,13 @@ NAMESPACE_END(Grid); #define INSTANTIATE_DPERP_MOBIUS_EOFA(A) \ template void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \ - std::vector& lower, std::vector& diag, std::vector& upper); \ + Vector& lower, Vector& diag, Vector& upper); \ template void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \ - std::vector& lower, std::vector& diag, std::vector& upper, std::vector& shift_coeffs); \ + Vector& lower, Vector& diag, Vector& upper, Vector& shift_coeffs); \ template void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \ - std::vector& lower, std::vector& diag, std::vector& upper); \ + Vector& lower, Vector& diag, Vector& upper); \ template void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const FermionField& phi, 
FermionField& chi, \ - std::vector& lower, std::vector& diag, std::vector& upper, std::vector& shift_coeffs); \ + Vector& lower, Vector& diag, Vector& upper, Vector& shift_coeffs); \ template void MobiusEOFAFermion::MooeeInv(const FermionField& psi, FermionField& chi); \ template void MobiusEOFAFermion::MooeeInv_shift(const FermionField& psi, FermionField& chi); \ template void MobiusEOFAFermion::MooeeInvDag(const FermionField& psi, FermionField& chi); \ diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc b/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc index dc865b4f..8d0b0524 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc +++ b/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc @@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid); template void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, std::vector &diag, std::vector &upper) + Vector &lower, Vector &diag, Vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -79,8 +79,8 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField template void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, std::vector &diag, std::vector &upper, - std::vector &shift_coeffs) + Vector &lower, Vector &diag, Vector &upper, + Vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -127,7 +127,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion template void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, std::vector &diag, std::vector &upper) + Vector &lower, Vector &diag, Vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -169,8 +169,8 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie 
template void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - std::vector &lower, std::vector &diag, std::vector &upper, - std::vector &shift_coeffs) + Vector &lower, Vector &diag, Vector &upper, + Vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc b/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc index 937b5019..254cdb54 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc +++ b/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc @@ -40,7 +40,7 @@ NAMESPACE_BEGIN(Grid); // Pplus backwards template void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) + FermionField& chi, Vector& lower, Vector& diag, Vector& upper) { Coeff_t one(1.0); int Ls = this->Ls; @@ -60,8 +60,8 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& p template void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs) + FermionField& chi, Vector& lower, Vector& diag, Vector& upper, + Vector& shift_coeffs) { Coeff_t one(1.0); int Ls = this->Ls; @@ -83,7 +83,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionFi template void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) + FermionField& chi, Vector& lower, Vector& diag, Vector& upper) { Coeff_t one(1.0); int Ls = this->Ls; @@ -103,8 +103,8 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField template void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& 
upper, - std::vector& shift_coeffs) + FermionField& chi, Vector& lower, Vector& diag, Vector& upper, + Vector& shift_coeffs) { Coeff_t one(1.0); int Ls = this->Ls; diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc index ff8c5816..1cd99ab5 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc +++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc @@ -64,7 +64,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField& psi, Fermion template void MobiusEOFAFermion::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper) + Vector& lower, Vector& diag, Vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -211,8 +211,8 @@ void MobiusEOFAFermion::M5D(const FermionField& psi_i, const FermionField& template void MobiusEOFAFermion::M5D_shift(const FermionField& psi_i, const FermionField& phi_i, - FermionField& chi_i, std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs) + FermionField& chi_i, Vector& lower, Vector& diag, Vector& upper, + Vector& shift_coeffs) { #if 0 auto & psi = psi_i; @@ -397,7 +397,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField& psi_i, const Fermion template void MobiusEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper) + Vector& lower, Vector& diag, Vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -542,8 +542,8 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionFie template void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, - std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs) + Vector& lower, Vector& diag, Vector& upper, + Vector& shift_coeffs) { 
#if 0 auto & psi = psi_i; diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index 7a3de997..d61515f0 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -93,8 +93,8 @@ protected: RealD R; RealD amax; RealD scale; - std::vector p; - std::vector q; + Vector p; + Vector q; }; diff --git a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h index c6e5470b..b6ab8a55 100644 --- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h +++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h @@ -36,7 +36,7 @@ template class KappaSimilarityTransform { public: INHERIT_IMPL_TYPES(Matrix); - std::vector kappa, kappaDag, kappaInv, kappaInvDag; + Vector kappa, kappaDag, kappaInv, kappaInvDag; KappaSimilarityTransform (Matrix &zmob) { for (int i=0;i<(int)zmob.bs.size();i++) { diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h index 4c2390d6..bdea3271 100644 --- a/Grid/qcd/action/fermion/ZMobiusFermion.h +++ b/Grid/qcd/action/fermion/ZMobiusFermion.h @@ -48,7 +48,7 @@ public: GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD _M5, - std::vector &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : + Vector &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : CayleyFermion5D(_Umu, FiveDimGrid, @@ -59,7 +59,7 @@ public: { // RealD eps = 1.0; std::cout< zgamma(this->Ls); + Vector zgamma(this->Ls); for(int s=0;sLs;s++){ zgamma[s] = gamma[s]; } From 6da9aa99714e808ed77d275977f25db645466cb8 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 18 Mar 2019 18:25:46 +0000 Subject: [PATCH 05/18] replace std::vector with Vector in benchmark --- benchmarks/Benchmark_mooee.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index f4c2b998..77cc35b5 100644 --- 
a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -184,7 +184,7 @@ int main (int argc, char ** argv) RealD b=1.5;// Scale factor b+c=2, b-c=1 RealD c=0.5; - std::vector gamma(Ls,std::complex(1.0,0.0)); + Vector gamma(Ls,std::complex(1.0,0.0)); ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c); std::cout< Date: Fri, 29 Mar 2019 16:43:31 +0000 Subject: [PATCH 06/18] Fix gpu MultRealPart and MaddRealPart bug --- Grid/simd/Grid_gpu.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Grid/simd/Grid_gpu.h b/Grid/simd/Grid_gpu.h index 2f7d47ec..6dc5123a 100644 --- a/Grid/simd/Grid_gpu.h +++ b/Grid/simd/Grid_gpu.h @@ -245,18 +245,18 @@ namespace Optimization { struct MultRealPart{ accelerator_inline float4 operator()(float4 a, float4 b){ float4 ymm0; - ymm0.x = a.y; - ymm0.y = a.y; - ymm0.z = a.w; - ymm0.w = a.w; + ymm0.x = a.x; + ymm0.y = a.x; + ymm0.z = a.z; + ymm0.w = a.z; return ymm0*b; // ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, // return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br } accelerator_inline double2 operator()(double2 a, double2 b){ double2 ymm0; - ymm0.x = a.y; - ymm0.y = a.y; + ymm0.x = a.x; + ymm0.y = a.x; return ymm0*b; // ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 // return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br @@ -265,17 +265,17 @@ namespace Optimization { struct MaddRealPart{ accelerator_inline float4 operator()(float4 a, float4 b, float4 c){ float4 ymm0; // = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - ymm0.x = a.y; - ymm0.y = a.y; - ymm0.z = a.w; - ymm0.w = a.w; + ymm0.x = a.x; + ymm0.y = a.x; + ymm0.z = a.z; + ymm0.w = a.z; return c+ymm0*b; } accelerator_inline double2 operator()(double2 a, double2 b, double2 c){ // ymm0 = _mm_shuffle_pd( a, a, 0x0 ); double2 ymm0; - ymm0.x = a.y; - ymm0.y = a.y; + ymm0.x = a.x; + ymm0.y = a.x; return c+ymm0*b; } }; From 
d3b5c02e2d798c672eaa4cf54f41a900e1a9dc66 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Thu, 2 May 2019 11:02:39 +0100 Subject: [PATCH 07/18] measure M5D bandwidth and fix M5D flop count --- Grid/qcd/action/fermion/CayleyFermion5D.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.cc b/Grid/qcd/action/fermion/CayleyFermion5D.cc index ad732825..ecb87b2f 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc @@ -163,10 +163,16 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl; - // Flops = 6.0*(Nc*Ns) *Ls*vol - RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting + // Flops = 10.0*(Nc*Ns) *Ls*vol + RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + + // Bytes = sizeof(RealD) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting) + // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 ) + // write = 1 + RealD Gbytes = sizeof(RealD) * (Nc*Ns*2) * volume * 3 /2. 
* 1.e-9; + std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl; } if ( MooeeInvCalls > 0 ) { From b23305dbe236555dd7937057cd8ffccf2e51643c Mon Sep 17 00:00:00 2001 From: gfilaci Date: Thu, 2 May 2019 11:08:21 +0100 Subject: [PATCH 08/18] fix M5D flop count --- Grid/qcd/action/fermion/CayleyFermion5Dcache.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc index 8964582c..c84b7f8d 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -52,7 +52,8 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, auto chi = chi_i.View(); int Ls =this->Ls; assert(phi.Checkerboard() == psi.Checkerboard()); - // Flops = 6.0*(Nc*Ns) *Ls*vol + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) M5Dcalls++; M5Dtime-=usecond(); From d9438627d935401d3a7baaadad0a0e17f0897df4 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Thu, 2 May 2019 11:10:57 +0100 Subject: [PATCH 09/18] M5D benchmark without vector copy overhead --- benchmarks/Benchmark_mooee.cc | 38 +++++++++++++++-------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index 77cc35b5..6233d333 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -82,7 +82,14 @@ int main (int argc, char ** argv) DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); double t0,t1; - + + typedef typename DomainWallFermionR::Coeff_t Coeff_t; + Vector diag = Dw.bs; + Vector upper= Dw.cs; + Vector lower= Dw.cs; + upper[Ls-1]=-Dw.mass*upper[Ls-1]; + lower[0] =-Dw.mass*lower[0]; + LatticeFermion r_eo(FGrid); LatticeFermion src_e (FrbGrid); LatticeFermion src_o (FrbGrid); @@ -99,13 +106,13 @@ int main (int argc, char ** argv) r_o = Zero(); -#define 
BENCH_DW(A,in,out) \ +#define BENCH_DW(A,...) \ Dw.CayleyZeroCounters(); \ - Dw. A (in,out); \ + Dw. A (__VA_ARGS__); \ FGrid->Barrier(); \ t0=usecond(); \ for(int i=0;iBarrier(); \ @@ -143,23 +150,10 @@ int main (int argc, char ** argv) std::cout<Barrier(); \ - t0=usecond(); \ - for(int i=0;iBarrier(); \ - Dw.CayleyReport(); \ - std::cout< Date: Wed, 8 May 2019 11:51:37 +0100 Subject: [PATCH 10/18] duplicate CayleyFermion5D for gpu --- Grid/qcd/action/fermion/CayleyFermion5D.h | 4 + Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc | 247 ++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index e4587308..916bd0c0 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -216,9 +216,13 @@ NAMESPACE_END(Grid); template void CayleyFermion5D< A >::MooeeInv (const FermionField &psi, FermionField &chi); \ template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi); +#ifdef GRID_NVCC +#define CAYLEY_DPERP_GPU +#else #undef CAYLEY_DPERP_DENSE #define CAYLEY_DPERP_CACHE #undef CAYLEY_DPERP_LINALG +#endif #define CAYLEY_DPERP_VEC #endif diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc new file mode 100644 index 00000000..f99804a5 --- /dev/null +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -0,0 +1,247 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software 
Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include +#include + + +NAMESPACE_BEGIN(Grid); + +// Pminus forwards +// Pplus backwards.. +template +void CayleyFermion5D::M5D(const FermionField &psi_i, + const FermionField &phi_i, + FermionField &chi_i, + Vector &lower, + Vector &diag, + Vector &upper) +{ + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + auto psi = psi_i.View(); + auto phi = phi_i.View(); + auto chi = chi_i.View(); + int Ls =this->Ls; + assert(phi.Checkerboard() == psi.Checkerboard()); + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + M5Dcalls++; + M5Dtime-=usecond(); + + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls + for(int s=0;s +void CayleyFermion5D::M5Ddag(const FermionField &psi_i, + const FermionField &phi_i, + FermionField &chi_i, + Vector &lower, + Vector &diag, + Vector &upper) +{ + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + auto psi = psi_i.View(); + auto phi = phi_i.View(); + auto chi = chi_i.View(); + int Ls =this->Ls; + assert(phi.Checkerboard() == psi.Checkerboard()); + + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); + + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls + auto tmp = psi[0]; + for(int s=0;s
+void CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi_i) +{ + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + + auto psi = psi_i.View(); + auto chi = chi_i.View(); + + int Ls=this->Ls; + + MooeeInvCalls++; + MooeeInvTime-=usecond(); + + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls + auto tmp = psi[0]; + + // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops + // Apply (L^{\prime})^{-1} + chi[ss]=psi[ss]; // chi[0]=psi[0] + for(int s=1;s=0;s--){ + spProj5m(tmp,chi[ss+s+1]); + chi[ss+s] = chi[ss+s] - uee[s]*tmp; + } + }); + + MooeeInvTime+=usecond(); + +} + +template +void CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i) +{ + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + int Ls=this->Ls; + + auto psi = psi_i.View(); + auto chi = chi_i.View(); + + assert(psi.Checkerboard() == psi.Checkerboard()); + + MooeeInvCalls++; + MooeeInvTime-=usecond(); + + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls + + auto tmp = psi[0]; + + // Apply (U^{\prime})^{-dagger} + chi[ss]=psi[ss]; + for (int s=1;s=0;s--){ + spProj5p(tmp,chi[ss+s+1]); + chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp; + } + }); + + MooeeInvTime+=usecond(); + +} + +#ifdef CAYLEY_DPERP_GPU +INSTANTIATE_DPERP(WilsonImplF); +INSTANTIATE_DPERP(WilsonImplD); +INSTANTIATE_DPERP(GparityWilsonImplF); +INSTANTIATE_DPERP(GparityWilsonImplD); +INSTANTIATE_DPERP(ZWilsonImplF); +INSTANTIATE_DPERP(ZWilsonImplD); + +INSTANTIATE_DPERP(WilsonImplFH); +INSTANTIATE_DPERP(WilsonImplDF); +INSTANTIATE_DPERP(GparityWilsonImplFH); +INSTANTIATE_DPERP(GparityWilsonImplDF); +INSTANTIATE_DPERP(ZWilsonImplFH); +INSTANTIATE_DPERP(ZWilsonImplDF); +#endif + +NAMESPACE_END(Grid); From 2b3c22f03df863de1337082e4336d0ec305a5807 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Wed, 8 May 2019 12:01:11 +0100 Subject: [PATCH 11/18] bandwidth dependent on grid default precision --- 
Grid/qcd/action/fermion/CayleyFermion5D.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.cc b/Grid/qcd/action/fermion/CayleyFermion5D.cc index ecb87b2f..afffba65 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc @@ -168,10 +168,10 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - // Bytes = sizeof(RealD) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting) + // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting) // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 ) // write = 1 - RealD Gbytes = sizeof(RealD) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9; + RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9; std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl; } From f1744b3f01e81f6e6ce5c015185cb535e10af276 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Thu, 9 May 2019 11:17:55 +0100 Subject: [PATCH 12/18] M5D offloaded to GPU --- Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index f99804a5..83f119ca 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -50,35 +50,39 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, auto psi = psi_i.View(); auto phi = phi_i.View(); auto chi = chi_i.View(); + Coeff_t *lower_v = &lower[0]; + Coeff_t *diag_v = &diag[0]; + Coeff_t *upper_v = &upper[0]; int Ls =this->Ls; assert(phi.Checkerboard() == psi.Checkerboard()); + + const uint64_t nsimd = grid->Nsimd(); + const uint64_t 
sites4d = nsimd * grid->oSites() / Ls; + // 10 = 3 complex mult + 2 complex add // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) M5Dcalls++; M5Dtime-=usecond(); - - thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls + + typedef typename SiteSpinor::scalar_object ScalarSiteSpinor; + + accelerator_loopN( sss, sites4d ,{ + uint64_t lane = sss % nsimd; + uint64_t ss = Ls * (sss / nsimd); + for(int s=0;s Date: Thu, 9 May 2019 11:19:39 +0100 Subject: [PATCH 13/18] remove unused typedef --- Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index 83f119ca..6b532c7a 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -64,8 +64,6 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, M5Dcalls++; M5Dtime-=usecond(); - typedef typename SiteSpinor::scalar_object ScalarSiteSpinor; - accelerator_loopN( sss, sites4d ,{ uint64_t lane = sss % nsimd; uint64_t ss = Ls * (sss / nsimd); From 22e35c9ddd9d8ccd812b875695c7b01770f0e11f Mon Sep 17 00:00:00 2001 From: gfilaci Date: Fri, 10 May 2019 12:23:39 +0100 Subject: [PATCH 14/18] M5Ddag offloaded to GPU --- Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index 6b532c7a..d184b70e 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -99,35 +99,37 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi_i, auto psi = psi_i.View(); auto phi = phi_i.View(); auto chi = chi_i.View(); + Coeff_t *lower_v = &lower[0]; + Coeff_t *diag_v = &diag[0]; + Coeff_t *upper_v = &upper[0]; int Ls =this->Ls; assert(phi.Checkerboard() == psi.Checkerboard()); - // Flops = 6.0*(Nc*Ns) *Ls*vol + const uint64_t 
nsimd = grid->Nsimd(); + const uint64_t sites4d = nsimd * grid->oSites() / Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) M5Dcalls++; M5Dtime-=usecond(); - thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls - auto tmp = psi[0]; + accelerator_loopN( sss, sites4d ,{ + uint64_t lane = sss % nsimd; + uint64_t ss = Ls * (sss / nsimd); + for(int s=0;s Date: Mon, 13 May 2019 12:37:12 +0100 Subject: [PATCH 15/18] MooeeInv offloaded to GPU --- Grid/qcd/action/fermion/CayleyFermion5D.cc | 7 +- Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc | 75 ++++++++++++------- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.cc b/Grid/qcd/action/fermion/CayleyFermion5D.cc index afffba65..be4f9127 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc @@ -180,11 +180,16 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; - +#ifdef GRID_NVCC + RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; +#else // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; +#endif } } diff --git 
a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index d184b70e..00f83c21 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -143,42 +143,59 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField auto psi = psi_i.View(); auto chi = chi_i.View(); - + Coeff_t *lee_v = &lee[0]; + Coeff_t *leem_v = &leem[0]; + Coeff_t *uee_v = &uee[0]; + Coeff_t *ueem_v = &ueem[0]; + Coeff_t *dee_v = &dee[0]; + int Ls=this->Ls; - + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites4d = nsimd * grid->oSites() / Ls; + + typedef typename SiteSpinor::scalar_object ScalarSiteSpinor; + MooeeInvCalls++; MooeeInvTime-=usecond(); - - thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls - auto tmp = psi[0]; - - // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops - // Apply (L^{\prime})^{-1} - chi[ss]=psi[ss]; // chi[0]=psi[0] - for(int s=1;s=0;s--){ - spProj5m(tmp,chi[ss+s+1]); - chi[ss+s] = chi[ss+s] - uee[s]*tmp; + res = extractLane(lane,chi[ss+s]); + res = (1.0/dee_v[s])*res - uee_v[s]*tmp - ueem_v[s]*acc; + spProj5m(tmp,res); + insertLane(lane,chi[ss+s],res); } }); - + MooeeInvTime+=usecond(); } From 955cc7790f6c236c3e025bc8639a65c03fde8545 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 13 May 2019 14:25:29 +0100 Subject: [PATCH 16/18] MooeeInvDag offloaded to GPU --- Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc | 82 +++++++++++-------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index 00f83c21..367c5ff1 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -205,48 +205,64 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - int Ls=this->Ls; - + auto psi 
= psi_i.View(); auto chi = chi_i.View(); - - assert(psi.Checkerboard() == psi.Checkerboard()); - + Coeff_t *lee_v = &lee[0]; + Coeff_t *leem_v = &leem[0]; + Coeff_t *uee_v = &uee[0]; + Coeff_t *ueem_v = &ueem[0]; + Coeff_t *dee_v = &dee[0]; + + int Ls=this->Ls; + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites4d = nsimd * grid->oSites() / Ls; + + typedef typename SiteSpinor::scalar_object ScalarSiteSpinor; + MooeeInvCalls++; MooeeInvTime-=usecond(); - - thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls - - auto tmp = psi[0]; - - // Apply (U^{\prime})^{-dagger} - chi[ss]=psi[ss]; - for (int s=1;s=0;s--){ - spProj5p(tmp,chi[ss+s+1]); - chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp; + res = extractLane(lane,chi[ss+s]); + res = conjugate(1.0/dee_v[s])*res - conjugate(lee_v[s])*tmp - conjugate(leem_v[s])*acc; + spProj5p(tmp,res); + insertLane(lane,chi[ss+s],res); } }); - + MooeeInvTime+=usecond(); - + } #ifdef CAYLEY_DPERP_GPU From e3c56fd9b3825e940df05f197188ae299a9793f2 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 13 May 2019 15:52:00 +0100 Subject: [PATCH 17/18] CayleyZeroCounters before benchmark loop --- benchmarks/Benchmark_mooee.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index 6233d333..cbeb6c2f 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -107,9 +107,9 @@ int main (int argc, char ** argv) #define BENCH_DW(A,...) \ - Dw.CayleyZeroCounters(); \ Dw. 
A (__VA_ARGS__); \ FGrid->Barrier(); \ + Dw.CayleyZeroCounters(); \ t0=usecond(); \ for(int i=0;iBarrier(); \ + zDw.CayleyZeroCounters(); \ t0=usecond(); \ for(int i=0;iBarrier(); \ + Dw.CayleyZeroCounters(); \ t0=usecond(); \ for(int i=0;i Date: Tue, 14 May 2019 15:35:54 +0100 Subject: [PATCH 18/18] fix inner product with thrust reduction --- Grid/lattice/Lattice_reduction.h | 57 ++++++++++---------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 2d92ead8..ba871d1f 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -23,12 +23,7 @@ Author: paboyle #include #ifdef GRID_NVCC -#include -#include -#include -#include -#include -#include +#include #endif NAMESPACE_BEGIN(Grid); @@ -41,23 +36,12 @@ template inline RealD norm2(const Lattice &arg){ return real(nrm); } -#if 0 -//#warning "ThrustReduce compiled" -//#include -template -vobj ThrustNorm(const Lattice &lat) +#ifdef GRID_NVCC +template +struct innerProductFunctor : public thrust::binary_function { - typedef typename vobj::scalar_type scalar_type; - auto lat_v=lat.View(); - Integer s0=0; - Integer sN=lat_v.end(); - scalar_type sum = 0; - scalar_type * begin = (scalar_type *)&lat_v[s0]; - scalar_type * end = (scalar_type *)&lat_v[sN]; - thrust::reduce(begin,end,sum); - std::cout <<" thrust::reduce sum "<< sum << std::endl; - return sum; -} + accelerator R operator()(T x, T y) { return innerProduct(x,y); } +}; #endif // Double inner product @@ -75,24 +59,17 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ auto left_v = left.View(); auto right_v=right.View(); -#if 0 +#ifdef GRID_NVCC - typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t; + typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; + thrust::plus binary_sum; + innerProductFunctor binary_inner_p; + Integer sN = left_v.end(); + inner_t zero = Zero(); + // is there a way 
of using the efficient thrust reduction while maintaining memory coalescing? + inner_t vnrm = thrust::inner_product(thrust::device, &left_v[0], &left_v[sN], &right_v[0], zero, binary_sum, binary_inner_p); + nrm = Reduce(TensorRemove(vnrm));// sum across simd - Lattice inner_tmp(grid); - - ///////////////////////// - // localInnerProduct - ///////////////////////// - auto inner_tmp_v = inner_tmp.View(); - accelerator_loop(ss,left_v,{ - inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss])); - }); - ///////////////////////// - // and site sum the scalars - ///////////////////////// - inner_t vnrm = ThrustNorm(inner_tmp); - auto vvnrm = vnrm; #else thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; @@ -108,9 +85,9 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ vector_type vvnrm; vvnrm=Zero(); // sum across threads for(int i=0;iSumArraySize();i++){ vvnrm = vvnrm+sumarray[i]; - } -#endif + } nrm = Reduce(vvnrm);// sum across simd +#endif right.Grid()->GlobalSum(nrm); return nrm; }