From 70e276e1ab2c3243de279e440a67d3865679ba9b Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 28 Jan 2018 01:01:14 +0000 Subject: [PATCH] parallel_for elimination -> thread_loop --- lib/algorithms/CoarsenedMatrix.h | 12 +- .../iterative/ImplicitlyRestartedLanczos.h | 10 +- lib/cshift/Cshift_common.h | 16 +- lib/lattice/Lattice_reduction.h | 21 ++- .../action/fermion/CayleyFermion5Dcache.cc | 16 +- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 16 +- .../fermion/DomainWallEOFAFermioncache.cc | 16 +- .../fermion/DomainWallEOFAFermionvec.cc | 16 +- lib/qcd/action/fermion/FermionOperatorImpl.h | 26 +-- .../fermion/ImprovedStaggeredFermion5D.cc | 12 +- .../action/fermion/MobiusEOFAFermioncache.cc | 32 ++-- .../action/fermion/MobiusEOFAFermionvec.cc | 24 +-- lib/qcd/action/fermion/SchurDiagTwoKappa.h | 4 +- lib/qcd/action/fermion/WilsonFermion.cc | 77 ++++---- lib/qcd/action/fermion/WilsonFermion5D.cc | 166 +++++++++--------- lib/qcd/action/gauge/GaugeImplTypes.h | 4 +- .../action/scalar/ScalarInteractionAction.h | 8 +- lib/qcd/utils/LinalgUtils.h | 32 ++-- lib/qcd/utils/SUn.h | 8 +- lib/stencil/Stencil.h | 30 ++-- lib/threads/Pragmas.h | 23 +-- 21 files changed, 269 insertions(+), 300 deletions(-) diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h index 08df1978..9adc2e68 100644 --- a/lib/algorithms/CoarsenedMatrix.h +++ b/lib/algorithms/CoarsenedMatrix.h @@ -128,9 +128,9 @@ public: for(int i=0;ioSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ eProj[ss](i)=CComplex(1.0); - } + }); eProj=eProj - iProj; std::cout< compressor; Stencil.HaloExchange(in,compressor); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ siteVector res = Zero(); siteVector nbr; int ptype; @@ -291,7 +291,7 @@ public: res = res + A[point][ss]*nbr; } vstream(out[ss],res); - } + }); return norm2(out); }; @@ -384,14 +384,14 @@ public: Subspace.ProjectToSubspace(oProj,oblock); // blockProject(iProj,iblock,Subspace.subspace); // blockProject(oProj,oblock,Subspace.subspace); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for(int j=0;j &basis,Eigen::MatrixXd& Qt,int j0, int j1, i typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); - parallel_region + thread_region { std::vector < vobj > B(Nm); // Thread private - parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ + thread_loop_in_region( (int ss=0;ss < grid->oSites();ss++),{ for(int j=j0; j &basis,Eigen::MatrixXd& Qt,int j0, int j1, i for(int j=j0; j &basis,Eigen::MatrixXd& Qt,in GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); - parallel_for(int ss=0;ss < grid->oSites();ss++){ + thread_loop( (int ss=0;ss < grid->oSites();ss++),{ vobj B = Zero(); for(int k=k0; k diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 778855ef..efec7c67 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -48,7 +48,7 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int stride=rhs.Grid()->_slice_stride[dimension]; if ( cbmask == 0x3 ) { - thread_loop_collapse( (int n=0;n &rhs,std::vector_slice_stride[dimension]; if ( cbmask ==0x3){ - thread_loop_collapse( (int n=0;n &rhs,std::vector void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; if ( cbmask ==0x3 ) { - thread_loop_collapse( (int n=0;n_slice_stride[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension]; @@ -184,7 +184,7 @@ template void Scatter_plane_merge(Lattice 
&rhs,std::vector_slice_block[dimension]; if(cbmask ==0x3 ) { - thread_loop_collapse( (int n=0;n_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; @@ -228,7 +228,7 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int e2=rhs.Grid()->_slice_block[dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; if(cbmask == 0x3 ){ - thread_loop_collapse( (int n=0;n void Copy_plane(Lattice& lhs,const Lattice &rhs } }); } else { - thread_loop_collapse( (int n=0;nCheckerBoardFromOindex(o); @@ -266,7 +266,7 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_block [dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - thread_loop_collapse( (int n=0;n &left,const Lattice &righ std::vector > sumarray(grid->SumArraySize()); - parallel_for(int thr=0;thrSumArraySize();thr++){ + thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff); @@ -57,7 +57,7 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ vnrm = vnrm + innerProductD(left[ss],right[ss]); } sumarray[thr]=TensorRemove(vnrm) ; - } + }); vector_type vvnrm; vvnrm=Zero(); // sum across threads for(int i=0;iSumArraySize();i++){ @@ -104,7 +104,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) sumarray[i]=Zero(); } - parallel_for(int thr=0;thrSumArraySize();thr++){ + thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; GridThread::GetWork(grid->oSites(),thr,mywork,myoff); @@ -113,7 +113,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) vvsum = vvsum + arg[ss]; } sumarray[thr]=vvsum; - } + }); vobj vsum=Zero(); // sum across threads for(int i=0;iSumArraySize();i++){ @@ -172,8 +172,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir - // Parallel over orthog direction - parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -183,7 +182,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< lvSum[r]=lvSum[r]+Data[ss]; } } - } + }); // Sum across simd lanes in the plane, breaking out orthog dir. std::vector icoor(Nd); @@ -252,7 +251,7 @@ static void sliceInnerProductVector( std::vector & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -263,7 +262,7 @@ static void sliceInnerProductVector( std::vector & result, const Latti lvSum[r]=lvSum[r]+vv; } } - } + }); // Sum across simd lanes in the plane, breaking out orthog dir. 
std::vector icoor(Nd); @@ -359,12 +358,12 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice tensor_reduced at; at=av; - parallel_for_nest2(int n=0;n::M5D(const FermionField &psi, M5Dcalls++; M5Dtime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls for(int s=0;s::M5D(const FermionField &psi, chi[ss+s]=chi[ss+s]+lower[s]*tmp; } } - } + }); M5Dtime+=usecond(); } @@ -97,7 +97,7 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, M5Dcalls++; M5Dtime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls auto tmp = psi[0]; for(int s=0;s::M5Ddag(const FermionField &psi, chi[ss+s]=chi[ss+s]+lower[s]*tmp; } } - } + }); M5Dtime+=usecond(); } @@ -135,7 +135,7 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & MooeeInvCalls++; MooeeInvTime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls auto tmp = psi[0]; // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops @@ -163,7 +163,7 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & spProj5m(tmp,chi[ss+s+1]); chi[ss+s] = chi[ss+s] - uee[s]*tmp; } - } + }); MooeeInvTime+=usecond(); @@ -193,7 +193,7 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & MooeeInvCalls++; MooeeInvTime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls auto tmp = psi[0]; @@ -221,7 +221,7 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & spProj5p(tmp,chi[ss+s+1]); chi[ss+s] = chi[ss+s] - leec[s]*tmp; } - } + }); MooeeInvTime+=usecond(); diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index eaed9b9c..04732567 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -93,7 +93,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, assert(Nc==3); - parallel_for(int ss=0;ssoSites();ss+=LLs){ // adds LLs + thread_loop( (int ss=0;ssoSites();ss+=LLs),{ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; @@ -190,7 +190,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, } #endif - } + }); M5Dtime+=usecond(); } @@ -233,7 +233,7 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, M5Dcalls++; M5Dtime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=LLs){ // adds LLs + thread_loop( (int ss=0;ssoSites();ss+=LLs),{ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; @@ -327,7 +327,7 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, vstream(chi[ss+v]()(3)(2),p_32); } #endif - } + }); M5Dtime+=usecond(); } @@ -792,13 +792,13 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField MooeeInvTime-=usecond(); if ( switcheroo::iscomplex() ) { - parallel_for(auto site=0;site::M5D(const FermionField& psi, const FermionFiel this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ // adds Ls for(int s=0; s::M5D(const FermionField& psi, const FermionFiel chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -90,7 +90,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionF this->M5Dcalls++; this->M5Dtime -= usecond(); - 
parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls auto tmp = psi[0]; for(int s=0; s::M5Ddag(const FermionField& psi, const FermionF chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -126,7 +126,7 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi, FermionField this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -158,7 +158,7 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi, FermionField spProj5m(tmp1, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1; } - } + }); this->MooeeInvTime += usecond(); } @@ -190,7 +190,7 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi, FermionFi this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -221,7 +221,7 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi, FermionFi spProj5p(tmp1, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - leec[s]*tmp1; } - } + }); this->MooeeInvTime += usecond(); } diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc index 3952ce9d..3d24999c 100644 --- a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc @@ -89,7 +89,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionFiel assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -191,7 +191,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionFiel } #endif - } + }); this->M5Dtime += usecond(); } @@ -232,7 +232,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionF this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop((int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -330,7 +330,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionF } #endif - } + }); this->M5Dtime += usecond(); } @@ -566,13 +566,13 @@ void DomainWallEOFAFermion::MooeeInternal(const FermionField& psi, Fermion this->MooeeInvTime -= usecond(); if(switcheroo::iscomplex()){ - parallel_for(auto site=0; siteMooeeInvTime += usecond(); diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 99a66969..f4748eee 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -268,13 +268,13 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - parallel_for(int sss=0;sssoSites();sss++){ + thread_loop( (int sss=0;sssoSites();sss++),{ int sU=sss; for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here } - } + }); PokeIndex(mat,tmp,mu); } @@ -591,10 +591,10 @@ public: Uconj = where(coor==neglink,-Uconj,Uconj); } - parallel_for(auto ss=U.begin();ss(outerProduct(Btilde, A)); - parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) { + thread_loop((auto ss = tmp.begin(); ss < tmp.end(); ss++), { link[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1)); - } + }); PokeIndex(mat, link, mu); return; } @@ -640,13 +640,13 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - 
parallel_for(int ss = 0; ss < tmp.Grid()->oSites(); ss++) { + thread_loop((int ss = 0; ss < tmp.Grid()->oSites(); ss++) ,{ for (int s = 0; s < Ls; s++) { int sF = s + Ls * ss; auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); } - } + }); PokeIndex(mat, tmp, mu); return; } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 9dbb0246..a4b59783 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -172,13 +172,13 @@ void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionFi Compressor compressor; Stencil.HaloExchange(in,compressor); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for(int s=0;s @@ -240,15 +240,15 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion if (dag == DaggerYes) { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), { int sU=ss; Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); - } + }); } else { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{ int sU=ss; Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); - } + }); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); diff --git a/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc b/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc index bdbc6fe0..c730e91b 100644 --- a/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc +++ b/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc @@ -50,7 +50,7 @@ void MobiusEOFAFermion::M5D(const FermionField &psi, const FermionField &p this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ for(int s=0; s::M5D(const FermionField &psi, const FermionField &p chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -91,7 +91,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi, const FermionFi this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ for(int s=0; s::M5D_shift(const FermionField &psi, const FermionFi else{ spProj5m(tmp, psi[ss+shift_s]); } chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp; } - } + }); this->M5Dtime += usecond(); } @@ -133,7 +133,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi, const FermionField this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp = psi[0]; for(int s=0; s::M5Ddag(const FermionField &psi, const FermionField chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -174,7 +174,7 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi, const Fermio this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ chi[ss+Ls-1] = Zero(); auto tmp = psi[0]; for(int s=0; s::M5Ddag_shift(const FermionField &psi, const Fermio else{ spProj5m(tmp, psi[ss+s]); } chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp; } - } + }); this->M5Dtime += usecond(); } @@ -216,7 +216,7 @@ void 
MobiusEOFAFermion::MooeeInv(const FermionField &psi, FermionField &ch this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp = psi[0]; @@ -245,7 +245,7 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi, FermionField &ch spProj5m(tmp, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - this->uee[s]*tmp; } - } + }); this->MooeeInvTime += usecond(); } @@ -261,7 +261,7 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi, FermionFie this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -300,7 +300,7 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi, FermionFie spProj5m(tmp1, chi[ss+s]); chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj; } - } + }); this->MooeeInvTime += usecond(); } @@ -318,7 +318,7 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi, FermionField this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp = psi[0]; @@ -347,7 +347,7 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi, FermionField spProj5p(tmp, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - this->lee[s]*tmp; } - } + }); this->MooeeInvTime += usecond(); } @@ -363,7 +363,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi, Fermion this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -401,7 +401,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi, Fermion spProj5p(tmp1, chi[ss+s]); chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj; } - } + }); this->MooeeInvTime += usecond(); } diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc index f944b3be..330dab36 100644 --- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc @@ -100,7 +100,7 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& p assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -202,7 +202,7 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& p } #endif - } + }); this->M5Dtime += usecond(); } @@ -263,7 +263,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionFi assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs int vs = (this->pm == 1) ? LLs-1 : 0; Simd hs_00 = (this->pm == 1) ? 
psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0); @@ -381,7 +381,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionFi vstream(chi[ss+v]()(3)(1), p_31); vstream(chi[ss+v]()(3)(2), p_32); } - } + }); this->M5Dtime += usecond(); @@ -424,7 +424,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -525,7 +525,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField #endif - } + }); this->M5Dtime += usecond(); } @@ -584,7 +584,7 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const Fermio this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs int vs = (this->pm == 1) ? LLs-1 : 0; Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0); @@ -703,7 +703,7 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const Fermio } - } + }); this->M5Dtime += usecond(); @@ -943,13 +943,13 @@ void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionFiel this->MooeeInvTime -= usecond(); if(switcheroo::iscomplex()){ - parallel_for(auto site=0; siteMooeeInvTime += usecond(); diff --git a/lib/qcd/action/fermion/SchurDiagTwoKappa.h b/lib/qcd/action/fermion/SchurDiagTwoKappa.h index a8de7632..c6e5470b 100644 --- a/lib/qcd/action/fermion/SchurDiagTwoKappa.h +++ b/lib/qcd/action/fermion/SchurDiagTwoKappa.h @@ -54,10 +54,10 @@ public: out.Checkerboard() = in.Checkerboard(); assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ vobj tmp = s[ss % Ls]*in[ss]; vstream(out[ss],tmp); - } + }); } RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) { diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index a398a7ff..4d7d340c 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -216,9 +216,9 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// // Call the single hop //////////////////////// - parallel_for (int sss = 0; sss < B.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++) ,{ Kernels::DhopDirK(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); - } + }); ////////////////////////////////////////////////// // spin trace outer product @@ -317,9 +317,9 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, Stencil.HaloExchange(in, compressor); - parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{ Kernels::DhopDirK(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); - } + }); }; template @@ -333,13 +333,13 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, st.HaloExchange(in, compressor); if (dag == DaggerYes) { - parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{ Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); - } + }); } else { - parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{ 
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); - } + }); } }; @@ -366,8 +366,7 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, // Inefficient comms method but not performance critical. tmp1 = Cshift(q_in_1, mu, 1); tmp2 = Cshift(q_in_2, mu, 1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), { Kernels::ContractConservedCurrentSiteFwd(tmp1[sU], q_in_2[sU], q_out[sU], @@ -376,7 +375,7 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, tmp2[sU], q_out[sU], Umu, sU, mu); - } + }); } template @@ -415,38 +414,36 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, tmp = ph*q_in; tmpBwd = Cshift(tmp, mu, -1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. - vInteger t_mask = ((coords[sU] >= tmin) && - (coords[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask); + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), { - if (timeSlices > 0) - { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sU], - q_out[sU], - Umu, sU, mu, t_mask); - } + // Compute the sequential conserved current insertion only if our simd + // object contains a timeslice we need. + vInteger t_mask = ((coords[sU] >= tmin) && + (coords[sU] <= tmax)); + Integer timeSlices = Reduce(t_mask); - // Repeat for backward direction. - t_mask = ((coords[sU] >= (tmin + tshift)) && - (coords[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); - - timeSlices = Reduce(t_mask); - - if (timeSlices > 0) - { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sU], - q_out[sU], - Umu, sU, mu, t_mask); - } + if (timeSlices > 0) { + Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sU], + q_out[sU], + Umu, sU, mu, t_mask); } + + // Repeat for backward direction. 
+ t_mask = ((coords[sU] >= (tmin + tshift)) && + (coords[sU] <= (tmax + tshift))); + + //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) + unsigned int t0 = 0; + if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); + + timeSlices = Reduce(t_mask); + + if (timeSlices > 0) { + Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sU], + q_out[sU], + Umu, sU, mu, t_mask); + } + }); } FermOpTemplateInstantiate(WilsonFermion); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index da62a633..f32df1fd 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -244,13 +244,13 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in assert(dirdisp<=7); assert(dirdisp>=0); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for(int s=0;s @@ -293,7 +293,7 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, //////////////////////// DerivDhopComputeTime -= usecond(); - parallel_for (int sss = 0; sss < U.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < U.Grid()->oSites(); sss++) ,{ for (int s = 0; s < Ls; s++) { int sU = sss; int sF = s + Ls * sU; @@ -307,7 +307,7 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, // spin trace outer product //////////////////////////// } - } + }); //////////////////////////// // spin trace outer product //////////////////////////// @@ -464,18 +464,18 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopComputeTime2-=usecond(); if (dag == DaggerYes) { int sz=st.surface_list.size(); - parallel_for (int ss = 0; ss < sz; ss++) { + thread_loop( (int ss = 0; ss < sz; ss++) ,{ int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } + }); } else { int sz=st.surface_list.size(); - parallel_for (int ss = 0; ss < sz; ss++) { + thread_loop( (int ss = 0; ss < sz; ss++) ,{ int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } + }); } DhopComputeTime2+=usecond(); #else @@ -502,17 +502,17 @@ void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOr // Dhop takes the 4d grid from U, and makes a 5d index for fermion if (dag == DaggerYes) { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{ int sU = ss; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); - } + }); } else { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{ int sU = ss; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); - } + }); } DhopComputeTime+=usecond(); } @@ -739,41 +739,37 @@ void WilsonFermion5D::ContractConservedCurrent(PropagatorField &q_in_1, // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one. 
tmp1 = Cshift(q_in_1, mu + 1, 1); tmp2 = Cshift(q_in_2, mu + 1, 1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { - unsigned int sF1 = sU * LLs; - unsigned int sF2 = (sU + 1) * LLs - 1; + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), { + unsigned int sF1 = sU * LLs; + unsigned int sF2 = (sU + 1) * LLs - 1; + + for (unsigned int s = 0; s < LLs; ++s) { - for (unsigned int s = 0; s < LLs; ++s) - { - bool axial_sign = ((curr_type == Current::Axial) && \ - (s < (LLs / 2))); - SitePropagator qSite2, qmuSite2; - - // If vectorised in 5th dimension, reverse q2 vector to match up - // sites correctly. - if (Impl::LsVectorised) - { - REVERSE_LS(q_in_2[sF2], qSite2, Ls / LLs); - REVERSE_LS(tmp2[sF2], qmuSite2, Ls / LLs); - } - else - { - qSite2 = q_in_2[sF2]; - qmuSite2 = tmp2[sF2]; - } - Kernels::ContractConservedCurrentSiteFwd(tmp1[sF1], - qSite2, - q_out[sU], - Umu, sU, mu, axial_sign); - Kernels::ContractConservedCurrentSiteBwd(q_in_1[sF1], - qmuSite2, - q_out[sU], - Umu, sU, mu, axial_sign); - sF1++; - sF2--; - } + bool axial_sign = ((curr_type == Current::Axial) && \ + (s < (LLs / 2))); + SitePropagator qSite2, qmuSite2; + + // If vectorised in 5th dimension, reverse q2 vector to match up + // sites correctly. + if (Impl::LsVectorised) { + REVERSE_LS(q_in_2[sF2], qSite2, Ls / LLs); + REVERSE_LS(tmp2[sF2], qmuSite2, Ls / LLs); + } else { + qSite2 = q_in_2[sF2]; + qmuSite2 = tmp2[sF2]; + } + Kernels::ContractConservedCurrentSiteFwd(tmp1[sF1], + qSite2, + q_out[sU], + Umu, sU, mu, axial_sign); + Kernels::ContractConservedCurrentSiteBwd(q_in_1[sF1], + qmuSite2, + q_out[sU], + Umu, sU, mu, axial_sign); + sF1++; + sF2--; } + }); } @@ -817,50 +813,46 @@ void WilsonFermion5D::SeqConservedCurrent(PropagatorField &q_in, tmp = ph*q_in; tmpBwd = Cshift(tmp, mu + 1, -1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. - vInteger t_mask = ((coords[sU] >= tmin) && - (coords[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask); + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) ,{ + // Compute the sequential conserved current insertion only if our simd + // object contains a timeslice we need. + vInteger t_mask = ((coords[sU] >= tmin) && + (coords[sU] <= tmax)); + Integer timeSlices = Reduce(t_mask); + + if (timeSlices > 0) { - if (timeSlices > 0) - { - unsigned int sF = sU * LLs; - for (unsigned int s = 0; s < LLs; ++s) - { - bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); - Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sF], - q_out[sF], Umu, sU, - mu, t_mask, axial_sign); - ++sF; - } - } - - // Repeat for backward direction. 
- t_mask = ((coords[sU] >= (tmin + tshift)) && - (coords[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); - - timeSlices = Reduce(t_mask); - - if (timeSlices > 0) - { - unsigned int sF = sU * LLs; - for (unsigned int s = 0; s < LLs; ++s) - { - bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); - Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sF], - q_out[sF], Umu, sU, - mu, t_mask, axial_sign); - ++sF; - } - } + unsigned int sF = sU * LLs; + for (unsigned int s = 0; s < LLs; ++s) { + bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); + Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sF], + q_out[sF], Umu, sU, + mu, t_mask, axial_sign); + ++sF; + } } + + // Repeat for backward direction. + t_mask = ((coords[sU] >= (tmin + tshift)) && + (coords[sU] <= (tmax + tshift))); + + //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) + unsigned int t0 = 0; + if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); + + timeSlices = Reduce(t_mask); + + if (timeSlices > 0) { + unsigned int sF = sU * LLs; + for (unsigned int s = 0; s < LLs; ++s) { + bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); + Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sF], + q_out[sF], Umu, sU, + mu, t_mask, axial_sign); + ++sF; + } + } + }); } FermOpTemplateInstantiate(WilsonFermion5D); diff --git a/lib/qcd/action/gauge/GaugeImplTypes.h b/lib/qcd/action/gauge/GaugeImplTypes.h index 8a8ecf62..fbe60039 100644 --- a/lib/qcd/action/gauge/GaugeImplTypes.h +++ b/lib/qcd/action/gauge/GaugeImplTypes.h @@ -101,10 +101,10 @@ public: //static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for (int mu = 0; mu < Nd; mu++) U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]); - } + }); //auto end = std::chrono::high_resolution_clock::now(); // diff += end - start; diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index d3ef28b3..244bee2e 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -82,7 +82,7 @@ public: action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared; for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils - parallel_for (int i = 0; i < p.Grid()->oSites(); i++) { + thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{ int permute_type; StencilEntry *SE; vobj temp2; @@ -101,7 +101,7 @@ public: } else { action[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; } - } + }); // action -= pshift*p + p*pshift; } // NB the trace in the algebra is normalised to 1/2 @@ -118,7 +118,7 @@ public: //for (int mu = 0; mu < Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); for (int point = 0; point < npoint; point++) { - parallel_for (int i = 0; i < p.Grid()->oSites(); i++) { + thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{ const vobj *temp; vobj temp2; int permute_type; @@ -136,7 +136,7 @@ public: } else { force[i] -= phiStencil.CommBuf()[SE->_offset]; } - } + }); } } }; diff --git a/lib/qcd/utils/LinalgUtils.h b/lib/qcd/utils/LinalgUtils.h index 
a1a40619..cc023faf 100644 --- a/lib/qcd/utils/LinalgUtils.h +++ b/lib/qcd/utils/LinalgUtils.h @@ -48,12 +48,12 @@ void axpibg5x(Lattice &z,const Lattice &x,Coeff a,Coeff b) GridBase *grid=x.Grid(); Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ vobj tmp; tmp = a*x[ss]; tmp = tmp + G5*(b*timesI(x[ss])); vstream(z[ss],tmp); - } + }); } template @@ -64,10 +64,10 @@ void axpby_ssp(Lattice &z, Coeff a,const Lattice &x,Coeff b,const La conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp = a*x[ss+s]+b*y[ss+sp]; vstream(z[ss+s],tmp); - } + }); } template @@ -80,12 +80,12 @@ void ag5xpby_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; tmp = G5*x[ss+s]*a; tmp = tmp + b*y[ss+sp]; vstream(z[ss+s],tmp); - } + }); } template @@ -97,12 +97,12 @@ void axpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; tmp = G5*y[ss+sp]*b; tmp = tmp + a*x[ss+s]; vstream(z[ss+s],tmp); - } + }); } template @@ -115,13 +115,13 @@ void ag5xpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp1; vobj tmp2; tmp1 = a*x[ss+s]+b*y[ss+sp]; tmp2 = G5*tmp1; vstream(z[ss+s],tmp2); - } + }); } template @@ -132,12 +132,12 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; spProj5m(tmp,y[ss+sp]); tmp = a*x[ss+s]+b*tmp; vstream(z[ss+s],tmp); - } + }); } template @@ -148,12 +148,12 @@ void axpby_ssp_pplus(Lattice &z,Coeff a,const Lattice &x,Coeff b,con conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; spProj5p(tmp,y[ss+sp]); tmp = a*x[ss+s]+b*tmp; vstream(z[ss+s],tmp); - } + }); } template @@ -164,14 +164,14 @@ void G5R5(Lattice &z,const Lattice &x) conformable(x,z); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls) { + thread_loop((int ss=0;ssoSites();ss+=Ls) { vobj tmp; for(int s=0;soSites(); ss++) { + thread_loop( (int ss = 0; ss < grid->oSites(); ss++) ,{ subgroup[ss]()()(0, 0) = source[ss]()()(i0, i0); subgroup[ss]()()(0, 1) = source[ss]()()(i0, i1); subgroup[ss]()()(1, 0) = source[ss]()()(i1, i0); @@ -238,7 +238,7 @@ public: // this should be purely real Determinant[ss] = Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0); - } + }); } ////////////////////////////////////////////////////////////////////////////////////////// @@ -253,12 +253,12 @@ public: su2SubGroupIndex(i0, i1, su2_index); dest = 1.0; // start out with identity - parallel_for (int ss = 0; ss < grid->oSites(); ss++) { + 
thread_loop( (int ss = 0; ss < grid->oSites(); ss++) ,{ dest[ss]()()(i0, i0) = subgroup[ss]()()(0, 0); dest[ss]()()(i0, i1) = subgroup[ss]()()(0, 1); dest[ss]()()(i1, i0) = subgroup[ss]()()(1, 0); dest[ss]()()(i1, i1) = subgroup[ss]()()(1, 1); - } + }); } /////////////////////////////////////////////// diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 87e29ae5..9812ab51 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -63,9 +63,9 @@ template void Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); - parallel_for(int i=0;i >& table,const L assert( (table.size()&0x1)==0); int num=table.size()/2; int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - parallel_for(int j=0;j void HaloExchange(const Lattice &source,compressor &compress) @@ -509,20 +501,20 @@ public: for(int i=0;i #ifdef GRID_OMP #define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; }; #define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; }; -#define thread_loop_collapse( range , ... ) _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ }; +// _Pragma takes a single string literal, so paste the collapse depth in via an indirection +#define DO_PRAGMA_(x) _Pragma(#x) +#define DO_PRAGMA(x) DO_PRAGMA_(x) +#define thread_loop_collapse( n, range , ... ) DO_PRAGMA(omp parallel for collapse(n)) for range { __VA_ARGS__ }; #define thread_region _Pragma("omp parallel") #define thread_critical _Pragma("omp critical") +#define thread_num(a) omp_get_thread_num() +#define thread_max(a) omp_get_max_threads() #else #define thread_loop( range , ... ) for range { __VA_ARGS__ ; }; #define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; }; -#define thread_loop_collapse( range , ... ) for range { __VA_ARGS__ ; }; +#define thread_loop_collapse( n, range , ... ) for range { __VA_ARGS__ ; }; #define thread_region #define thread_critical +#define thread_num(a) (0) +#define thread_max(a) (1) #endif -////////////////////////////////////////////////////////////////////////////////// -// Deprecated primitives; to eridicate when done delete and fix errors. -////////////////////////////////////////////////////////////////////////////////// - -#ifdef GRID_OMP -#define parallel_for _Pragma("omp parallel for schedule(static)") for -#define parallel_for_internal _Pragma("omp for schedule(static)") for -#define parallel_region thread_region -#define parallel_for_nest2 for -#else -#define parallel_for for -#define parallel_for_internal for -#define parallel_region -#define parallel_for_nest2 for
-#endif ////////////////////////////////////////////////////////////////////////////////// // Accelerator primitives; fall back to threading
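
For reviewers, a minimal standalone sketch of the two rewrite shapes this patch applies everywhere: parallel_for becomes thread_loop, and parallel_region / parallel_for_internal become thread_region / thread_loop_in_region. The macros below are simplified copies of the Pragmas.h definitions above; osites, Ls and the std::vector fields are illustrative stand-ins for Grid's lattice objects, not Grid code. Compile with -fopenmp for the threaded version; without it the pragmas are ignored and the program runs serially.

#include <cstdio>
#include <vector>

#define thread_loop( range , ... )           _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_region                        _Pragma("omp parallel")
// thread_loop_collapse now takes the collapse depth as its first argument at the
// call site, e.g. thread_loop_collapse( 2, (int n=0;n<e1;n++), {...});  (not exercised here).

int main(void)
{
  const int osites = 1024, Ls = 8;
  std::vector<double> in(osites*Ls, 1.0), out(osites*Ls, 0.0), scale(Ls);
  for(int s=0; s<Ls; s++) scale[s] = s + 1.0;

  // parallel_for(int ss=0;ss<osites;ss++){ ... }  becomes a two-argument macro call,
  // closed with "});" instead of a bare "}".
  thread_loop( (int ss=0; ss<osites; ss++),{
    for(int s=0; s<Ls; s++) out[ss*Ls+s] = scale[s]*in[ss*Ls+s];
  });

  // parallel_region { <thread-private temps>; parallel_for_internal(...){...} }  becomes
  // thread_region { ... thread_loop_in_region(...,{...}); } -- the basisRotate pattern above.
  thread_region
  {
    std::vector<double> tmp(Ls);   // thread-private workspace
    thread_loop_in_region( (int ss=0; ss<osites; ss++),{
      for(int s=0; s<Ls; s++) tmp[s] = out[ss*Ls+s];
      for(int s=0; s<Ls; s++) out[ss*Ls+s] = tmp[Ls-1-s];   // reverse in s as a stand-in operation
    });
  }

  printf("out[0..3] = %g %g %g %g\n", out[0], out[1], out[2], out[3]);   // expect 8 7 6 5
  return 0;
}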
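
The patch also adds thread_num/thread_max alongside the renamed loops; the per-thread partial-sum pattern in Lattice_reduction.h keeps its shape under the new names. Below is a hedged sketch of that pattern with plain doubles: the _OPENMP guard stands in for GRID_OMP, and the explicit chunking stands in for GridThread::GetWork, which this patch does not touch.

#include <cstdio>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_max(a) omp_get_max_threads()
#else
#define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
#define thread_max(a) (1)
#endif

int main(void)
{
  const int osites = 1<<16;
  std::vector<double> field(osites, 0.5);

  const int nthr = thread_max();
  std::vector<double> sumarray(nthr, 0.0);

  // One contiguous chunk of sites per thread index, accumulated into a private slot;
  // this mirrors the sum()/innerProduct() loops in Lattice_reduction.h above.
  thread_loop( (int thr=0; thr<nthr; thr++),{
    int mywork = (osites + nthr - 1)/nthr;    // explicit chunking in place of GridThread::GetWork
    int myoff  = thr*mywork;
    int myend  = (myoff + mywork < osites) ? myoff + mywork : osites;
    double vvsum = 0.0;
    for(int ss=myoff; ss<myend; ss++) vvsum = vvsum + field[ss];
    sumarray[thr] = vvsum;
  });

  double vsum = 0.0;                          // serial sum across threads, as in the original
  for(int i=0; i<nthr; i++) vsum = vsum + sumarray[i];
  printf("sum = %f (expect %f)\n", vsum, 0.5*osites);
  return 0;
}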