From b02d022993031f5fe58658a0ddb63a381cecb92f Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 23 Feb 2024 17:14:28 -0700 Subject: [PATCH] fixed race condition (thx michael) --- Grid/qcd/smearing/HISQSmearing.h | 59 +++++++++++++++++--------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index ac4cb8b6..6fc6993e 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -168,25 +168,26 @@ public: // We infer some types that will be needed in the calculation. typedef decltype(gStencil.GetEntry(0,0)) stencilElement; typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix; - stencilElement SE0, SE1, SE2, SE3, SE4, SE5; - U3matrix U0, U1, U2, U3, U4, U5, W; int Nsites = U_v.size(); + auto gStencil_v = gStencil.View(); -// accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs - for(int site=0;site_offset; - SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; - SE5 = gStencil.GetEntry(s+5,site); int x_m_mu = SE5->_offset; + SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu = SE5->_offset; // When you're deciding whether to take an adjoint, the question is: how is the // stored link oriented compared to the one you want? If I imagine myself travelling @@ -212,10 +213,12 @@ public: // But on GPU it's non-trivial and maps scalar object to vector object and vice versa. coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W); } - }//) + }) -// accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link - for(int site=0;site_offset; - SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd); U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd); @@ -248,10 +251,12 @@ public: sigmaIndex++; } } - }//) + }) -// accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link - for(int site=0;site_offset; - SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd); if(sigmaIndex<3) { @@ -286,7 +291,7 @@ public: sigmaIndex++; } } - }//) + }) } // end mu loop