From 9f280b82c4e56d8b32034dfbb83187e15add41e9 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 25 Jul 2017 11:30:41 -0400 Subject: [PATCH 01/14] Added mixed-precision CG with reliable updates --- lib/algorithms/Algorithms.h | 1 + .../ConjugateGradientReliableUpdate.h | 231 ++++++++++++++++++ tests/Test_dwf_mixedcg_prec_halfcomms.cc | 40 ++- 3 files changed, 260 insertions(+), 12 deletions(-) create mode 100644 lib/algorithms/iterative/ConjugateGradientReliableUpdate.h diff --git a/lib/algorithms/Algorithms.h b/lib/algorithms/Algorithms.h index 5123c7a1..f8dc2dc2 100644 --- a/lib/algorithms/Algorithms.h +++ b/lib/algorithms/Algorithms.h @@ -44,6 +44,7 @@ Author: Peter Boyle #include #include #include +#include // Lanczos support //#include diff --git a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h new file mode 100644 index 00000000..1aab064d --- /dev/null +++ b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h @@ -0,0 +1,231 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h + + Copyright (C) 2015 + +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H +#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H + +namespace Grid { + + template::value == 2, int>::type = 0,typename std::enable_if< getPrecision::value == 1, int>::type = 0> + class ConjugateGradientReliableUpdate : public LinearFunction { + public: + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + Integer ReliableUpdatesPerformed; + + bool DoFinalCleanup; //Final DP cleanup, defaults to true + Integer IterationsToCleanup; //Final DP cleanup step iterations + + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + GridBase* SinglePrecGrid; + RealD Delta; //reliable update parameter + + ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase &_Linop_f, LinearOperatorBase &_Linop_d, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + Delta(_delta), + Linop_f(_Linop_f), + Linop_d(_Linop_d), + SinglePrecGrid(_sp_grid), + ErrorOnNoConverge(err_on_no_conv), + DoFinalCleanup(true) + {}; + + void operator()(const FieldD &src, FieldD &psi) { + psi.checkerboard = src.checkerboard; + conformable(psi, src); + + RealD cp, c, a, d, b, ssq, qq, b_pred; + + FieldD p(src); + FieldD mmp(src); + FieldD r(src); + + // Initial residual computation & set up + RealD guess = norm2(psi); + assert(std::isnan(guess) == 0); + + Linop_d.HermOpAndNorm(psi, mmp, d, b); + + r = src - mmp; + p = r; + + a = norm2(p); + cp = a; + ssq = norm2(src); + + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl; + + RealD rsq = Tolerance * Tolerance * ssq; + + // Check if guess is really REALLY good :) + if (cp <= rsq) { + return; + } + + //Single prec initialization + FieldF r_f(SinglePrecGrid); + r_f.checkerboard = r.checkerboard; + precisionChange(r_f, r); + + FieldF psi_f(r_f); + psi_f = zero; + + FieldF p_f(r_f); + FieldF mmp_f(r_f); + + RealD MaxResidSinceLastRelUp = cp; //initial residual + + std::cout << GridLogIterative << std::setprecision(4) + << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; + + GridStopWatch LinalgTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + + SolverTimer.Start(); + int k = 0; + int l = 0; + + for (k = 1; k <= MaxIterations; k++) { + c = cp; + + MatrixTimer.Start(); + Linop_f.HermOpAndNorm(p_f, mmp_f, d, qq); + MatrixTimer.Stop(); + + LinalgTimer.Start(); + + a = c / d; + b_pred = a * (a * qq - d) / c; + + cp = axpy_norm(r_f, -a, mmp_f, r_f); + b = cp / c; + + // Fuse these loops ; should be really easy + psi_f = a * p_f + psi_f; + //p_f = p_f * b + r_f; + + LinalgTimer.Stop(); + + std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k + << " residual " << cp << " target " << rsq << std::endl; + std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl; + std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl; + + if(cp > MaxResidSinceLastRelUp){ + std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl; + MaxResidSinceLastRelUp = cp; + } + + // Stopping condition + if (cp <= rsq) { + //Although not written in the paper, I assume that I have to add on the final solution + precisionChange(mmp, psi_f); + psi = psi + mmp; + + + SolverTimer.Stop(); + Linop_d.HermOpAndNorm(psi, mmp, d, qq); + p = mmp - src; + + RealD srcnorm = sqrt(norm2(src)); + RealD resnorm = sqrt(norm2(p)); + RealD true_residual = resnorm / srcnorm; + + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)< CG(Tolerance,MaxIterations); + CG.ErrorOnNoConverge = ErrorOnNoConverge; + CG(Linop_d,src,psi); + IterationsToCleanup = CG.IterationsToComplete; + } + else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); + + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n"; + return; + } + else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate " + << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n"; + precisionChange(mmp, psi_f); + psi = psi + mmp; + + Linop_d.HermOpAndNorm(psi, mmp, d, qq); + r = src - mmp; + + psi_f = zero; + precisionChange(r_f, r); + cp = norm2(r); + MaxResidSinceLastRelUp = cp; + + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl; + + l = l+1; + } + + p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence + + } + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge" + << std::endl; + + if (ErrorOnNoConverge) assert(0); + IterationsToComplete = k; + ReliableUpdatesPerformed = l; + } + }; + + +}; + + + +#endif diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc index 9cc935d9..d6aaa21e 100644 --- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc +++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc @@ -80,31 +80,47 @@ int main (int argc, char ** argv) LatticeFermionD src_o(FrbGrid); - LatticeFermionD result_o(FrbGrid); - LatticeFermionD result_o_2(FrbGrid); + LatticeFermionD result_cg(FrbGrid); pickCheckerboard(Odd,src_o,src); - result_o.checkerboard = Odd; - result_o = zero; - result_o_2.checkerboard = Odd; - result_o_2 = zero; + result_cg.checkerboard = Odd; + result_cg = zero; + LatticeFermionD result_mcg(result_cg); + LatticeFermionD result_rlcg(result_cg); SchurDiagMooeeOperator HermOpEO(Ddwf); SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + //#define DO_MIXED_CG +#define DO_RLUP_CG + +#ifdef DO_MIXED_CG std::cout << "Starting mixed CG" << std::endl; MixedPrecisionConjugateGradient mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); mCG.InnerTolerance = 3.0e-5; - mCG(src_o,result_o); + mCG(src_o,result_mcg); +#endif +#ifdef DO_RLUP_CG + std::cout << "Starting reliable update CG" << std::endl; + ConjugateGradientReliableUpdate rlCG(1.e-8, 10000, 0.1, FrbGrid_f, HermOpEO_f, HermOpEO); + rlCG(src_o,result_rlcg); +#endif + std::cout << "Starting regular CG" << std::endl; ConjugateGradient CG(1.0e-8,10000); - CG(HermOpEO,src_o,result_o_2); + CG(HermOpEO,src_o,result_cg); - LatticeFermionD diff_o(FrbGrid); - RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); - - std::cout << "Diff between mixed and regular CG: " << diff << std::endl; +#ifdef DO_MIXED_CG + LatticeFermionD diff_mcg(FrbGrid); + RealD vdiff_mcg = axpy_norm(diff_mcg, -1.0, result_cg, result_mcg); + std::cout << "Diff between mixed and regular CG: " << vdiff_mcg << std::endl; +#endif +#ifdef DO_RLUP_CG + LatticeFermionD diff_rlcg(FrbGrid); + RealD vdiff_rlcg = axpy_norm(diff_rlcg, -1.0, result_cg, result_rlcg); + std::cout << "Diff between reliable update and regular CG: " << vdiff_rlcg << std::endl; +#endif Grid_finalize(); } From 9939b267d26f13cf5c152705741ea7655badece9 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 31 Jul 2017 13:39:44 -0400 Subject: [PATCH 02/14] Added switching to fallback linear operator in reliable update CG, and added recalculation of b parameter on update. --- .../ConjugateGradientReliableUpdate.h | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h index 1aab064d..13666f97 100644 --- a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h +++ b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h @@ -47,6 +47,11 @@ namespace Grid { LinearOperatorBase &Linop_d; GridBase* SinglePrecGrid; RealD Delta; //reliable update parameter + + //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single + LinearOperatorBase *Linop_fallback; + RealD fallback_transition_tol; + ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase &_Linop_f, LinearOperatorBase &_Linop_d, bool err_on_no_conv = true) : Tolerance(tol), @@ -56,10 +61,19 @@ namespace Grid { Linop_d(_Linop_d), SinglePrecGrid(_sp_grid), ErrorOnNoConverge(err_on_no_conv), - DoFinalCleanup(true) + DoFinalCleanup(true), + Linop_fallback(NULL) {}; + void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ + Linop_fallback = &_Linop_fallback; + fallback_transition_tol = _fallback_transition_tol; + } + void operator()(const FieldD &src, FieldD &psi) { + LinearOperatorBase *Linop_f_use = &Linop_f; + bool using_fallback = false; + psi.checkerboard = src.checkerboard; conformable(psi, src); @@ -93,6 +107,8 @@ namespace Grid { // Check if guess is really REALLY good :) if (cp <= rsq) { + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n"; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<HermOpAndNorm(p_f, mmp_f, d, qq); MatrixTimer.Stop(); LinalgTimer.Start(); @@ -206,12 +222,21 @@ namespace Grid { cp = norm2(r); MaxResidSinceLastRelUp = cp; + b = cp/c; + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl; l = l+1; } p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence + + if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){ + std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl; + Linop_f_use = Linop_fallback; + using_fallback = true; + } + } std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge" From ab50145001d7a2f27c9f724440e0bfd88a6373e2 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 22 Aug 2017 17:12:25 -0400 Subject: [PATCH 03/14] Implemented first, unoptimized version of hand-unrolled G-parity kernels Improved Test_gparity --- lib/qcd/action/fermion/FermionOperatorImpl.h | 81 +++- lib/qcd/action/fermion/WilsonKernelsHand.cc | 456 +++++++++++++------ tests/core/Test_gparity.cc | 148 +++--- 3 files changed, 485 insertions(+), 200 deletions(-) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 524179f5..1e344521 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -425,6 +425,22 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? //////////////////////////////////////////////////////////////////////////////////////// +namespace GparityWilsonImpl_helper{ + template + struct getAB; + + template + struct getAB{ + static inline A & ref(A &a, B &b){ return a; } + }; + template + struct getAB{ + static inline B & ref(A &a, B &b){ return b; } + }; +}; + + + template class GparityWilsonImpl : public ConjugateGaugeImpl > { public: @@ -462,7 +478,10 @@ class GparityWilsonImpl : public ConjugateGaugeImpl > tmp_full; + std::vector > tmp_half; + + GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p), tmp_full(GridThread::GetThreads()), tmp_half(GridThread::GetThreads()){}; bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; @@ -538,6 +557,66 @@ class GparityWilsonImpl : public ConjugateGaugeImpl + inline void loadLinkElement(Simd ®, ref &memory) { + reg = memory; + } + + template + void GparityTwistPermute(SiteSpinorType &into, const SiteSpinorType &from, const int direction, const int distance, const int perm, GridBase* grid){ + typedef typename SiteSpinorType::scalar_object sobj; + sobj stmp; + std::vector vals(grid->Nsimd()); + extract(from,vals); + std::vector icoor; + for(int s=0;sNsimd();s++){ + grid->iCoorFromIindex(icoor,s); + assert((icoor[direction]==0)||(icoor[direction]==1)); + + int permute_lane; + if ( distance == 1) { + permute_lane = icoor[direction]?1:0; + } else { + permute_lane = icoor[direction]?0:1; + } + if(perm) permute_lane = !permute_lane; + + if ( permute_lane ) { + stmp(0) = vals[s](1); + stmp(1) = vals[s](0); + vals[s] = stmp; + } + } + merge(into,vals); + } + + + template + const SiteSpinorType & GparityGetChi(int &g, SiteSpinorType const* in, const int dir, const int f, StencilEntry *SE, StencilImpl &st){ + const int mmu = dir % 4; + const int direction = st._directions[dir]; + const int sl = st._grid->_simd_layout[direction]; + const int perm = SE->_permute; + g = f; + + if(SE->_around_the_world && Params.twists[mmu]){ + if(sl == 1){ //not SIMD vectorized in G-parity direction so just change the flavor index accessed to implement the twist + g = (f+1) % 2; + return in[SE->_offset]; + }else{ //SIMD vectorized in Gparity direction + const int me = omp_get_thread_num(); + const int distance = st._distances[dir]; + assert(distance == -1 || distance == 1); + SiteSpinorType &tmp = GparityWilsonImpl_helper::getAB::ref(tmp_full[me], tmp_half[me]); + GparityTwistPermute(tmp, in[SE->_offset], direction, distance, perm, st._grid); + return tmp; + } + }else return in[SE->_offset]; + } + + + inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) { conformable(Uds._grid,GaugeGrid); diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 96b8ab0a..866e30d2 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -30,60 +30,77 @@ Author: paboyle #define REGISTER -#define LOAD_CHIMU \ - {const SiteSpinor & ref (in._odata[offset]); \ - Chimu_00=ref()(0)(0);\ - Chimu_01=ref()(0)(1);\ - Chimu_02=ref()(0)(2);\ - Chimu_10=ref()(1)(0);\ - Chimu_11=ref()(1)(1);\ - Chimu_12=ref()(1)(2);\ - Chimu_20=ref()(2)(0);\ - Chimu_21=ref()(2)(1);\ - Chimu_22=ref()(2)(2);\ - Chimu_30=ref()(3)(0);\ - Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2);} +#define LOAD_CHIMU_BODY(F) \ + Chimu_00=ref(F)(0)(0); \ + Chimu_01=ref(F)(0)(1); \ + Chimu_02=ref(F)(0)(2); \ + Chimu_10=ref(F)(1)(0); \ + Chimu_11=ref(F)(1)(1); \ + Chimu_12=ref(F)(1)(2); \ + Chimu_20=ref(F)(2)(0); \ + Chimu_21=ref(F)(2)(1); \ + Chimu_22=ref(F)(2)(2); \ + Chimu_30=ref(F)(3)(0); \ + Chimu_31=ref(F)(3)(1); \ + Chimu_32=ref(F)(3)(2) + +#define LOAD_CHIMU(DIR,F) \ + { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); } + +#define LOAD_CHIMU_GPARITY(DIR,F) \ + { int g; const SiteSpinor & ref = GparityGetChi(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); } + +#define LOAD_CHI_BODY(F) \ + Chi_00 = ref(F)(0)(0);\ + Chi_01 = ref(F)(0)(1);\ + Chi_02 = ref(F)(0)(2);\ + Chi_10 = ref(F)(1)(0);\ + Chi_11 = ref(F)(1)(1);\ + Chi_12 = ref(F)(1)(2) + +#define LOAD_CHI(DIR,F) \ + {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); } + +#define LOAD_CHI_GPARITY(DIR,F) \ + { int g; const SiteHalfSpinor &ref = GparityGetChi(g,buf,DIR,F,SE,st); LOAD_CHI_BODY(g); } -#define LOAD_CHI\ - {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = ref()(0)(0);\ - Chi_01 = ref()(0)(1);\ - Chi_02 = ref()(0)(2);\ - Chi_10 = ref()(1)(0);\ - Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);} // To splat or not to splat depends on the implementation -#define MULT_2SPIN(A)\ - {auto & ref(U._odata[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - UChi_00 = U_00*Chi_00;\ - UChi_10 = U_00*Chi_10;\ - UChi_01 = U_10*Chi_00;\ - UChi_11 = U_10*Chi_10;\ - UChi_02 = U_20*Chi_00;\ - UChi_12 = U_20*Chi_10;\ - UChi_00+= U_01*Chi_01;\ - UChi_10+= U_01*Chi_11;\ - UChi_01+= U_11*Chi_01;\ - UChi_11+= U_11*Chi_11;\ - UChi_02+= U_21*Chi_01;\ - UChi_12+= U_21*Chi_11;\ - Impl::loadLinkElement(U_00,ref()(0,2)); \ - Impl::loadLinkElement(U_10,ref()(1,2)); \ - Impl::loadLinkElement(U_20,ref()(2,2)); \ - UChi_00+= U_00*Chi_02;\ - UChi_10+= U_00*Chi_12;\ - UChi_01+= U_10*Chi_02;\ - UChi_11+= U_10*Chi_12;\ - UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;} +#define MULT_2SPIN_BODY \ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ + UChi_00 = U_00*Chi_00; \ + UChi_10 = U_00*Chi_10; \ + UChi_01 = U_10*Chi_00; \ + UChi_11 = U_10*Chi_10; \ + UChi_02 = U_20*Chi_00; \ + UChi_12 = U_20*Chi_10; \ + UChi_00+= U_01*Chi_01; \ + UChi_10+= U_01*Chi_11; \ + UChi_01+= U_11*Chi_01; \ + UChi_11+= U_11*Chi_11; \ + UChi_02+= U_21*Chi_01; \ + UChi_12+= U_21*Chi_11; \ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ + UChi_00+= U_00*Chi_02; \ + UChi_10+= U_00*Chi_12; \ + UChi_01+= U_10*Chi_02; \ + UChi_11+= U_10*Chi_12; \ + UChi_02+= U_20*Chi_02; \ + UChi_12+= U_20*Chi_12 + + +#define MULT_2SPIN(A,F) \ + {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; } + +#define MULT_2SPIN_GPARITY(A,F) \ + {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; } #define PERMUTE_DIR(dir) \ @@ -307,84 +324,85 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHIMU; \ + LOAD_CHIMU_IMPL(DIR,F); \ PROJ; \ if ( perm) { \ PERMUTE_DIR(PERM); \ } \ } else { \ - LOAD_CHI; \ + LOAD_CHI_IMPL(DIR,F); \ } \ - MULT_2SPIN(DIR); \ + MULT_2SPIN_IMPL(DIR,F); \ RECON; -#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHIMU; \ + LOAD_CHIMU_IMPL(DIR,F); \ PROJ; \ if ( perm) { \ PERMUTE_DIR(PERM); \ } \ } else if ( st.same_node[DIR] ) { \ - LOAD_CHI; \ + LOAD_CHI_IMPL(DIR,F); \ } \ if (local || st.same_node[DIR] ) { \ - MULT_2SPIN(DIR); \ + MULT_2SPIN_IMPL(DIR,F); \ RECON; \ } -#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ - LOAD_CHI; \ - MULT_2SPIN(DIR); \ + LOAD_CHI_IMPL(DIR,F); \ + MULT_2SPIN_IMPL(DIR,F); \ RECON; \ nmu++; \ } -#define HAND_RESULT(ss) \ +#define HAND_RESULT(ss,F) \ { \ SiteSpinor & ref (out._odata[ss]); \ - vstream(ref()(0)(0),result_00); \ - vstream(ref()(0)(1),result_01); \ - vstream(ref()(0)(2),result_02); \ - vstream(ref()(1)(0),result_10); \ - vstream(ref()(1)(1),result_11); \ - vstream(ref()(1)(2),result_12); \ - vstream(ref()(2)(0),result_20); \ - vstream(ref()(2)(1),result_21); \ - vstream(ref()(2)(2),result_22); \ - vstream(ref()(3)(0),result_30); \ - vstream(ref()(3)(1),result_31); \ - vstream(ref()(3)(2),result_32); \ + vstream(ref(F)(0)(0),result_00); \ + vstream(ref(F)(0)(1),result_01); \ + vstream(ref(F)(0)(2),result_02); \ + vstream(ref(F)(1)(0),result_10); \ + vstream(ref(F)(1)(1),result_11); \ + vstream(ref(F)(1)(2),result_12); \ + vstream(ref(F)(2)(0),result_20); \ + vstream(ref(F)(2)(1),result_21); \ + vstream(ref(F)(2)(2),result_22); \ + vstream(ref(F)(3)(0),result_30); \ + vstream(ref(F)(3)(1),result_31); \ + vstream(ref(F)(3)(2),result_32); \ } -#define HAND_RESULT_EXT(ss) \ +#define HAND_RESULT_EXT(ss,F) \ if (nmu){ \ SiteSpinor & ref (out._odata[ss]); \ - ref()(0)(0)+=result_00; \ - ref()(0)(1)+=result_01; \ - ref()(0)(2)+=result_02; \ - ref()(1)(0)+=result_10; \ - ref()(1)(1)+=result_11; \ - ref()(1)(2)+=result_12; \ - ref()(2)(0)+=result_20; \ - ref()(2)(1)+=result_21; \ - ref()(2)(2)+=result_22; \ - ref()(3)(0)+=result_30; \ - ref()(3)(1)+=result_31; \ - ref()(3)(2)+=result_32; \ + ref(F)(0)(0)+=result_00; \ + ref(F)(0)(1)+=result_01; \ + ref(F)(0)(2)+=result_02; \ + ref(F)(1)(0)+=result_10; \ + ref(F)(1)(1)+=result_11; \ + ref(F)(1)(2)+=result_12; \ + ref(F)(2)(0)+=result_20; \ + ref(F)(2)(1)+=result_21; \ + ref(F)(2)(2)+=result_22; \ + ref(F)(3)(0)+=result_30; \ + ref(F)(3)(1)+=result_31; \ + ref(F)(3)(2)+=result_32; \ } @@ -463,15 +481,18 @@ WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge int offset,local,perm, ptype; StencilEntry *SE; - HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); - HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT(ss); +#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_RESULT(ss,F) + + HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); } template @@ -485,16 +506,19 @@ void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub StencilEntry *SE; int offset,local,perm, ptype; - - HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); - HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT(ss); + +#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_RESULT(ss,F) + + HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); } template void @@ -509,16 +533,20 @@ WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa int offset,local,perm, ptype; StencilEntry *SE; - ZERO_RESULT; - HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT(ss); + +#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ + ZERO_RESULT; \ + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_RESULT(ss,F) + + HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); } template @@ -532,16 +560,20 @@ void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D StencilEntry *SE; int offset,local,perm, ptype; - ZERO_RESULT; - HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT(ss); + +#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ + ZERO_RESULT; \ + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_RESULT(ss,F) + + HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); } template void @@ -557,16 +589,20 @@ WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa int offset,local,perm, ptype; StencilEntry *SE; int nmu=0; - ZERO_RESULT; - HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); - HAND_RESULT_EXT(ss); + +#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ + ZERO_RESULT; \ + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_RESULT_EXT(ss,F) + + HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); } template @@ -581,16 +617,20 @@ void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D StencilEntry *SE; int offset,local,perm, ptype; int nmu=0; - ZERO_RESULT; - HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); - HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); - HAND_RESULT_EXT(ss); + +#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ + ZERO_RESULT; \ + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ + HAND_RESULT_EXT(ss,F) + + HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); } //////////////////////////////////////////////// @@ -647,10 +687,130 @@ void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D FermionField &out){ assert(0); } \ HAND_SPECIALISE_EMPTY(GparityWilsonImplF); - HAND_SPECIALISE_EMPTY(GparityWilsonImplD); + //HAND_SPECIALISE_EMPTY(GparityWilsonImplD); HAND_SPECIALISE_EMPTY(GparityWilsonImplFH); HAND_SPECIALISE_EMPTY(GparityWilsonImplDF); + + + + +template<> void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef GparityWilsonImplD Impl; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); + HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); +} + +template<> +void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef GparityWilsonImplD Impl; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); + HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); +} + +template<> void +WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef GparityWilsonImplD Impl; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); + HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); +} + +template<> +void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef GparityWilsonImplD Impl; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); + HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); +} + +template<> void +WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef GparityWilsonImplD Impl; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + int nmu=0; + HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); + nmu = 0; + HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); +} + +template<> +void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef GparityWilsonImplD Impl; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + int nmu=0; + HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); + nmu = 0; + HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); +} + + + + + + + + + + + + + + ////////////// Wilson ; uses this implementation ///////////////////// #define INSTANTIATE_THEM(A) \ diff --git a/tests/core/Test_gparity.cc b/tests/core/Test_gparity.cc index cfb5d2c3..81091e9e 100644 --- a/tests/core/Test_gparity.cc +++ b/tests/core/Test_gparity.cc @@ -33,22 +33,68 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; -typedef typename GparityDomainWallFermionR::FermionField FermionField; +//typedef GparityDomainWallFermionD GparityDiracOp; +//typedef DomainWallFermionD StandardDiracOp; +//#define DOP_PARAMS +typedef GparityMobiusFermionD GparityDiracOp; +typedef MobiusFermionD StandardDiracOp; +#define DOP_PARAMS ,1.5, 0.5 + + +typedef typename GparityDiracOp::FermionField GparityFermionField; +typedef typename GparityDiracOp::GaugeField GparityGaugeField; +typedef typename GparityFermionField::vector_type vComplexType; + +typedef typename StandardDiracOp::FermionField StandardFermionField; +typedef typename StandardDiracOp::GaugeField StandardGaugeField; + +enum{ same_vComplex = std::is_same::value }; +static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIMD complex type"); int main (int argc, char ** argv) { - const int nu = 3; + int nu = 0; Grid_init(&argc,&argv); + for(int i=1;i> nu; + std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl; + } + } + + std::cout << GridLogMessage<< "*****************************************************************" < latt_2f(Nd,L); - std::vector latt_1f(Nd,L); latt_1f[nu] = 2*L; + //const int L =4; + //std::vector latt_2f(Nd,L); - std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector latt_2f = GridDefaultLatt(); + std::vector latt_1f(latt_2f); latt_1f[nu] = 2*latt_2f[nu]; + int L = latt_2f[nu]; + + + std::vector simd_layout = GridDefaultSimd(Nd,vComplexType::Nsimd()); + + std::cout << GridLogMessage << "SIMD layout: "; + for(int i=0;i mpi_layout = GridDefaultMpi(); //node layout GridCartesian * UGrid_1f = SpaceTimeGrid::makeFourDimGrid(latt_1f, simd_layout, mpi_layout); @@ -67,13 +113,13 @@ int main (int argc, char ** argv) GridParallelRNG RNG5_2f(FGrid_2f); RNG5_2f.SeedFixedIntegers(seeds5); GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4); - LatticeGaugeField Umu_2f(UGrid_2f); + GparityGaugeField Umu_2f(UGrid_2f); SU3::HotConfiguration(RNG4_2f,Umu_2f); - LatticeFermion src (FGrid_2f); - LatticeFermion tmpsrc(FGrid_2f); - FermionField src_2f(FGrid_2f); - LatticeFermion src_1f(FGrid_1f); + StandardFermionField src (FGrid_2f); + StandardFermionField tmpsrc(FGrid_2f); + GparityFermionField src_2f(FGrid_2f); + StandardFermionField src_1f(FGrid_1f); // Replicate fermion source random(RNG5_2f,src); @@ -81,8 +127,8 @@ int main (int argc, char ** argv) tmpsrc=src*2.0; PokeIndex<0>(src_2f,tmpsrc,1); - LatticeFermion result_1f(FGrid_1f); result_1f=zero; - LatticeGaugeField Umu_1f(UGrid_1f); + StandardFermionField result_1f(FGrid_1f); result_1f=zero; + StandardGaugeField Umu_1f(UGrid_1f); Replicate(Umu_2f,Umu_1f); //Coordinate grid for reference @@ -92,7 +138,7 @@ int main (int argc, char ** argv) //Copy-conjugate the gauge field //First C-shift the lattice by Lx/2 { - LatticeGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L) ); + StandardGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L) ); Umu_1f = where( xcoor_1f >= Integer(L), Umu_shift, Umu_1f ); // hack test to check the same @@ -101,7 +147,7 @@ int main (int argc, char ** argv) cout << GridLogMessage << "Umu diff " << norm2(Umu_shift)<(Umu_1f,nu)) Unu(UGrid_1f); Unu = PeekIndex(Umu_1f,nu); Unu = where(xcoor_1f == Integer(2*L-1), -Unu, Unu); PokeIndex(Umu_1f,Unu,nu); @@ -115,33 +161,33 @@ int main (int argc, char ** argv) RealD mass=0.0; RealD M5=1.8; - DomainWallFermionR Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5); + StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS); - LatticeFermion src_o_1f(FrbGrid_1f); - LatticeFermion result_o_1f(FrbGrid_1f); + StandardFermionField src_o_1f(FrbGrid_1f); + StandardFermionField result_o_1f(FrbGrid_1f); pickCheckerboard(Odd,src_o_1f,src_1f); result_o_1f=zero; - SchurDiagMooeeOperator HermOpEO(Ddwf); - ConjugateGradient CG(1.0e-8,10000); + SchurDiagMooeeOperator HermOpEO(Ddwf); + ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o_1f,result_o_1f); // const int nu = 3; std::vector twists(Nd,0); twists[nu] = 1; - GparityDomainWallFermionR::ImplParams params; + GparityDiracOp::ImplParams params; params.twists = twists; - GparityDomainWallFermionR GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5,params); + GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params); for(int disp=-1;disp<=1;disp+=2) for(int mu=0;mu<5;mu++) { - FermionField Dsrc_2f(FGrid_2f); + GparityFermionField Dsrc_2f(FGrid_2f); - LatticeFermion Dsrc_1f(FGrid_1f); - LatticeFermion Dsrc_2freplica(FGrid_1f); - LatticeFermion Dsrc_2freplica0(FGrid_1f); - LatticeFermion Dsrc_2freplica1(FGrid_1f); + StandardFermionField Dsrc_1f(FGrid_1f); + StandardFermionField Dsrc_2freplica(FGrid_1f); + StandardFermionField Dsrc_2freplica0(FGrid_1f); + StandardFermionField Dsrc_2freplica1(FGrid_1f); if ( mu ==0 ) { std::cout << GridLogMessage<< " Cross checking entire hopping term"<(Dsrc_2f,0); - LatticeFermion Dsrc_2f1(FGrid_2f); Dsrc_2f1 = PeekIndex<0>(Dsrc_2f,1); + StandardFermionField Dsrc_2f0(FGrid_2f); Dsrc_2f0 = PeekIndex<0>(Dsrc_2f,0); + StandardFermionField Dsrc_2f1(FGrid_2f); Dsrc_2f1 = PeekIndex<0>(Dsrc_2f,1); // Dsrc_2f1 = Dsrc_2f1 - Dsrc_2f0; // std::cout << GridLogMessage << " Cross check two halves " < CG2f(1.0e-8,10000); - SchurDiagMooeeOperator HermOpEO2f(GPDdwf); + ConjugateGradient CG2f(1.0e-8,10000); + SchurDiagMooeeOperator HermOpEO2f(GPDdwf); CG2f(HermOpEO2f,src_o_2f,result_o_2f); std::cout << "2f cb "< Date: Tue, 22 Aug 2017 18:12:12 -0400 Subject: [PATCH 04/14] Replaced slow unpack-repack in G-parity BC twist with intrinsics version --- lib/qcd/action/fermion/FermionOperatorImpl.h | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 1e344521..5300063b 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -565,6 +565,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl void GparityTwistPermute(SiteSpinorType &into, const SiteSpinorType &from, const int direction, const int distance, const int perm, GridBase* grid){ +#if 0 typedef typename SiteSpinorType::scalar_object sobj; sobj stmp; std::vector vals(grid->Nsimd()); @@ -589,6 +590,31 @@ class GparityWilsonImpl : public ConjugateGaugeImplPermuteType(direction); + typedef typename SiteSpinorType::vector_type vtype; + vtype tmp1, tmp2, tmp3, tmp4; + + for(int s=0;s 1h 1l + exchange(tmp2,tmp3, from(0)(s)(c), tmp1, permute_type); // 0l 0h , 1h 1l -> 0l 1h 0h,1l + permute(tmp4, tmp3, permute_type); //0h,1l -> 1l,0h + + if( (distance == 1 && !perm) || (distance == -1 && perm) ){ + //Pulled fermion through forwards face, GPBC on upper component + //Need 0= 0l 1h 1= 1l 0h + into(0)(s)(c) = tmp2; + into(1)(s)(c) = tmp4; + }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){ + //Pulled fermion through backwards face, GPBC on lower component + //Need 0= 1l 0h 1= 0l 1h + into(0)(s)(c) = tmp4; + into(1)(s)(c) = tmp2; + }else assert(0); + } + } +#endif } From b61835c1a53ab6077840aabdb3ef8e77ea161008 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 23 Aug 2017 12:33:48 -0400 Subject: [PATCH 05/14] Added inplace version of intrinsic G-parity twist to hand-unrolled kernel --- lib/qcd/action/fermion/WilsonKernelsHand.cc | 90 ++++++++++++++++++++- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 866e30d2..045a2cda 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -47,9 +47,6 @@ Author: paboyle #define LOAD_CHIMU(DIR,F) \ { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); } -#define LOAD_CHIMU_GPARITY(DIR,F) \ - { int g; const SiteSpinor & ref = GparityGetChi(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); } - #define LOAD_CHI_BODY(F) \ Chi_00 = ref(F)(0)(0);\ Chi_01 = ref(F)(0)(1);\ @@ -61,10 +58,95 @@ Author: paboyle #define LOAD_CHI(DIR,F) \ {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); } -#define LOAD_CHI_GPARITY(DIR,F) \ + +//G-parity implementations using implementation method +#define LOAD_CHIMU_GPARITY_IMPL(DIR,F) \ + { int g; const SiteSpinor & ref = GparityGetChi(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); } + +#define LOAD_CHI_GPARITY_IMPL(DIR,F) \ { int g; const SiteHalfSpinor &ref = GparityGetChi(g,buf,DIR,F,SE,st); LOAD_CHI_BODY(g); } +//G-parity implementations using in-place intrinsic ops + +//1l 1h -> 1h 1l +//0l 0h , 1h 1l -> 0l 1h 0h,1l +//0h,1l -> 1l,0h +//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) ) +//Pulled fermion through forwards face, GPBC on upper component +//Need 0= 0l 1h 1= 1l 0h +//else if( (distance == -1 && !perm) || (distance == 1 && perm) ) +//Pulled fermion through backwards face, GPBC on lower component +//Need 0= 1l 0h 1= 0l 1h +#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3, tmp4) \ + permute(tmp1, ref(1)(S)(C), permute_type); \ + exchange(tmp2,tmp3, ref(0)(S)(C), tmp1, permute_type); \ + permute(tmp4, tmp3, permute_type); \ + if( (distance == 1 && !perm) || (distance == -1 && perm) ){ \ + INTO = F == 0 ? tmp2 : tmp4; \ + }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){ \ + INTO = F == 0 ? tmp4 : tmp2; \ + } + +#define LOAD_CHI_SETUP(DIR,F) \ + int g = F; \ + const int direction = st._directions[DIR]; \ + const int distance = st._distances[DIR]; \ + const int sl = st._grid->_simd_layout[direction]; \ + int inplace_twist = 0; \ + if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \ + if(sl == 1){ \ + g = (F+1) % 2; \ + }else{ \ + inplace_twist = 1; \ + } \ + } + +#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F) \ + { const SiteSpinor &ref(in._odata[offset]); \ + LOAD_CHI_SETUP(DIR,F); \ + if(!inplace_twist){ \ + LOAD_CHIMU_BODY(g); \ + }else{ \ + const int permute_type = st._grid->PermuteType(direction); \ + DO_TWIST(Chimu_00,0,0,F, U_00,U_01,U_10,U_11); \ + DO_TWIST(Chimu_01,0,1,F, U_20,U_21,U_00,U_01); \ + DO_TWIST(Chimu_02,0,2,F, U_10,U_11,U_20,U_21); \ + DO_TWIST(Chimu_10,1,0,F, U_00,U_01,U_10,U_11); \ + DO_TWIST(Chimu_11,1,1,F, U_20,U_21,U_00,U_01); \ + DO_TWIST(Chimu_12,1,2,F, U_10,U_11,U_20,U_21); \ + DO_TWIST(Chimu_20,2,0,F, U_00,U_01,U_10,U_11); \ + DO_TWIST(Chimu_21,2,1,F, U_20,U_21,U_00,U_01); \ + DO_TWIST(Chimu_22,2,2,F, U_10,U_11,U_20,U_21); \ + DO_TWIST(Chimu_30,3,0,F, U_00,U_01,U_10,U_11); \ + DO_TWIST(Chimu_31,3,1,F, U_20,U_21,U_00,U_01); \ + DO_TWIST(Chimu_32,3,2,F, U_10,U_11,U_20,U_21); \ + } \ + } + + +#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F) \ + { const SiteHalfSpinor &ref(buf[offset]); \ + LOAD_CHI_SETUP(DIR,F); \ + if(!inplace_twist){ \ + LOAD_CHI_BODY(g); \ + }else{ \ + const int permute_type = st._grid->PermuteType(direction); \ + DO_TWIST(Chi_00,0,0,F, U_00,U_01,U_10,U_11); \ + DO_TWIST(Chi_01,0,1,F, U_20,U_21,UChi_00,UChi_01); \ + DO_TWIST(Chi_02,0,2,F, UChi_02,UChi_10,UChi_11,UChi_12); \ + DO_TWIST(Chi_10,1,0,F, U_00,U_01,U_10,U_11); \ + DO_TWIST(Chi_11,1,1,F, U_20,U_21,UChi_00,UChi_01); \ + DO_TWIST(Chi_12,1,2,F, UChi_02,UChi_10,UChi_11,UChi_12); \ + } \ + } + +//#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_IMPL(DIR,F) +#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F) + +//#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_IMPL(DIR,F) +#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F) + // To splat or not to splat depends on the implementation #define MULT_2SPIN_BODY \ Impl::loadLinkElement(U_00,ref()(0,0)); \ From 46f88e6d726c238b84a9b05894f7157cb6f5c2a3 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 23 Aug 2017 13:21:10 -0400 Subject: [PATCH 06/14] G-parity hand-unrolled intrinsics twist now uses one less permute and one less temporary --- lib/qcd/action/fermion/WilsonKernelsHand.cc | 66 +++++++++++++-------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 045a2cda..6e03379e 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -78,16 +78,30 @@ Author: paboyle //else if( (distance == -1 && !perm) || (distance == 1 && perm) ) //Pulled fermion through backwards face, GPBC on lower component //Need 0= 1l 0h 1= 0l 1h -#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3, tmp4) \ + +//1l 1h -> 1h 1l +//0l 0h , 1h 1l -> 0l 1h 0h,1l +#define DO_TWIST_0L_1H(INTO,S,C,F, tmp1, tmp2, tmp3) \ permute(tmp1, ref(1)(S)(C), permute_type); \ exchange(tmp2,tmp3, ref(0)(S)(C), tmp1, permute_type); \ - permute(tmp4, tmp3, permute_type); \ - if( (distance == 1 && !perm) || (distance == -1 && perm) ){ \ - INTO = F == 0 ? tmp2 : tmp4; \ - }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){ \ - INTO = F == 0 ? tmp4 : tmp2; \ + INTO = tmp2; + +//0l 0h -> 0h 0l +//1l 1h, 0h 0l -> 1l 0h, 1h 0l +#define DO_TWIST_1L_0H(INTO,S,C,F, tmp1, tmp2, tmp3) \ + permute(tmp1, ref(0)(S)(C), permute_type); \ + exchange(tmp2,tmp3, ref(1)(S)(C), tmp1, permute_type); \ + INTO = tmp2; + +#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3) \ + if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \ + ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \ + DO_TWIST_0L_1H(INTO,S,C,F,tmp1,tmp2,tmp3); \ + }else{ \ + DO_TWIST_1L_0H(INTO,S,C,F,tmp1,tmp2,tmp3); \ } + #define LOAD_CHI_SETUP(DIR,F) \ int g = F; \ const int direction = st._directions[DIR]; \ @@ -109,18 +123,18 @@ Author: paboyle LOAD_CHIMU_BODY(g); \ }else{ \ const int permute_type = st._grid->PermuteType(direction); \ - DO_TWIST(Chimu_00,0,0,F, U_00,U_01,U_10,U_11); \ - DO_TWIST(Chimu_01,0,1,F, U_20,U_21,U_00,U_01); \ - DO_TWIST(Chimu_02,0,2,F, U_10,U_11,U_20,U_21); \ - DO_TWIST(Chimu_10,1,0,F, U_00,U_01,U_10,U_11); \ - DO_TWIST(Chimu_11,1,1,F, U_20,U_21,U_00,U_01); \ - DO_TWIST(Chimu_12,1,2,F, U_10,U_11,U_20,U_21); \ - DO_TWIST(Chimu_20,2,0,F, U_00,U_01,U_10,U_11); \ - DO_TWIST(Chimu_21,2,1,F, U_20,U_21,U_00,U_01); \ - DO_TWIST(Chimu_22,2,2,F, U_10,U_11,U_20,U_21); \ - DO_TWIST(Chimu_30,3,0,F, U_00,U_01,U_10,U_11); \ - DO_TWIST(Chimu_31,3,1,F, U_20,U_21,U_00,U_01); \ - DO_TWIST(Chimu_32,3,2,F, U_10,U_11,U_20,U_21); \ + DO_TWIST(Chimu_00,0,0,F, U_00,U_01,U_10); \ + DO_TWIST(Chimu_01,0,1,F, U_11,U_20,U_21); \ + DO_TWIST(Chimu_02,0,2,F, U_00,U_01,U_10); \ + DO_TWIST(Chimu_10,1,0,F, U_11,U_20,U_21); \ + DO_TWIST(Chimu_11,1,1,F, U_00,U_01,U_10); \ + DO_TWIST(Chimu_12,1,2,F, U_11,U_20,U_21); \ + DO_TWIST(Chimu_20,2,0,F, U_00,U_01,U_10); \ + DO_TWIST(Chimu_21,2,1,F, U_11,U_20,U_21); \ + DO_TWIST(Chimu_22,2,2,F, U_00,U_01,U_10); \ + DO_TWIST(Chimu_30,3,0,F, U_11,U_20,U_21); \ + DO_TWIST(Chimu_31,3,1,F, U_00,U_01,U_10); \ + DO_TWIST(Chimu_32,3,2,F, U_11,U_20,U_21); \ } \ } @@ -132,15 +146,19 @@ Author: paboyle LOAD_CHI_BODY(g); \ }else{ \ const int permute_type = st._grid->PermuteType(direction); \ - DO_TWIST(Chi_00,0,0,F, U_00,U_01,U_10,U_11); \ - DO_TWIST(Chi_01,0,1,F, U_20,U_21,UChi_00,UChi_01); \ - DO_TWIST(Chi_02,0,2,F, UChi_02,UChi_10,UChi_11,UChi_12); \ - DO_TWIST(Chi_10,1,0,F, U_00,U_01,U_10,U_11); \ - DO_TWIST(Chi_11,1,1,F, U_20,U_21,UChi_00,UChi_01); \ - DO_TWIST(Chi_12,1,2,F, UChi_02,UChi_10,UChi_11,UChi_12); \ + DO_TWIST(Chi_00,0,0,F, U_00,U_01,U_10); \ + DO_TWIST(Chi_01,0,1,F, U_11,U_20,U_21); \ + DO_TWIST(Chi_02,0,2,F, UChi_00,UChi_01,UChi_02); \ + DO_TWIST(Chi_10,1,0,F, UChi_10,UChi_11,UChi_12); \ + DO_TWIST(Chi_11,1,1,F, U_00,U_01,U_10); \ + DO_TWIST(Chi_12,1,2,F, U_11,U_20,U_21); \ } \ } + + + + //#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_IMPL(DIR,F) #define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F) From a0bb8e5b4660634db2b4a7cf994b2665926b24e9 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 23 Aug 2017 14:44:40 -0400 Subject: [PATCH 07/14] Added hand-unrolled kernel implementations of all the other dslash precision / comms precision combinations with G-parity --- lib/qcd/action/fermion/WilsonKernelsHand.cc | 213 ++++++++++---------- 1 file changed, 103 insertions(+), 110 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 6e03379e..ea04845e 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -786,120 +786,113 @@ void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D const FermionField &in, \ FermionField &out){ assert(0); } \ - HAND_SPECIALISE_EMPTY(GparityWilsonImplF); - //HAND_SPECIALISE_EMPTY(GparityWilsonImplD); - HAND_SPECIALISE_EMPTY(GparityWilsonImplFH); - HAND_SPECIALISE_EMPTY(GparityWilsonImplDF); +#define HAND_SPECIALISE_GPARITY(IMPL) \ + template<> void \ + WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out) \ + { \ + typedef IMPL Impl; \ + typedef typename Simd::scalar_type S; \ + typedef typename Simd::vector_type V; \ + \ + HAND_DECLARATIONS(ignore); \ + \ + int offset,local,perm, ptype; \ + StencilEntry *SE; \ + HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + } \ + \ + template<> \ + void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out) \ + { \ + typedef IMPL Impl; \ + typedef typename Simd::scalar_type S; \ + typedef typename Simd::vector_type V; \ + \ + HAND_DECLARATIONS(ignore); \ + \ + StencilEntry *SE; \ + int offset,local,perm, ptype; \ + HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + } \ + \ + template<> void \ + WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out) \ + { \ + typedef IMPL Impl; \ + typedef typename Simd::scalar_type S; \ + typedef typename Simd::vector_type V; \ + \ + HAND_DECLARATIONS(ignore); \ + \ + int offset,local,perm, ptype; \ + StencilEntry *SE; \ + HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + } \ + \ + template<> \ + void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out) \ + { \ + typedef IMPL Impl; \ + typedef typename Simd::scalar_type S; \ + typedef typename Simd::vector_type V; \ + \ + HAND_DECLARATIONS(ignore); \ + \ + StencilEntry *SE; \ + int offset,local,perm, ptype; \ + HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + } \ + \ + template<> void \ + WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out) \ + { \ + typedef IMPL Impl; \ + typedef typename Simd::scalar_type S; \ + typedef typename Simd::vector_type V; \ + \ + HAND_DECLARATIONS(ignore); \ + \ + int offset,local,perm, ptype; \ + StencilEntry *SE; \ + int nmu=0; \ + HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + nmu = 0; \ + HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + } \ + template<> \ + void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out) \ + { \ + typedef IMPL Impl; \ + typedef typename Simd::scalar_type S; \ + typedef typename Simd::vector_type V; \ + \ + HAND_DECLARATIONS(ignore); \ + \ + StencilEntry *SE; \ + int offset,local,perm, ptype; \ + int nmu=0; \ + HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + nmu = 0; \ + HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ + } - -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef GparityWilsonImplD Impl; - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); - HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); -} - -template<> -void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) -{ - typedef GparityWilsonImplD Impl; - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); - HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); -} - -template<> void -WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef GparityWilsonImplD Impl; - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); - HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); -} - -template<> -void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) -{ - typedef GparityWilsonImplD Impl; - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); - HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); -} - -template<> void -WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) -{ -// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - typedef GparityWilsonImplD Impl; - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - int offset,local,perm, ptype; - StencilEntry *SE; - int nmu=0; - HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); - nmu = 0; - HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); -} - -template<> -void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) -{ - typedef GparityWilsonImplD Impl; - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); - - StencilEntry *SE; - int offset,local,perm, ptype; - int nmu=0; - HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); - nmu = 0; - HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); -} - - - +HAND_SPECIALISE_GPARITY(GparityWilsonImplF); +HAND_SPECIALISE_GPARITY(GparityWilsonImplD); +HAND_SPECIALISE_GPARITY(GparityWilsonImplFH); +HAND_SPECIALISE_GPARITY(GparityWilsonImplDF); From ce5df177eeccee214f3f394bbdec385f7c19caeb Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 23 Aug 2017 15:05:22 -0400 Subject: [PATCH 08/14] Removed superfluous implementation of G-parity twist for hand-unrolled kernel from GparityWilsonImpl --- lib/qcd/action/fermion/FermionOperatorImpl.h | 85 +------------------- lib/qcd/action/fermion/WilsonKernelsHand.cc | 14 ---- 2 files changed, 1 insertion(+), 98 deletions(-) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 5300063b..ffb82989 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -478,10 +478,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl > tmp_full; - std::vector > tmp_half; - - GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p), tmp_full(GridThread::GetThreads()), tmp_half(GridThread::GetThreads()){}; + GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; @@ -563,86 +560,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl - void GparityTwistPermute(SiteSpinorType &into, const SiteSpinorType &from, const int direction, const int distance, const int perm, GridBase* grid){ -#if 0 - typedef typename SiteSpinorType::scalar_object sobj; - sobj stmp; - std::vector vals(grid->Nsimd()); - extract(from,vals); - std::vector icoor; - for(int s=0;sNsimd();s++){ - grid->iCoorFromIindex(icoor,s); - assert((icoor[direction]==0)||(icoor[direction]==1)); - - int permute_lane; - if ( distance == 1) { - permute_lane = icoor[direction]?1:0; - } else { - permute_lane = icoor[direction]?0:1; - } - if(perm) permute_lane = !permute_lane; - - if ( permute_lane ) { - stmp(0) = vals[s](1); - stmp(1) = vals[s](0); - vals[s] = stmp; - } - } - merge(into,vals); -#else - int permute_type = grid->PermuteType(direction); - typedef typename SiteSpinorType::vector_type vtype; - vtype tmp1, tmp2, tmp3, tmp4; - - for(int s=0;s 1h 1l - exchange(tmp2,tmp3, from(0)(s)(c), tmp1, permute_type); // 0l 0h , 1h 1l -> 0l 1h 0h,1l - permute(tmp4, tmp3, permute_type); //0h,1l -> 1l,0h - - if( (distance == 1 && !perm) || (distance == -1 && perm) ){ - //Pulled fermion through forwards face, GPBC on upper component - //Need 0= 0l 1h 1= 1l 0h - into(0)(s)(c) = tmp2; - into(1)(s)(c) = tmp4; - }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){ - //Pulled fermion through backwards face, GPBC on lower component - //Need 0= 1l 0h 1= 0l 1h - into(0)(s)(c) = tmp4; - into(1)(s)(c) = tmp2; - }else assert(0); - } - } -#endif - } - - - template - const SiteSpinorType & GparityGetChi(int &g, SiteSpinorType const* in, const int dir, const int f, StencilEntry *SE, StencilImpl &st){ - const int mmu = dir % 4; - const int direction = st._directions[dir]; - const int sl = st._grid->_simd_layout[direction]; - const int perm = SE->_permute; - g = f; - - if(SE->_around_the_world && Params.twists[mmu]){ - if(sl == 1){ //not SIMD vectorized in G-parity direction so just change the flavor index accessed to implement the twist - g = (f+1) % 2; - return in[SE->_offset]; - }else{ //SIMD vectorized in Gparity direction - const int me = omp_get_thread_num(); - const int distance = st._distances[dir]; - assert(distance == -1 || distance == 1); - SiteSpinorType &tmp = GparityWilsonImpl_helper::getAB::ref(tmp_full[me], tmp_half[me]); - GparityTwistPermute(tmp, in[SE->_offset], direction, distance, perm, st._grid); - return tmp; - } - }else return in[SE->_offset]; - } - - - inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) { conformable(Uds._grid,GaugeGrid); diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index ea04845e..e1243304 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -59,14 +59,6 @@ Author: paboyle {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); } -//G-parity implementations using implementation method -#define LOAD_CHIMU_GPARITY_IMPL(DIR,F) \ - { int g; const SiteSpinor & ref = GparityGetChi(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); } - -#define LOAD_CHI_GPARITY_IMPL(DIR,F) \ - { int g; const SiteHalfSpinor &ref = GparityGetChi(g,buf,DIR,F,SE,st); LOAD_CHI_BODY(g); } - - //G-parity implementations using in-place intrinsic ops //1l 1h -> 1h 1l @@ -156,13 +148,7 @@ Author: paboyle } - - - -//#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_IMPL(DIR,F) #define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F) - -//#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_IMPL(DIR,F) #define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F) // To splat or not to splat depends on the implementation From edabb3577ff13df048fccb8e9fb17080531e32df Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 23 Aug 2017 16:54:06 -0400 Subject: [PATCH 09/14] Imported Benchmark_gparity --- benchmarks/Benchmark_gparity.cc | 190 ++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 benchmarks/Benchmark_gparity.cc diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc new file mode 100644 index 00000000..f6036aa8 --- /dev/null +++ b/benchmarks/Benchmark_gparity.cc @@ -0,0 +1,190 @@ +#include +#include +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF; +typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD; + + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls=16; + for(int i=0;i> Ls; + } + + + int threads = GridThread::GetThreads(); + std::cout<_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.ZeroCounters(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;muBarrier(); + DwH.ZeroCounters(); + DwH.Dhop(src,result,0); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;muBarrier(); + DwD.ZeroCounters(); + DwD.Dhop(src_d,result_d,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu latt4 = GridDefaultLatt(); - const int Ls=16; + int Ls=16; + for(int i=0;i> Ls; + } + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); From 74af885d4eda81453b1eb83b062a11810011d5f5 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 29 Aug 2017 09:50:37 -0400 Subject: [PATCH 13/14] Removed some no-longer-needed associated with G-parity hand unrolled kernel --- lib/qcd/action/fermion/FermionOperatorImpl.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index ffb82989..9d24deb2 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -425,22 +425,6 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? //////////////////////////////////////////////////////////////////////////////////////// -namespace GparityWilsonImpl_helper{ - template - struct getAB; - - template - struct getAB{ - static inline A & ref(A &a, B &b){ return a; } - }; - template - struct getAB{ - static inline B & ref(A &a, B &b){ return b; } - }; -}; - - - template class GparityWilsonImpl : public ConjugateGaugeImpl > { public: From 59bd1fe21b3fc81b8c50e9caa53f2adc65b3d7b5 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 29 Aug 2017 13:07:37 -0700 Subject: [PATCH 14/14] Fix for 'perm' and 'local' not being set for hand-unrolled external-site Dslash, which caused incorrect behavior of G-parity kernel --- lib/qcd/action/fermion/WilsonKernelsHand.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index a0f5ffec..80b81714 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -468,6 +468,8 @@ Author: paboyle #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ LOAD_CHI_IMPL(DIR,F,PERM); \ MULT_2SPIN_IMPL(DIR,F); \