diff --git a/.gitignore b/.gitignore index 38e3da2d..da7de5e4 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ config.status .deps Make.inc eigen.inc +Eigen.inc # http://www.gnu.org/software/autoconf # ######################################## diff --git a/lib/Threads.h b/lib/Threads.h index 2f270b73..7cb0f3fe 100644 --- a/lib/Threads.h +++ b/lib/Threads.h @@ -44,8 +44,9 @@ Author: paboyle #define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)") #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)") #endif -#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") -#define PARALLEL_REGION _Pragma("omp parallel") +#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") +#define PARALLEL_REGION _Pragma("omp parallel") +#define PARALLEL_FOR_LOOP_STATIC _Pragma("omp parallel for schedule(static)") #else #define PARALLEL_FOR_LOOP #define PARALLEL_FOR_LOOP_INTERN diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index dfc0dd04..5ba5cd5d 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -62,9 +62,10 @@ class ConjugateGradient : public OperatorFunction { Integer MaxIterations; bool ReproTest; CG_state CGState;//to check reproducibility by repeating the CG + ReproducibilityState ReprTest; ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true, - bool ReproducibilityTest = false) + bool ReproducibilityTest = false) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv), @@ -84,7 +85,7 @@ class ConjugateGradient : public OperatorFunction { Field psi_start(psi);// save for the repro test if (CGState.do_repro) - std::cout << GridLogMessage << "Starting reproducibility test" << std::endl; + std::cout << GridLogMessage << "Starting reproducibility test" << std::endl; // Initial residual computation & set up RealD guess = norm2(psi); @@ -93,6 +94,10 @@ class ConjugateGradient : public OperatorFunction { Linop.HermOpAndNorm(psi, mmp, d, b); + if(!ReprTest.do_check) + ReprTest.reset(); + ReprTest.enable_reprocheck=ReproTest; + r = src - mmp; p = r; @@ -146,21 +151,22 @@ class ConjugateGradient : public OperatorFunction { b_pred = a * (a * qq - d) / c;// a check - cp = axpy_norm(r, -a, mmp, r);// new residual r = r_old - a * Ap + axpy(r, -a, mmp, r);// new residual r = r_old - a * Ap + cp = norm2(r, ReprTest);// if (ReproTest && !CGState.do_repro) { - CGState.residuals.push_back(cp); // save residuals state - std::cout << GridLogIterative << "ReproTest: Saving state" << std::endl; - } + CGState.residuals.push_back(cp); // save residuals state + std::cout << GridLogIterative << "ReproTest: Saving state" << std::endl; + } if (ReproTest && CGState.do_repro){ - // check that the residual agrees with the previous run - std::cout << GridLogIterative << "ReproTest: Checking state k=" << k << std::endl; - if (cp != CGState.residuals[k-1]){ - std::cout << GridLogMessage << "Failing reproducibility test"; - std::cout << GridLogMessage << " at k=" << k << std::endl; - std::cout << GridLogMessage << "saved residual = " << CGState.residuals[k-1] - << " cp = " << cp << std::endl; - exit(-1); - } + // check that the residual agrees with the previous run + std::cout << GridLogIterative << "ReproTest: Checking state k=" << k << std::endl; + if (cp != CGState.residuals[k-1]){ + std::cout << GridLogMessage << "Failing reproducibility test"; + std::cout << GridLogMessage << " at k=" << k << std::endl; + std::cout << GridLogMessage << "saved residual = " << CGState.residuals[k-1] + << " cp = " << cp << std::endl; + exit(-1); + } } b = cp / c; @@ -197,13 +203,16 @@ class ConjugateGradient : public OperatorFunction { if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); - if (!CGState.do_repro && ReproTest){ - CGState.do_repro = true; - this->operator()(Linop, src, psi_start);// run the repro test - } + if (!CGState.do_repro && ReproTest){ + CGState.do_repro = true; + ReprTest.do_check = true; + ReprTest.reset_counter(); + this->operator()(Linop, src, psi_start);// run the repro test + } - // Clear state - CGState.reset(); + // Clear state + CGState.reset(); + ReprTest.reset(); return; } } diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 88603407..8fc5549b 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -34,21 +34,24 @@ namespace Grid { template struct ReproducibilityState { - int n_call; + typedef typename T::vector_type vector_type; + unsigned int n_call; bool do_check; - std::vector > > th_states; + bool enable_reprocheck; + std::vector > > + th_states; - void reset(){ + void reset() { th_states.clear(); do_check = false; + enable_reprocheck = false; n_call = 0; } - ReproducibilityState(){ - reset(); - } -}; + void reset_counter() { n_call = 0; } + ReproducibilityState() { reset(); } +}; #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " @@ -58,15 +61,27 @@ struct ReproducibilityState { // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); - return std::real(nrm); + ReproducibilityState repr; + ComplexD nrm = innerProduct(arg, arg, repr); + return std::real(nrm); } - - + template + inline RealD norm2(const Lattice &arg, ReproducibilityState& rep) { + ComplexD nrm = innerProduct(arg, arg, rep); + return std::real(nrm); + } template - inline ComplexD innerProduct(const Lattice &left,const Lattice &right) + inline ComplexD innerProduct(const Lattice &left,const Lattice &right){ + ReproducibilityState repr; + return innerProduct(left, right, repr); + } + + + + template + inline ComplexD innerProduct(const Lattice &left,const Lattice &right, ReproducibilityState& repr) { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -81,30 +96,45 @@ struct ReproducibilityState { // accumulation done in the same precision ad vobj... // may need to froce higher precision - PARALLEL_FOR_LOOP + PARALLEL_FOR_LOOP_STATIC //request statically scheduled threads for reproducibility for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; - GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - - decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProduct(left._odata[0],right._odata[0])) vnrm = zero; // private to thread; sub summation for(int ss=myoff;ssThisRank() << std::endl; - exit(1); + /////////////////////// Reproducibility section + if (repr.enable_reprocheck) { + if (repr.do_check) { + //std::cout << GridLogMessage << "Checking thread state for inner product. Call n. " << repr.n_call << std::endl; + for (int thread = 0; thread < sumarray.size(); thread++) { + if (sumarray[thread] != repr.th_states[repr.n_call][thread]) { + std::cout << GridLogMessage << "Reproducibility failure on node " << grid->ThisRank() << std::endl; + std::cout << GridLogMessage << "Call: "<< repr.n_call << " Thread: " << thread << std::endl; + std::cout << GridLogMessage << "Size of states: " << repr.th_states.size() << std::endl; + std::cout << GridLogMessage << sumarray[thread] << std::endl; + std::cout << GridLogMessage << repr.th_states[repr.n_call][thread] << std::endl; + //exit(1); + } + } + repr.n_call++; + } else + { + //std::cout << GridLogMessage << "Saving thread state for inner product. Call n. " << repr.n_call << std::endl; + repr.th_states.resize(repr.n_call+1); + repr.th_states[repr.n_call].resize(grid->SumArraySize()); + repr.th_states[repr.n_call] = sumarray; // save threads state + repr.n_call++; } - } - } else { - repr.th_states.push_back(sumarray);//save threads state - repr.n_call +=1; } - */ + //////////////////////////////////////////////////////// + vector_type vvnrm; vvnrm=zero; // sum across threads for(int i=0;iSumArraySize();i++){ @@ -154,13 +184,13 @@ struct ReproducibilityState { PARALLEL_FOR_LOOP for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; - GridThread::GetWork(grid->oSites(),thr,mywork,myoff); + GridThread::GetWork(grid->oSites(),thr,mywork,myoff); - vobj vvsum=zero; + vobj vvsum=zero; for(int ss=myoff;ss