mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-15 06:17:05 +01:00
Report only on failing nodes
This commit is contained in:
@ -81,6 +81,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void set_reproducibility_interval(unsigned int interval){
|
||||||
|
ReprTest.interval = interval;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
|
||||||
@ -95,8 +98,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
Field r(src);
|
Field r(src);
|
||||||
Field psi_start(psi);// save for the repro test
|
Field psi_start(psi);// save for the repro test
|
||||||
|
|
||||||
if (CGState.do_repro)
|
if (CGState.do_repro && ReproTest)
|
||||||
std::cout << GridLogMessage << "Starting reproducibility test" << std::endl;
|
std::cout << GridLogMessage << "Starting reproducibility test, full check every "
|
||||||
|
<< ReprTest.interval << " calls" << std::endl;
|
||||||
|
|
||||||
// Initial residual computation & set up
|
// Initial residual computation & set up
|
||||||
RealD guess = norm2(psi);
|
RealD guess = norm2(psi);
|
||||||
@ -208,7 +212,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
|
|
||||||
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||||
|
|
||||||
if (!CGState.do_repro && ReproTest){
|
if (! (CGState.do_repro && ReproTest)){
|
||||||
CGState.do_repro = true;
|
CGState.do_repro = true;
|
||||||
ReprTest.do_check = true;
|
ReprTest.do_check = true;
|
||||||
ReprTest.reset_counter();
|
ReprTest.reset_counter();
|
||||||
|
@ -41,6 +41,7 @@ class ReproducibilityState {
|
|||||||
bool do_check;
|
bool do_check;
|
||||||
bool enable_reprocheck;
|
bool enable_reprocheck;
|
||||||
bool success;
|
bool success;
|
||||||
|
unsigned int interval;
|
||||||
std::vector<sum_type> th_states;
|
std::vector<sum_type> th_states;
|
||||||
|
|
||||||
void reset_counter() { n_call = 0; }
|
void reset_counter() { n_call = 0; }
|
||||||
@ -52,12 +53,14 @@ class ReproducibilityState {
|
|||||||
n_call = 0;
|
n_call = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
ReproducibilityState() { reset(); }
|
ReproducibilityState():interval(1) {
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
void check(GridBase* grid, sum_type &sumarray){
|
void check(GridBase* grid, sum_type &sumarray){
|
||||||
/////////////////////// Reproducibility section, not threaded on purpose
|
/////////////////////// Reproducibility section, not threaded on purpose
|
||||||
if (enable_reprocheck) {
|
if (enable_reprocheck) {
|
||||||
if (do_check) {
|
if (do_check && (n_call % interval) == 0) {
|
||||||
for (int thread = 0; thread < sumarray.size(); thread++) {
|
for (int thread = 0; thread < sumarray.size(); thread++) {
|
||||||
int words = sizeof(sumarray[thread])/sizeof(unsigned char);
|
int words = sizeof(sumarray[thread])/sizeof(unsigned char);
|
||||||
unsigned char xors[words];
|
unsigned char xors[words];
|
||||||
@ -65,43 +68,45 @@ class ReproducibilityState {
|
|||||||
// OR all words
|
// OR all words
|
||||||
unsigned char res = 0;
|
unsigned char res = 0;
|
||||||
for (int w = 0; w < words; w++) res = res | xors[w];
|
for (int w = 0; w < words; w++) res = res | xors[w];
|
||||||
if ( res ) {
|
|
||||||
std::cout << GridLogMessage << "Reproducibility failure report" << std::endl;
|
|
||||||
|
|
||||||
Grid_unquiesce_nodes();
|
Grid_unquiesce_nodes();
|
||||||
int rank = 0;
|
int rank = 0;
|
||||||
while (rank < grid->_Nprocessors){
|
while (rank < grid->_Nprocessors){
|
||||||
if (rank == grid->ThisRank() ){
|
if (rank == grid->ThisRank() ){
|
||||||
grid->PrintRankInfo();
|
if ( res ) {
|
||||||
std::cout << "Call: "<< n_call << " Thread: " << thread << std::endl;
|
std::cout << "Reproducibility failure report" << std::endl;
|
||||||
std::cout << "Size of states: " << th_states.size() << std::endl;
|
grid->PrintRankInfo();
|
||||||
std::cout << std::setprecision(GRID_REAL_DIGITS+1) << std::scientific;
|
std::cout << "Call: "<< n_call << " Thread: " << thread << std::endl;
|
||||||
std::cout << "Saved partial sum : " << th_states[n_call][thread] << std::endl;
|
std::cout << "Size of states: " << th_states.size() << std::endl;
|
||||||
std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
|
std::cout << std::setprecision(GRID_REAL_DIGITS+1) << std::scientific;
|
||||||
std::cout << "Saved state " << std::endl; show_binaryrep(th_states[n_call][thread]);
|
std::cout << "Saved partial sum : " << th_states[n_call][thread] << std::endl;
|
||||||
std::cout << "Current state" << std::endl; show_binaryrep(sumarray[thread]);
|
std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
|
||||||
std::cout << "XOR result" << std::endl; show_binaryrep(xors, words);
|
std::cout << "Saved state " << std::endl; show_binaryrep(th_states[n_call][thread]);
|
||||||
|
std::cout << "Current state" << std::endl; show_binaryrep(sumarray[thread]);
|
||||||
|
std::cout << "XOR result" << std::endl; show_binaryrep(xors, words);
|
||||||
//std::cout << std::defaultfloat; //not supported by some compilers
|
//std::cout << std::defaultfloat; //not supported by some compilers
|
||||||
std::cout << std::setprecision(6);
|
std::cout << std::setprecision(6);
|
||||||
success = false;
|
success = false;
|
||||||
}
|
|
||||||
rank++;
|
|
||||||
grid->Barrier();
|
|
||||||
}
|
}
|
||||||
Grid_quiesce_nodes();
|
|
||||||
}
|
}
|
||||||
|
rank++;
|
||||||
|
grid->Barrier();
|
||||||
}
|
}
|
||||||
n_call++;
|
Grid_quiesce_nodes();
|
||||||
} else
|
|
||||||
{
|
|
||||||
std::cout << GridLogDebug << "Saving thread state for inner product. Call n. " << n_call << std::endl;
|
|
||||||
th_states.resize(n_call+1);
|
|
||||||
th_states[n_call].resize(grid->SumArraySize());
|
|
||||||
th_states[n_call] = sumarray; // save threads state
|
|
||||||
n_call++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else if (!do_check)
|
||||||
|
{
|
||||||
|
std::cout << GridLogDebug << "Saving thread state for inner product. Call n. "
|
||||||
|
<< n_call << std::endl;
|
||||||
|
th_states.resize(n_call+1);
|
||||||
|
th_states[n_call].resize(grid->SumArraySize());
|
||||||
|
th_states[n_call] = sumarray; // save threads state
|
||||||
|
//n_call++;
|
||||||
|
}
|
||||||
|
n_call++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2,12 +2,12 @@
|
|||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
Source file: ./tests/Test_cayley_cg.cc
|
Source file: ./tests/Test_cayley_cg_reproducibility.cc
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -32,6 +32,9 @@ using namespace std;
|
|||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
using namespace Grid::QCD;
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
#define REPRODUCIBILITY_INTERVAL 1
|
||||||
|
|
||||||
|
|
||||||
template<class d>
|
template<class d>
|
||||||
struct scal {
|
struct scal {
|
||||||
d internal;
|
d internal;
|
||||||
@ -169,6 +172,7 @@ void TestCGunprec(What & Ddwf,
|
|||||||
|
|
||||||
MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
|
MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
||||||
|
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
|
||||||
CG(HermOp,src,result);
|
CG(HermOp,src,result);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -188,6 +192,7 @@ void TestCGprec(What & Ddwf,
|
|||||||
|
|
||||||
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
|
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
||||||
|
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
|
||||||
CG(HermOpEO,src_o,result_o);
|
CG(HermOpEO,src_o,result_o);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -204,6 +209,7 @@ void TestCGschur(What & Ddwf,
|
|||||||
LatticeFermion result(FGrid); result=zero;
|
LatticeFermion result(FGrid); result=zero;
|
||||||
|
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
||||||
|
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
|
||||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
|
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
|
||||||
SchurSolver(Ddwf,src,result);
|
SchurSolver(Ddwf,src,result);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user