From f1908c7bc9f284a43a65a273c87c82d38e5d223d Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Mon, 21 Nov 2016 09:52:07 +0000
Subject: [PATCH] Adding reproducibility tests

---
 lib/algorithms/iterative/ConjugateGradient.h |  23 ++--
 lib/lattice/Lattice_reduction.h              | 107 +++++++++++++------
 2 files changed, 85 insertions(+), 45 deletions(-)
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index 1f9b9b58..dfc0dd04 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -33,17 +33,21 @@ directory
 
 namespace Grid {
 
-struct CG_state{
-	bool do_repro;
-	std::vector<RealD> residuals;
+struct CG_state {
+  bool do_repro;
+  std::vector<RealD> residuals;
 
-	CG_state(){
-		do_repro = false;
-		residuals.clear();}
+  CG_state() {
+    do_repro = false;
+    residuals.clear();
+  }
 
+  void reset(){
+    do_repro = false;
+    residuals.clear();
+  }
 };
 
-
 /////////////////////////////////////////////////////////////
 // Base classes for iterative processes based on operators
 // single input vec, single output vec.
@@ -57,7 +61,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
   RealD Tolerance;
   Integer MaxIterations;
   bool ReproTest;
-  CG_state CGState;
+  CG_state CGState;//to check reproducibility by repeating the CG
 
   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true,
   	bool ReproducibilityTest = false)
@@ -199,8 +203,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
        	}
 
        	// Clear state
-       	CGState.residuals.clear();
-       	CGState.do_repro = false;
+       	CGState.reset();
         return;
       }
     }
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 2615af48..88603407 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -31,6 +31,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define GRID_LATTICE_REDUCTION_H
 
 namespace Grid {
+
+template <class T>
+struct ReproducibilityState {
+  int n_call;
+  bool do_check;
+  std::vector<std::vector<T, alignedAllocator<T> > > th_states;
+
+  void reset(){
+    th_states.clear();
+    do_check = false;
+    n_call = 0;
+  }
+
+  ReproducibilityState(){
+    reset();
+  }
+};
+
+
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
@@ -39,9 +58,12 @@ namespace Grid {
     // Deterministic Reduction operations
     ////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-    ComplexD nrm = innerProduct(arg,arg);
-    return std::real(nrm); 
-  }
+      ComplexD nrm = innerProduct(arg,arg);
+      return std::real(nrm); 
+    }
+
+
+    
 
     template<class vobj>
     inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
@@ -54,24 +76,39 @@ namespace Grid {
 
       std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
       for(int i=0;i<grid->SumArraySize();i++){
-	sumarray[i]=zero;
+        sumarray[i]=zero;
       }
 
-PARALLEL_FOR_LOOP
+      // accumulation done in the same precision ad vobj...
+      // may need to froce higher precision
+      PARALLEL_FOR_LOOP
       for(int thr=0;thr<grid->SumArraySize();thr++){
-	int nwork, mywork, myoff;
+        int nwork, mywork, myoff;
 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
 	
 	decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
         for(int ss=myoff;ss<mywork+myoff; ss++){
-	  vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
+          vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
 	}
 	sumarray[thr]=TensorRemove(vnrm) ;
       }
-    
+
+/*
+      if (repr.do_check){
+        if (sumarray!=repr.th_states[n_call]){
+          std::cout << GridLogMessage << "Reproducibility failure on node " << grid->ThisRank() << std::endl;
+          exit(1);
+        }
+        }
+      } else {
+        repr.th_states.push_back(sumarray);//save threads state
+        repr.n_call +=1;
+      }
+      */
+
       vector_type vvnrm; vvnrm=zero;  // sum across threads
       for(int i=0;i<grid->SumArraySize();i++){
-	vvnrm = vvnrm+sumarray[i];
+        vvnrm = vvnrm+sumarray[i];
       } 
       nrm = Reduce(vvnrm);// sum across simd
       right._grid->GlobalSum(nrm);
@@ -79,26 +116,26 @@ PARALLEL_FOR_LOOP
     }
 
     template<class Op,class T1>
-      inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
-      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
+    inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
     {
       return sum(closure(expr));
     }
 
     template<class Op,class T1,class T2>
-      inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
-      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
+    inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
     {
       return sum(closure(expr));
     }
 
 
     template<class Op,class T1,class T2,class T3>
-      inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				 eval(0,std::get<1>(expr.second)),
-				 eval(0,std::get<2>(expr.second))
-				 ))::scalar_object
+    inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+     eval(0,std::get<1>(expr.second)),
+     eval(0,std::get<2>(expr.second))
+     ))::scalar_object
     {
       return sum(closure(expr));
     }
@@ -111,24 +148,24 @@ PARALLEL_FOR_LOOP
 
       std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
       for(int i=0;i<grid->SumArraySize();i++){
-	sumarray[i]=zero;
+        sumarray[i]=zero;
       }
 
-PARALLEL_FOR_LOOP
+      PARALLEL_FOR_LOOP
       for(int thr=0;thr<grid->SumArraySize();thr++){
-	int nwork, mywork, myoff;
+        int nwork, mywork, myoff;
 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
 
 	vobj vvsum=zero;
         for(int ss=myoff;ss<mywork+myoff; ss++){
-	  vvsum = vvsum + arg._odata[ss];
+          vvsum = vvsum + arg._odata[ss];
 	}
 	sumarray[thr]=vvsum;
       }
 
       vobj vsum=zero;  // sum across threads
       for(int i=0;i<grid->SumArraySize();i++){
-	vsum = vsum+sumarray[i];
+        vsum = vsum+sumarray[i];
       } 
 
       typedef typename vobj::scalar_object sobj;
@@ -138,7 +175,7 @@ PARALLEL_FOR_LOOP
       extract(vsum,buf);
 
       for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
-      arg._grid->GlobalSum(ssum);
+        arg._grid->GlobalSum(ssum);
 
       return ssum;
     }
@@ -146,23 +183,23 @@ PARALLEL_FOR_LOOP
 
 
 template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
-{
-  typedef typename vobj::scalar_object sobj;
-  GridBase  *grid = Data._grid;
-  assert(grid!=NULL);
+    {
+      typedef typename vobj::scalar_object sobj;
+      GridBase  *grid = Data._grid;
+      assert(grid!=NULL);
 
   // FIXME
   // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
+      const int    Nd = grid->_ndimension;
+      const int Nsimd = grid->Nsimd();
 
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
+      assert(orthogdim >= 0);
+      assert(orthogdim < Nd);
 
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
+      int fd=grid->_fdimensions[orthogdim];
+      int ld=grid->_ldimensions[orthogdim];
+      int rd=grid->_rdimensions[orthogdim];
 
   std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
   std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars