Implemented mixed precision CG. Fixed filelist to exclude lib/Old directory and include Config.h.

2025-11-22 15:39:32 +00:00 · 2016-07-06 15:57:04 -04:00
parent df5c788ef2
commit 85ed8175cb
9 changed files with 311 additions and 14 deletions
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -44,6 +44,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <algorithms/iterative/SchurRedBlack.h>

 #include <algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <algorithms/iterative/ConjugateGradientMixedPrec.h>

 // Lanczos support
 #include <algorithms/iterative/MatrixUtils.h>
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -0,0 +1,141 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+
+namespace Grid {
+
+  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
+  public:                                                
+    RealD   Tolerance;
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual drops below OuterLoopNormMult * stop, with stop = norm2(src) * Tolerance^2
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+    LinearFunction<FieldF> *guesser;
+    
+    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      OuterLoopNormMult(100.), Linop_f(_Linop_f), Linop_d(_Linop_d), //listed in member declaration order, which is the order they are initialised in
+      guesser(nullptr){ }
+
+    //Install a guess generator for the inner solves. Only the address of g is stored, so g must stay alive while this solver is used.
+    void useGuesser(LinearFunction<FieldF> &g){
+      guesser = &g;
+    }
+  
+    //Solve Linop_d.HermOp(sol_d) = src_d_in: repeated single-precision CG solves on the double-precision residual, then a final double-precision CG.
+    //sol_d is read as the starting guess before it is written, so the caller must initialise it.
+    void operator() (const FieldD &src_d_in, FieldD &sol_d){
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+    
+      int cb = src_d_in.checkerboard;
+      sol_d.checkerboard = cb;
+    
+      RealD src_norm = norm2(src_d_in);
+      RealD stop = src_norm * Tolerance*Tolerance; //target that the residual reported by axpy_norm is compared against
+
+      GridBase* DoublePrecGrid = src_d_in._grid;
+      FieldD tmp_d(DoublePrecGrid);
+      tmp_d.checkerboard = cb;
+    
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+    
+      RealD inner_tol = Tolerance;
+    
+      FieldF src_f(SinglePrecGrid);
+      src_f.checkerboard = cb;
+    
+      FieldF sol_f(SinglePrecGrid);
+      sol_f.checkerboard = cb;
+    
+      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      CG_f.ErrorOnNoConverge = false; //presumably lets an unconverged inner solve return instead of aborting; the outer loop re-checks the residual
+
+      GridStopWatch InnerCGtimer;
+
+      GridStopWatch PrecChangeTimer;
+    
+      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+        //Compute double precision rsd and also new RHS vector.
+        Linop_d.HermOp(sol_d, tmp_d);
+        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+      
+        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
+
+        if(norm < OuterLoopNormMult * stop){
+          std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
+          break;
+        }
+        while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  //loosen the inner tolerance until norm * inner_tol^2 is no smaller than the overall target
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f, src_d);
+        PrecChangeTimer.Stop();
+      
+        zeroit(sol_f);
+
+        //Optionally improve inner solver guess (eg using known eigenvectors)
+        if(guesser != nullptr)
+          (*guesser)(src_f, sol_f);
+
+        //Inner CG
+        CG_f.Tolerance = inner_tol;
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f, sol_f);
+        InnerCGtimer.Stop();
+      
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f);
+        PrecChangeTimer.Stop();
+      
+        axpy(sol_d, 1.0, tmp_d, sol_d);
+      }
+    
+      //Final trial CG
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
+    
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d);
+
+      TotalTimer.Stop();
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+    }
+  };
+
+}
+
+#endif
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -82,6 +82,12 @@ template <typename T> using is_lattice      = std::is_base_of<LatticeBase,T >;

 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;

+//getVectorType for lattices: the result is the Lattice's own vector_object type
+template<typename T>
+struct getVectorType<Lattice<T> >{
+  using type = typename Lattice<T>::vector_object;
+};
+ 
 template<class sobj>
 inline sobj eval(const unsigned int ss, const sobj &arg)
 {
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -482,6 +482,96 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)

 }

+//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order. out is resized to the number of local sites of in's grid.
+template<typename vobj, typename sobj>
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
+  typedef typename vobj::vector_type vtype;
+  
+  GridBase* in_grid = in._grid;
+  out.resize(in_grid->lSites());
+  
+  int ndim = in_grid->Nd();
+  int in_nsimd = vtype::Nsimd();

+  std::vector<std::vector<int> > in_icoor(in_nsimd);
+  //Inner (SIMD lane) coordinate of every lane; held in a std::vector because an array with a runtime bound is not standard C++
+  for(int lane=0; lane < in_nsimd; lane++){
+    in_icoor[lane].resize(ndim);
+    in_grid->iCoorFromIindex(in_icoor[lane], lane);
+  }
+  
+PARALLEL_FOR_LOOP
+  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+    //Assemble vector of pointers to output elements
+    std::vector<sobj*> out_ptrs(in_nsimd);
+
+    std::vector<int> in_ocoor(ndim);
+    in_grid->oCoorFromOindex(in_ocoor, in_oidx);
+
+    std::vector<int> lcoor(ndim);
+    //Local coordinate of each lane = outer coordinate + reduced dimension * inner coordinate
+    for(int lane=0; lane < in_nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+        lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
+
+      int lex;
+      Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
+      out_ptrs[lane] = &out[lex];
+    }
+    
+    //Unpack into those ptrs
+    const vobj & in_vobj = in._odata[in_oidx];
+    extract1(in_vobj, out_ptrs, 0);
+  }
+}
+
+//Convert a Lattice from one precision to another. Both grids must cover the same local sites; the SIMD layout may differ.
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  GridBase *in_grid = in._grid;
+  GridBase *out_grid = out._grid;
+  assert(out_grid->Nd() == in_grid->Nd());
+  assert(out_grid->lSites() == in_grid->lSites()); //indices computed on out_grid below are used to read the array unpacked from in_grid
+  out.checkerboard = in.checkerboard;
+
+  typedef typename VobjOut::scalar_object SobjOut;
+
+  int ndim = out_grid->Nd();
+  int out_nsimd = out_grid->Nsimd();
+  //Inner (SIMD lane) coordinate of every output lane; a std::vector because an array with a runtime bound is not standard C++
+  std::vector<std::vector<int> > out_icoor(out_nsimd);
+      
+  for(int lane=0; lane < out_nsimd; lane++){
+    out_icoor[lane].resize(ndim);
+    out_grid->iCoorFromIindex(out_icoor[lane], lane);
+  }
+  //Unpack the input into a lexicographically ordered array of scalar objects of the output type
+  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
+  unvectorizeToLexOrdArray(in_slex_conv, in);
+    
+  PARALLEL_FOR_LOOP
+  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+    std::vector<int> out_ocoor(ndim);
+    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
+
+    std::vector<SobjOut*> ptrs(out_nsimd);      
+
+    std::vector<int> lcoor(ndim);
+    //For each output lane, locate the scalar object that belongs in it
+    for(int lane=0; lane < out_nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+        lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
+
+      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
+      ptrs[lane] = &in_slex_conv[llex];
+    }
+    merge(out._odata[out_oidx], ptrs, 0);
+  }
+}
+
+
+  
+
+ 
 }
 #endif
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -10,6 +10,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@phys.columbia.edu>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -167,6 +168,33 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
  }
 }

+////////////////////////////////////////////////////////////////////////
+// Extract SIMD lanes of vec into scalar objects of a possibly different scalar type: extracted[i][offset] receives lane i*s. Useful for precision change
+////////////////////////////////////////////////////////////////////////
+template<class vobj, class sobj> inline 
+void extract1(const vobj &vec,std::vector<sobj*> &extracted, int offset)
+{
+  typedef typename vobj::scalar_type vobj_scalar_type ;
+  typedef typename vobj::vector_type vobj_vector_type ;
+  typedef typename sobj::scalar_type sobj_scalar_type ;
+  
+  static const int words=sizeof(vobj)/sizeof(vobj_vector_type);
+  static const int Nsimd=vobj_vector_type::Nsimd();
+
+  int Nextr=extracted.size();
+  if(Nextr==0) return; // nothing to fill, and the stride below would otherwise divide by zero
+  int s = Nsimd/Nextr; // stride between the lanes that are extracted
+  const vobj_scalar_type * vp = (const vobj_scalar_type *)&vec; // vec viewed as 'words' runs of Nsimd scalars; read only
+
+  for(int w=0;w<words;w++){
+    for(int i=0;i<Nextr;i++){
+      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
+      pointer[w] = vp[i*s+w*Nsimd]; // the assignment converts between the two scalar types
+    }
+  }
+}
+
+  
 ////////////////////////////////////////////////////////////////////////
 // Merge a contiguous array of scalar objects
 ////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -8,6 +8,7 @@

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@phys.columbia.edu>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -230,6 +231,35 @@ namespace Grid {
    static const bool value = true;
  };

+  //Get the SIMD vector type from a Grid tensor or Lattice<Tensor>; this primary template yields T unchanged
+  template<typename T>
+  struct getVectorType{
+    using type = T;
+  };
+  
+  //Query if a tensor or Lattice<Tensor> is SIMD vector or scalar: value is 1 when GridTypeMapper's scalar_type and vector_type differ for it, 0 otherwise
+  template<typename T>
+  class isSIMDvectorized{
+    template<typename U> //overload that only exists when scalar_type and vector_type differ (enable_if); returns char
+    static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
+
+    template<typename U> //fallback overload; returns double
+    static double test(...);
+  
+  public:
+    enum {value = sizeof(test<T>(0)) == sizeof(char) }; //the literal 0 selects the void* overload whenever it exists, otherwise the ellipsis one
+  };
+  
+  //Precision of a Lattice, tensor or scalar type, expressed as a multiple of sizeof(float)
+  template<typename T>
+  class getPrecision{
+    using vector_obj = typename getVectorType<T>::type; //the vector_obj (i.e. a grid Tensor) if it's a Lattice<vobj>; T itself otherwise (i.e. if fundamental or grid Tensor)
+  
+    using scalar_type = typename GridTypeMapper<vector_obj>::scalar_type; //the associated scalar type. Works on fundamental and tensor types
+    using real_scalar_type = typename GridTypeMapper<scalar_type>::Realified; //strips any std::complex wrapper, which should leave the fundamental type
+  public:
+    enum { value = sizeof(real_scalar_type)/sizeof(float) };
+  };
 }

 #endif
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -2,7 +2,8 @@
 
 cd lib

-HFILES=`find . -type f -name '*.h'`
+HFILES=`find . -type f -name '*.h' -not -path '*/Old/*'`
+HFILES="$HFILES Config.h"
 CCFILES=`find . -type f -name '*.cc' -not  -name '*ommunicator*.cc'`
 echo> Make.inc
 echo HFILES=$HFILES >> Make.inc
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,13 +1,5 @@

-bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
-
-
-Test_GaugeAction_SOURCES=Test_GaugeAction.cc
-Test_GaugeAction_LDADD=-lGrid
-
-
-Test_RectPlaq_SOURCES=Test_RectPlaq.cc
-Test_RectPlaq_LDADD=-lGrid
+bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 


 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -114,8 +106,8 @@ Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid


-Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
-Test_gp_rect_force_LDADD=-lGrid
+Test_GaugeAction_SOURCES=Test_GaugeAction.cc
+Test_GaugeAction_LDADD=-lGrid


 Test_gparity_SOURCES=Test_gparity.cc
@@ -126,6 +118,10 @@ Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
 Test_gpdwf_force_LDADD=-lGrid


+Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
+Test_gp_rect_force_LDADD=-lGrid
+
+
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid

@@ -202,6 +198,10 @@ Test_rect_force_SOURCES=Test_rect_force.cc
 Test_rect_force_LDADD=-lGrid


+Test_RectPlaq_SOURCES=Test_RectPlaq.cc
+Test_RectPlaq_LDADD=-lGrid
+
+
 Test_remez_SOURCES=Test_remez.cc
 Test_remez_LDADD=-lGrid