Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet

2026-05-09 19:54:31 +01:00 · 2023-03-24 15:40:57 -04:00
parent d8a9a745d8 dcf172da3b
commit 5c85774ee3
34 changed files with 4353 additions and 123 deletions
@@ -108,7 +108,10 @@ NAMESPACE_BEGIN(Grid);
    GridStopWatch PrecChangeTimer;
    
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
-      
+
+    precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
+    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
+    
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
@@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
      PrecChangeTimer.Stop();
      
      sol_f = Zero();
@@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
      
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
      PrecChangeTimer.Stop();
      
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -0,0 +1,373 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
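+//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
+//The residual is stored in single precision, but the search directions and solution are stored in double precision. 
+//Every update_freq iterations the residual is corrected in double precision. 
+//For safety, a final regular CG is applied to clean up if necessary.
+
+//PB: Pure single, then double fixup. This "Cleanup" variant keeps the per-shift search directions and
+//solutions in single precision during the multi-shift iteration, then checks every shift in double
+//precision and fixes it up with a mixed-precision CG where the target residual was not reached.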
+
+// Multi-shift conjugate gradient for the family (MdagM + shifts.poles[s]) psi[s] = src.
+//
+// The matrix multiply inside the iteration uses the single-precision operator Linop_f, and the
+// per-shift search directions and solutions are held in single-precision fields. When the iteration
+// finishes (all shifts converged, or the iteration cap is reached) every solution is promoted to
+// double precision, its true residual is measured with the double-precision operator, and any shift
+// that misses its target is polished with a MixedPrecisionConjugateGradient on the shifted operator.
+//
+// FieldD must be a double-precision field type and FieldF a single-precision one; this is enforced
+// by the enable_if guards on getPrecision.
+// NOTE(review): the conversions use precisionChangeFast, which is documented elsewhere as requiring
+// both fields to live on the same Grid (e.g. a "double2" double-precision type) - confirm at call sites.
+template<class FieldD, class FieldF,
+         typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+         typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
+                                             public OperatorFunction<FieldD>
+{
+public:                                                
+
+  using OperatorFunction<FieldD>::operator();
+
+  RealD   Tolerance;            // Not read by this class; per-shift targets come from shifts.tolerances
+  Integer MaxIterationsMshift;  // Cap on the number of multi-shift iterations
+  Integer MaxIterations;        // Iteration limit handed to the cleanup solver
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
+  int verbose;
+  MultiShiftFunction shifts;
+  std::vector<RealD> TrueResidualShift; // Per-shift true residual measured (or returned by the cleanup) after the solve
+
+  int ReliableUpdateFreq; //number of iterations between reliable updates (stored only; the solve below does not read it)
+
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  LinearOperatorBase<FieldF> &Linop_f; //single precision
+
+  // maxit               : cap on multi-shift iterations (stored in MaxIterationsMshift)
+  // _shifts             : poles, residues, norm and per-shift tolerances (copied)
+  // _SinglePrecGrid     : Grid on which the single-precision work fields are allocated
+  // _Linop_f            : single-precision unshifted operator; held by reference, so it must outlive this object
+  // _ReliableUpdateFreq : stored in ReliableUpdateFreq
+  // MaxIterations is fixed at 20000 and is passed twice to the cleanup solver's constructor.
+  // Initializers are listed in member declaration order, which is the order they actually run in.
+  ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
+                                       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
+                                       int _ReliableUpdateFreq) : 
+    Tolerance(0.0),
+    MaxIterationsMshift(maxit),
+    MaxIterations(20000),
+    IterationsToComplete(0),
+    verbose(1),
+    shifts(_shifts),
+    ReliableUpdateFreq(_ReliableUpdateFreq),
+    SinglePrecGrid(_SinglePrecGrid),
+    Linop_f(_Linop_f)
+  { 
+    IterationsToCompleteShift.resize(_shifts.order);
+    TrueResidualShift.resize(_shifts.order);
+  }
+
+  // Computes only the combined result psi; the individual shift solutions live in a temporary
+  // vector and are discarded on return.
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
+  {
+    GridBase *grid = src.Grid();
+    int nshift = shifts.order;
+    std::vector<FieldD> results(nshift,grid);
+    (*this)(Linop,src,results,psi);
+  }
+
+  // Solves every shift into results[] and then forms
+  //   psi = shifts.norm*src + sum_i shifts.residues[i]*results[i]
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
+  {
+    int nshift = shifts.order;
+
+    (*this)(Linop,src,results);
+  
+    psi = shifts.norm*src;
+    for(int i=0;i<nshift;i++){
+      psi = psi + shifts.residues[i]*results[i];
+    }
+
+    return;
+  }
+
+  // The solver proper.
+  //   Linop_d : double-precision unshifted operator (used for the first application, the final
+  //             residual check and the cleanup)
+  //   src_d   : right-hand side
+  //   psi_d   : output, one solution per shift; must already have size shifts.order
+  // On return IterationsToComplete, IterationsToCompleteShift and TrueResidualShift are filled in.
+  // The initial guess is zero for every shift. shifts.poles[0] must be the smallest pole.
+  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
+  { 
+    GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
+    GridBase *DoublePrecGrid = src_d.Grid();
+
+    ////////////////////////////////////////////////////////////////////////
+    // Convenience references to the info stored in "MultiShiftFunction"
+    ////////////////////////////////////////////////////////////////////////
+    int nshift = shifts.order;
+
+    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+    std::vector<RealD> &mresidual(shifts.tolerances);
+    std::vector<RealD> alpha(nshift,1.0);
+
+    //Double precision search directions
+    FieldD p_d(DoublePrecGrid);
+    std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
+    std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
+
+    FieldD tmp_d(DoublePrecGrid);
+    FieldD r_d(DoublePrecGrid);
+    FieldF r_f(SinglePrecGrid);
+    FieldD mmp_d(DoublePrecGrid);
+
+    assert(psi_d.size()==nshift);
+    assert(mass.size()==nshift);
+    assert(mresidual.size()==nshift);
+  
+    // dynamic sized arrays on stack; 2d is a pain with vector
+    RealD  bs[nshift];
+    RealD  rsq[nshift];
+    RealD  rsqf[nshift];
+    RealD  z[nshift][2];   // two-deep history of the per-shift z factor; 'iz' selects the current slot
+    int     converged[nshift];
+  
+    const int       primary =0;
+  
+    //Primary shift fields CG iteration
+    RealD a,b,c,d;
+    RealD cp,bp,qq; //prev
+  
+    // Matrix mult fields
+    FieldF p_f(SinglePrecGrid);
+    FieldF mmp_f(SinglePrecGrid);
+
+    // Check lightest mass
+    for(int s=0;s<nshift;s++){
+      assert( mass[s]>= mass[primary] );
+      converged[s]=0;
+    }
+  
+    // Wire guess to zero
+    // Residuals "r" are src
+    // First search direction "p" is also src
+    cp = norm2(src_d);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+        psi_d[s] = Zero();
+        psi_f[s] = Zero();
+        IterationsToCompleteShift[s] = 1;
+        TrueResidualShift[s] = 0.;
+      }
+      // Keep the overall counter consistent with the per-shift counters instead of leaving a stale value
+      IterationsToComplete = 1;
+      return;
+    }
+
+    for(int s=0;s<nshift;s++){
+      rsq[s] = cp * mresidual[s] * mresidual[s];
+      rsqf[s] =rsq[s];
+      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
+      //      ps_d[s] = src_d;
+      precisionChangeFast(ps_f[s],src_d);
+    }
+    // r and p for primary
+    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
+    r_d = p_d;
+    
+    //MdagM+m[0]
+    // The first operator application is done in both precisions: the single-precision result is only
+    // compared against the double-precision one (diagnostic print below); d, qq and mmp_d used from
+    // here on come from the double-precision call.
+    precisionChangeFast(p_f,p_d);
+    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
+    precisionChangeFast(tmp_d,mmp_f);
+    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
+    tmp_d = tmp_d - mmp_d;
+    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
+    //    assert(norm2(tmp_d)< 1.0e-4);
+
+    axpy(mmp_d,mass[0],p_d,mmp_d);
+    RealD rn = norm2(p_d);
+    d += rn*mass[0];
+
+    b = -cp /d;
+  
+    // Set up the various shift variables
+    int       iz=0;
+    z[0][1-iz] = 1.0;
+    z[0][iz]   = 1.0;
+    bs[0]      = b;
+    for(int s=1;s<nshift;s++){
+      z[s][1-iz] = 1.0;
+      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+      bs[s]      = b*z[s][iz]; 
+    }
+  
+    // r += b[0] A.p[0]
+    // c= norm(r)
+    c=axpy_norm(r_d,b,mmp_d,r_d);
+  
+    // First solution update from the zero guess: psi[s] = -bs[s]*alpha[s]*src
+    for(int s=0;s<nshift;s++) {
+      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
+      precisionChangeFast(psi_f[s],psi_d[s]);
+    }
+  
+    ///////////////////////////////////////
+    // Timers
+    ///////////////////////////////////////
+    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
+
+    SolverTimer.Start();
+  
+    // Iteration loop
+    int k;
+  
+    for (k=1;k<=MaxIterationsMshift;k++){    
+
+      a = c /cp;
+      AXPYTimer.Start();
+      axpy(p_d,a,p_d,r_d); 
+      AXPYTimer.Stop();
+
+      PrecChangeTimer.Start();
+      precisionChangeFast(r_f, r_d);
+      PrecChangeTimer.Stop();
+
+      // Update the single-precision search direction of every shift that is still running
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+        if ( ! converged[s] ) { 
+          if (s==0){
+            axpy(ps_f[s],a,ps_f[s],r_f);
+          } else{
+            RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+            axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
+          }
+        }
+      }
+      AXPYTimer.Stop();
+
+      cp=c;
+      PrecChangeTimer.Start();
+      precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
+      PrecChangeTimer.Stop();
+      MatrixTimer.Start();  
+      Linop_f.HermOp(p_f,mmp_f);
+      MatrixTimer.Stop();  
+      PrecChangeTimer.Start();
+      precisionChangeFast(mmp_d, mmp_f); // From Float to Double
+      PrecChangeTimer.Stop();
+
+      d=real(innerProduct(p_d,mmp_d));    
+      axpy(mmp_d,mass[0],p_d,mmp_d);
+      RealD rn = norm2(p_d);
+      d += rn*mass[0];
+    
+      bp=b;
+      b=-cp/d;
+
+      // Toggle the recurrence history
+      bs[0] = b;
+      iz = 1-iz;
+      ShiftTimer.Start();
+      for(int s=1;s<nshift;s++){
+        if((!converged[s])){
+          RealD z0 = z[s][1-iz];
+          RealD z1 = z[s][iz];
+          z[s][iz] = z0*z1*bp
+            / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+          bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+        }
+      }
+      ShiftTimer.Stop();
+
+      //Update single precision solutions
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+        int ss = s;
+        if( (!converged[s]) ) { 
+          axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
+        }
+      }
+      c = axpy_norm(r_d,b,mmp_d,r_d);
+      AXPYTimer.Stop();
+    
+      // Convergence checks
+      int all_converged = 1;
+      for(int s=0;s<nshift;s++){
+      
+        if ( (!converged[s]) ){
+          IterationsToCompleteShift[s] = k;
+        
+          // Residual estimate for shift s: primary residual norm scaled by z[s]^2
+          RealD css  = c * z[s][iz]* z[s][iz];
+        
+          if(css<rsqf[s]){
+            std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+            converged[s]=1;
+          } else {
+            all_converged=0;
+          }
+
+        }
+      }
+
+      // Hand over to the double-precision check/cleanup once everything has converged or the
+      // iteration budget is exhausted. The comparison is against MaxIterationsMshift itself so that
+      // the final permitted iteration is actually used and a budget of one iteration still reaches
+      // the cleanup instead of falling out of the loop.
+      if ( all_converged || k == MaxIterationsMshift){
+
+        SolverTimer.Stop();
+
+        for(int s=0;s<nshift;s++){
+          precisionChangeFast(psi_d[s],psi_f[s]);
+        }
+
+        
+        if ( all_converged ){
+          std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
+          std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
+        } else {
+          std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
+        }
+        
+        // Check answers 
+        for(int s=0; s < nshift; s++) { 
+          // r = (MdagM + mass[s]) psi[s] - alpha[s]*src, evaluated in double precision
+          Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
+          axpy(tmp_d,mass[s],psi_d[s],mmp_d);
+          axpy(r_d,-alpha[s],src_d,tmp_d);
+          RealD rn = norm2(r_d);
+          RealD cn = norm2(src_d);
+          TrueResidualShift[s] = std::sqrt(rn/cn);
+          std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
+
+          //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
+          if(rn >= rsq[s]){
+            CleanupTimer.Start();
+            std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
+
+            //Setup linear operators for final cleanup
+            ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
+            ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
+                                               
+            MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
+            cg(src_d, psi_d[s]);
+            
+            TrueResidualShift[s] = cg.TrueResidual;
+            CleanupTimer.Stop();
+          }
+        }
+
+        std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
+        std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
+
+        IterationsToComplete = k;       
+
+        return;
+      }
+   
+    }
+    // Only reachable when MaxIterationsMshift < 1, i.e. the loop body never ran
+    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+    assert(0);
+  }
+
+};
+NAMESPACE_END(Grid);
+
@@ -81,6 +81,7 @@ public:
  using OperatorFunction<FieldD>::operator();

  RealD   Tolerance;
+  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -95,9 +96,9 @@ public:

  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
-				       int _ReliableUpdateFreq
-				       ) : 
-    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
+				       int _ReliableUpdateFreq) : 
+    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
+    MaxIterations(20000)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
@@ -130,6 +131,9 @@ public:
    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
    GridBase *DoublePrecGrid = src_d.Grid();

+    precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
+    precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
+    
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
@@ -200,10 +204,10 @@ public:
    r_d = p_d;
    
    //MdagM+m[0]
-    precisionChangeFast(p_f,p_d);
+    precisionChange(p_f, p_d, pc_wk_d_to_s);

    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    precisionChangeFast(tmp_d,mmp_f);
+    precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
@@ -244,7 +248,7 @@ public:
    // Iteration loop
    int k;
  
-    for (k=1;k<=MaxIterations;k++){    
+    for (k=1;k<=MaxIterationsMshift;k++){    

      a = c /cp;
      AXPYTimer.Start();
@@ -263,7 +267,7 @@ public:
      AXPYTimer.Stop();

      PrecChangeTimer.Start();
-      precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
+      precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
      PrecChangeTimer.Stop();

      cp=c;
@@ -272,7 +276,7 @@ public:
      MatrixTimer.Stop();  

      PrecChangeTimer.Start();
-      precisionChangeFast(mmp_d, mmp_f); // From Float to Double
+      precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
      PrecChangeTimer.Stop();

      AXPYTimer.Start();
@@ -350,12 +354,17 @@ public:
 	}
      }

-      if ( all_converged ){
+      if ( all_converged || k == MaxIterationsMshift-1){

 	SolverTimer.Stop();
-	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
-	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
-      
+
+	if ( all_converged ){
+	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+	} else {
+	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
+	}
+	
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@@ -396,12 +405,10 @@ public:

 	return;
      }
-
   
    }
-    // ugly hack
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    //  assert(0);
+    assert(0);
  }

 };
@@ -48,7 +48,7 @@ public:
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;
  GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter
+  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update

  //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
  LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,7 +65,9 @@ public:
      ErrorOnNoConverge(err_on_no_conv),
      DoFinalCleanup(true),
      Linop_fallback(NULL)
-  {};
+  {
+    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
+  };

  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
    Linop_fallback = &_Linop_fallback;
@@ -116,9 +118,12 @@ public:
    }

    //Single prec initialization
+    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
+    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
+    
    FieldF r_f(SinglePrecGrid);
    r_f.Checkerboard() = r.Checkerboard();
-    precisionChange(r_f, r);
+    precisionChange(r_f, r, pc_wk_dp_to_sp);

    FieldF psi_f(r_f);
    psi_f = Zero();
@@ -134,7 +139,8 @@ public:
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
-
+    GridStopWatch PrecChangeTimer;
+    
    SolverTimer.Start();
    int k = 0;
    int l = 0;
@@ -173,7 +179,9 @@ public:
      // Stopping condition
      if (cp <= rsq) {
 	//Although not written in the paper, I assume that I have to add on the final solution
-	precisionChange(mmp, psi_f);
+	PrecChangeTimer.Start();
+	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
+	PrecChangeTimer.Stop();
 	psi = psi + mmp;
 	
 	
@@ -194,7 +202,10 @@ public:
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;

+	
 	IterationsToComplete = k;	
 	ReliableUpdatesPerformed = l;
 	  
@@ -214,14 +225,21 @@ public:
      else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-	precisionChange(mmp, psi_f);
+	PrecChangeTimer.Start();
+	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
+	PrecChangeTimer.Stop();
 	psi = psi + mmp;

+	MatrixTimer.Start();
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+	MatrixTimer.Stop();
+	
 	r = src - mmp;

 	psi_f = Zero();
-	precisionChange(r_f, r);
+	PrecChangeTimer.Start();
+	precisionChange(r_f, r, pc_wk_dp_to_sp);
+	PrecChangeTimer.Stop();
 	cp = norm2(r);
 	MaxResidSinceLastRelUp = cp;

@@ -248,7 +248,7 @@ public:
  ///////////////////////////////////////////
  // user defined constructor
  ///////////////////////////////////////////
-  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { 
+  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
    this->_grid = grid;
    resize(this->_grid->oSites());
    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
@@ -440,7 +440,17 @@ public:
 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);

 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+#if 1
 	assert(rank == _grid->ThisRank() );
+#else
+// 
+	if (rank != _grid->ThisRank() ){
+	std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl;
+//	exit(-42);
+//	assert(0);
+	}
+#endif
+
 	
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
@@ -1080,6 +1080,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }

+//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
 template<class VobjOut, class VobjIn>
 void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
@@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
      precisionChange(vout,vin,N);
  });
 }
-//Convert a Lattice from one precision to another
+//Convert a Lattice from one precision to another (original, slow implementation)
 template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
  assert(out.Grid()->Nd() == in.Grid()->Nd());
  for(int d=0;d<out.Grid()->Nd();d++){
@@ -1145,6 +1146,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  });
 }

+//Reusable workspace for precisionChange. It tabulates, once, which (outer site, SIMD lane) of a
+//field on 'in_grid' feeds each (outer site, SIMD lane) of a field on 'out_grid', and keeps that
+//table in device memory so that repeated conversions between the same two Grids skip the setup cost.
+class precisionChangeWorkspace{
+  std::pair<Integer,Integer>* fmap_device; //device pointer; entry [out_lane + Nsimd_out*out_osite] = (in_osite, in_lane)
+  //the Grids the table was built for, kept so that later calls can be validated
+  GridBase* _out_grid;
+  GridBase* _in_grid;
+public:
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
+    //Grids cannot be queried on the device, hence the table is generated on the host
+    const int ndim = out_grid->Nd();
+    assert(ndim == in_grid->Nd());
+    for(int mu=0; mu<ndim; mu++){
+      assert(out_grid->FullDimensions()[mu] == in_grid->FullDimensions()[mu]);
+    }
+    int Nsimd_out = out_grid->Nsimd();
+
+    //Inner (SIMD) coordinate of every output lane; identical for all outer sites so computed once
+    std::vector<Coordinate> lane_coors(Nsimd_out);
+    for(int l=0; l < Nsimd_out; l++){
+      out_grid->iCoorFromIindex(lane_coors[l], l);
+    }
+  
+    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
+    thread_for(osite,out_grid->oSites(),{
+        Coordinate outer_coor; 
+        out_grid->oCoorFromOindex(outer_coor, osite);
+      
+        Coordinate local_coor; //local coordinate, shared by the input and output fields
+        for(int l=0; l < Nsimd_out; l++){
+          out_grid->InOutCoorToLocalCoor(outer_coor, lane_coors[l], local_coor);
+        
+          //in_grid->oIndex(local_coor) / in_grid->iIndex(local_coor) are deliberately not used here:
+          //oIndex and oCoorFromOindex (likewise iIndex) are not inverses on a checkerboarded lattice, the
+          //former being defined on the full lattice and the latter on the reduced one. Until that is fixed
+          //the indices are assembled by hand from the reduced-lattice strides and dimensions.
+          int in_osite = 0;
+          int in_lane  = 0;
+          for(int mu=0; mu<in_grid->_ndimension; mu++){
+            const auto rdim = in_grid->_rdimensions[mu];
+            in_osite += in_grid->_ostride[mu] * ( local_coor[mu] % rdim );
+            in_lane  += in_grid->_istride[mu] * ( local_coor[mu] / rdim );
+          }
+          fmap_host[l + Nsimd_out*osite] = std::pair<Integer,Integer>( in_osite, in_lane );
+        }
+      });
+
+    //Upload the table (with a way to detect a CPU-only build this copy could be skipped)
+    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
+    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
+    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
+  }
+
+  //The object owns the device allocation: neither copyable nor movable
+  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
+  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
+  
+  //Device pointer to the table
+  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
+
+  //Verify that the supplied Grids are conformable with those the table was generated for
+  void checkGrids(GridBase* out, GridBase* in) const{
+    conformable(out, _out_grid);
+    conformable(in, _in_grid);
+  }
+  
+  ~precisionChangeWorkspace(){
+    acceleratorFreeDevice(fmap_device);
+  }
+};
+
+
+//Dispatch helper: use precisionChangeFast whenever it is applicable and report whether it was used.
+//Two conditions must hold: the Grids must be the same object (checked at run time), and
+//precisionChange(VobjOut::vector_type*, VobjIn::vector_type*, int) must exist for the types involved.
+//The latter is a compile-time property, tested through SFINAE on the trailing return type: if the
+//expression is ill-formed this overload drops out and the fallback below is selected.
+//Returns 1 if the conversion was performed, 0 if the caller still has to do it.
+template<class VobjOut, class VobjIn>
+auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
+  if(out.Grid() != in.Grid()){
+    return 0;
+  }
+  precisionChangeFast(out,in);
+  return 1;
+}
+//Fallback taken when the fast path does not compile for these types; never converts anything.
+//The third parameter is 'long' on purpose: callers pass an int literal, so the overload above is the
+//better match whenever it is viable.
+template<class VobjOut, class VobjIn>
+int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){
+  return 0;
+}
+
+
+//Precision-converting copy of 'in' into 'out' driven by a pregenerated workspace holding the
+//output->input site/lane table. Much faster than the original implementation because the table is
+//not rebuilt on every call.
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
+  //Same-Grid conversions go through precisionChangeFast when the types allow it
+  if(_precisionChangeFastWrap(out,in,0) != 0) return;
+  
+  static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  out.Checkerboard() = in.Checkerboard();
+  constexpr int lanes_out = VobjOut::Nsimd();
+
+  //The workspace must have been generated for this pair of Grids
+  workspace.checkGrids(out.Grid(),in.Grid());
+  std::pair<Integer,Integer> const* site_map = workspace.getMap();
+
+  //Lane-by-lane copy with conversion
+  autoView( out_v , out, AcceleratorWrite);
+  autoView( in_v , in, AcceleratorRead);
+
+  accelerator_for(osite, out.Grid()->oSites(), 1,{
+      for(int lane=0; lane < lanes_out; lane++){
+        //table entry for this output (site, lane): first = input outer site, second = input lane
+        std::pair<Integer,Integer> const* entry = site_map + osite*lanes_out + lane;
+        int src_osite = entry->first;
+        int src_lane  = entry->second;
+        copyLane(out_v[osite], lane, in_v[src_osite], src_lane);
+      }
+    });
+}
+
+//Precision-converting copy of 'in' into 'out' without a caller-supplied workspace.
+//Faster than the original implementation, but slower than precisionChangeFast or the overload taking
+//a pregenerated workspace, because the site/lane table has to be built on the host and copied to
+//the device on every call.
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  //Try the fast path first so that no temporary workspace is built when it applies
+  if(!_precisionChangeFastWrap(out,in,0)){
+    precisionChangeWorkspace workspace(out.Grid(), in.Grid());
+    precisionChange(out, in, workspace);
+  }
+}
+
+
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
@@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
 // Wilson compressor will need FaceGather policies for:
 // Periodic, Dirichlet, and partial Dirichlet for DWF
 ///////////////////////////////////////////////////////////////
-const int dwf_compressor_depth=1;
+const int dwf_compressor_depth=2;
 #define DWF_COMPRESS
 class FaceGatherPartialDWF
 {
@@ -127,6 +127,8 @@ NAMESPACE_BEGIN(Grid);
 	  ApproxNegPowerAction.tolerances[i]    = action_tolerance[i];
 	  ApproxHalfPowerAction.tolerances[i]   = action_tolerance[i];
 	  ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
+	}
+	for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
 	  ApproxPowerMD.tolerances[i]       = md_tolerance[i];
 	  ApproxNegPowerMD.tolerances[i]    = md_tolerance[i];
 	  ApproxHalfPowerMD.tolerances[i]   = md_tolerance[i];
@@ -29,6 +29,8 @@
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H

+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
+
 NAMESPACE_BEGIN(Grid);

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -58,7 +60,7 @@ NAMESPACE_BEGIN(Grid);
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 #if 0
-	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOp : DenOp);
+	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
 #else
@@ -66,7 +68,8 @@ NAMESPACE_BEGIN(Grid);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
-	
+
+	// Action better with higher precision?
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	precisionChange(inD2,in);
 	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
@@ -76,12 +79,12 @@ NAMESPACE_BEGIN(Grid);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
-	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF);

 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
 	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
-	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	precisionChange(inD2,in);
 	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
 	msCG(schurOpD2, inD2, out_elemsD2, outD2);
@@ -284,6 +284,15 @@ public:
 		  << as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
      }
    }
+    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
+    std::cout << GridLogMessage << " Dslash counts "<<std::endl;
+    std::cout << GridLogMessage << "------------------------- "<<std::endl;
+    uint64_t full, partial, dirichlet;
+    DslashGetCounts(dirichlet,partial,full);
+    std::cout << GridLogMessage << " Full BCs               : "<<full<<std::endl;
+    std::cout << GridLogMessage << " Partial dirichlet BCs  : "<<partial<<std::endl;
+    std::cout << GridLogMessage << " Dirichlet BCs          : "<<dirichlet<<std::endl;
+
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force average size "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
@@ -29,6 +29,27 @@

 NAMESPACE_BEGIN(Grid);

+uint64_t DslashFullCount;      // tally bumped by DslashLogFull(); no initializer, static storage, so it starts at zero
+uint64_t DslashPartialCount;   // tally bumped by DslashLogPartial()
+uint64_t DslashDirichletCount; // tally bumped by DslashLogDirichlet()
+
+void DslashResetCounts(void) // Sets the full, partial and Dirichlet tallies back to zero.
+{
+  DslashFullCount=0;
+  DslashPartialCount=0;
+  DslashDirichletCount=0;
+}
+void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) // Copies the current tallies into the three out-params; mind the argument order: (dirichlet, partial, full).
+{
+  dirichlet = DslashDirichletCount;
+  partial   = DslashPartialCount;
+  full      = DslashFullCount;
+}
+void DslashLogFull(void)     { DslashFullCount++;}      // adds one to the full tally (plain ++ on a global, no atomics or locking here)
+void DslashLogPartial(void)  { DslashPartialCount++;}   // adds one to the partial tally
+void DslashLogDirichlet(void){ DslashDirichletCount++;} // adds one to the Dirichlet tally
+
+
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table)
 {
@@ -120,6 +120,12 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
 }
 */

+void DslashResetCounts(void); // zero the full / partial / Dirichlet tallies
+void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); // read the tallies; out-param order is (dirichlet, partial, full)
+void DslashLogFull(void);      // add one to the full tally
+void DslashLogPartial(void);   // add one to the partial tally
+void DslashLogDirichlet(void); // add one to the Dirichlet tally
+
 struct StencilEntry {
 #ifdef GRID_CUDA
  uint64_t _byte_offset;       // 8 bytes
@@ -312,6 +318,7 @@ public:

  int face_table_computed;
  int partialDirichlet;
+  int fullDirichlet;
  std::vector<commVector<std::pair<int,int> > > face_table ;
  Vector<int> surface_list;

@@ -442,6 +449,9 @@ public:
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    if   ( this->partialDirichlet ) DslashLogPartial(); // exactly one tally is bumped per call; partialDirichlet is tested first and wins if both flags are set
+    else if ( this->fullDirichlet ) DslashLogDirichlet(); // fullDirichlet set and partialDirichlet not set
+    else DslashLogFull(); // neither flag set
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@@ -770,6 +780,10 @@ public:
    if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
    partialDirichlet = p.partialDirichlet;
    DirichletBlock(p.dirichlet); // comms send/recv set up
+    fullDirichlet=0;
+    for(int d=0;d<p.dirichlet.size();d++){
+      if (p.dirichlet[d]) fullDirichlet=1;
+    }

    _unified_buffer_size=0;
    surface_list.resize(0);
@@ -226,7 +226,7 @@ template<class vobjOut, class vobjIn>
 accelerator_inline 
 void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 {
-  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+  static_assert( std::is_same<typename vobjOut::scalar_typeD, typename vobjIn::scalar_typeD>::value == 1, "copyLane: tensor types must be the same" ); //"same tensor type" is enforced by requiring both objects to have the same scalar_typeD

  typedef typename vobjOut::vector_type ovector_type;  
  typedef typename vobjIn::vector_type ivector_type;  
@@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
  ovector_type * __restrict__ op = (ovector_type *)&vecOut;
  ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
  for(int w=0;w<owords;w++){
-    itmp = ip[iNsimd*w].getlane(lane_in);
+    itmp = ip[w].getlane(lane_in);
    otmp = itmp; //potential precision change
-    op[oNsimd*w].putlane(otmp,lane_out);
+    op[w].putlane(otmp,lane_out);
  }
 }

@@ -167,14 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
  return;
 }

-void GridCmdOptionFloat(std::string &str,float & val)
+void GridCmdOptionFloat(std::string &str,double & val) // extracts one value from str into val with operator>> on a stringstream; val is a double despite the function name
 {
  std::stringstream ss(str);
  ss>>val;
  return;
 }

-
 void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
 		     Coordinate &mpi_c)
@@ -57,7 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
-void GridCmdOptionFloat(std::string &str,float & val);
+void GridCmdOptionFloat(std::string &str,double & val); // despite the name, the output parameter is a double


 void GridParseLayout(char **argv,int argc,