From 1db58a8acce1cddb20be116b5916e4b359ae259f Mon Sep 17 00:00:00 2001
From: Christopher Kelly
Date: Tue, 21 Feb 2023 10:52:42 -0500
Subject: [PATCH] Precision change improvements

Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying.

When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation.

In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes.

Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces

Renamed the original precisionChange as precisionChangeOrig

Fixed incorrect pointer offset bug in copyLane

Added a test and a benchmark for precisionChange

Added a test for reliable update CG
---
 .../ConjugateGradientMultiShiftMixedPrec.h |  11 +-
 .../ConjugateGradientReliableUpdate.h      |  32 ++-
 Grid/lattice/Lattice_transfer.h            | 127 +++++++++++-
 Grid/tensors/Tensor_extract_merge.h        |   6 +-
 benchmarks/Benchmark_prec_change.cc        | 189 ++++++++++++++++++
 tests/core/Test_prec_change.cc             | 124 ++++++++++++
 tests/solver/Test_dwf_relupcg_prec.cc      | 143 +++++++++++++
 7 files changed, 616 insertions(+), 16 deletions(-)
 create mode 100644 benchmarks/Benchmark_prec_change.cc
 create mode 100644 tests/core/Test_prec_change.cc
 create mode 100644 tests/solver/Test_dwf_relupcg_prec.cc

diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
index de1cfe01..a89a1e4a 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -130,6 +130,9 @@ public:
     GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
     GridBase *DoublePrecGrid = src_d.Grid();
 
+    precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
+    precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
+
     ////////////////////////////////////////////////////////////////////////
     // Convenience references to the info stored in "MultiShiftFunction"
     ////////////////////////////////////////////////////////////////////////
@@ -200,10 +203,10 @@ public:
     r_d = p_d;
 
     //MdagM+m[0]
-    precisionChangeFast(p_f,p_d);
+    precisionChange(p_f, p_d, pc_wk_d_to_s);
     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    precisionChangeFast(tmp_d,mmp_f);
+    precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
     tmp_d = tmp_d - mmp_d;
     std::cout << " Testing operators match "< &Linop_f;
   LinearOperatorBase &Linop_d;
   GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter
+  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
   //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
Useful for single/half -> single/single LinearOperatorBase *Linop_fallback; @@ -65,7 +65,9 @@ public: ErrorOnNoConverge(err_on_no_conv), DoFinalCleanup(true), Linop_fallback(NULL) - {}; + { + assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); + }; void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ Linop_fallback = &_Linop_fallback; @@ -116,9 +118,12 @@ public: } //Single prec initialization + precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid()); + FieldF r_f(SinglePrecGrid); r_f.Checkerboard() = r.Checkerboard(); - precisionChange(r_f, r); + precisionChange(r_f, r, pc_wk_dp_to_sp); FieldF psi_f(r_f); psi_f = Zero(); @@ -134,7 +139,8 @@ public: GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; - + GridStopWatch PrecChangeTimer; + SolverTimer.Start(); int k = 0; int l = 0; @@ -173,7 +179,9 @@ public: // Stopping condition if (cp <= rsq) { //Although not written in the paper, I assume that I have to add on the final solution - precisionChange(mmp, psi_f); + PrecChangeTimer.Start(); + precisionChange(mmp, psi_f, pc_wk_sp_to_dp); + PrecChangeTimer.Stop(); psi = psi + mmp; @@ -194,7 +202,10 @@ public: std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < &in, Lattice &out) }); } +//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field) template void precisionChangeFast(Lattice &out, const Lattice &in) { @@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice &out, const Lattice &in) precisionChange(vout,vin,N); }); } -//Convert a Lattice from one precision to another +//Convert a Lattice from one precision to another (original, slow implementation) template -void precisionChange(Lattice &out, const Lattice &in) +void precisionChangeOrig(Lattice &out, const Lattice &in) { assert(out.Grid()->Nd() == in.Grid()->Nd()); for(int d=0;dNd();d++){ @@ -1145,6 +1146,128 @@ void precisionChange(Lattice &out, const Lattice &in) }); } +//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls +class precisionChangeWorkspace{ + std::pair* fmap_device; //device pointer + //maintain grids for checking + GridBase* _out_grid; + GridBase* _in_grid; +public: + precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){ + //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device + assert(out_grid->Nd() == in_grid->Nd()); + for(int d=0;dNd();d++){ + assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); + } + int Nsimd_out = out_grid->Nsimd(); + + std::vector out_icorrs(out_grid->Nsimd()); //reuse these + for(int lane=0; lane < out_grid->Nsimd(); lane++) + out_grid->iCoorFromIindex(out_icorrs[lane], lane); + + std::vector > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd + thread_for(out_oidx,out_grid->oSites(),{ + Coordinate out_ocorr; + out_grid->oCoorFromOindex(out_ocorr, out_oidx); + + Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate) + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr); + + //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr); + //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for 
+        //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
+        //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
+        int in_oidx = 0, in_lane = 0;
+        for(int d=0;d_ndimension;d++){
+          in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
+          in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
+        }
+        fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair( in_oidx, in_lane );
+      }
+    });
+
+    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
+    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair);
+    fmap_device = (std::pair*)acceleratorAllocDevice(fmap_bytes);
+    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
+  }
+
+  //Prevent moving or copying
+  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
+  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
+
+  std::pair const* getMap() const{ return fmap_device; }
+
+  void checkGrids(GridBase* out, GridBase* in) const{
+    conformable(out, _out_grid);
+    conformable(in, _in_grid);
+  }
+
+  ~precisionChangeWorkspace(){
+    acceleratorFreeDevice(fmap_device);
+  }
+};
+
+
+//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
+//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
+template
+auto _precisionChangeFastWrap(Lattice &out, const Lattice &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
+  if(out.Grid() == in.Grid()){
+    precisionChangeFast(out,in);
+    return 1;
+  }else{
+    return 0;
+  }
+}
+template
+int _precisionChangeFastWrap(Lattice &out, const Lattice &in, long dummy){ //note long here is intentional; it means the above is preferred if available
+  return 0;
+}
+
+
+//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
+//which contains the mapping data.
+template
+void precisionChange(Lattice &out, const Lattice &in, const precisionChangeWorkspace &workspace){
+  if(_precisionChangeFastWrap(out,in,0)) return;
+
+  static_assert( std::is_same::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  out.Checkerboard() = in.Checkerboard();
+  constexpr int Nsimd_out = VobjOut::Nsimd();
+
+  workspace.checkGrids(out.Grid(),in.Grid());
+  std::pair const* fmap_device = workspace.getMap();
+
+  //Do the copy/precision change
+  autoView( out_v , out, AcceleratorWrite);
+  autoView( in_v , in, AcceleratorRead);
+
+  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
+    std::pair const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
+    for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
+      int in_oidx = fmap_osite[out_lane].first;
+      int in_lane = fmap_osite[out_lane].second;
+      copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
+    }
+  });
+}
+
+//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
+//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
+template
+void precisionChange(Lattice &out, const Lattice &in){
+  if(_precisionChangeFastWrap(out,in,0)) return;
+  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
+  precisionChange(out, in, workspace);
+}
+
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h
index 87572faf..f1407d1f 100644
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -226,7 +226,7 @@ template
 accelerator_inline
 void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 {
-  static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+  static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
 
   typedef typename vobjOut::vector_type ovector_type;
   typedef typename vobjIn::vector_type ivector_type;
@@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
   ovector_type * __restrict__ op = (ovector_type *)&vecOut;
   ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
   for(int w=0;w
+Author: Peter Boyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include
+
+using namespace std;
+using namespace Grid;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int Ls = 12;
+  Coordinate latt4 = GridDefaultLatt();
+
+  GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
+  GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+
+  std::vector seeds4({1,2,3,4});
+  std::vector seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4);
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionD field_d(FGridD), tmp_d(FGridD);
+  random(RNG5,field_d); tmp_d = field_d;
+
+  LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
+  precisionChange(field_d2, field_d); tmp_d2 = field_d2;
+
+  LatticeFermionF field_f(FGridF), tmp_f(FGridF);
+  precisionChange(field_f, field_d); tmp_f = field_f;
+
+  int N = 500;
+
+  double time_ds = 0, time_sd = 0;
+
+  std::cout<double original implementation (fields initially device-resident)" << std::endl;
+  for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
+
+
+  precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
+  precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
+
+  std::cout<double with pregenerated workspace(fields initially device-resident)" << std::endl;
+  time_sd = time_ds = 0;
+  for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
+
+  std::cout<double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
+  time_sd = time_ds = 0;
+  for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
+
+
+  std::cout<double2 (fields initially device-resident)" << std::endl;
+  time_sd = time_ds = 0;
+  for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
+
+
+  std::cout<double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
+  time_sd = time_ds = 0;
+  for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
+
+  Grid_finalize();
+}
diff --git a/tests/core/Test_prec_change.cc b/tests/core/Test_prec_change.cc
new file mode 100644
index 00000000..06b9ae5c
--- /dev/null
+++ b/tests/core/Test_prec_change.cc
@@ -0,0 +1,124 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/core/Test_prec_change.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly
+Author: Peter Boyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include
+
+using namespace std;
+using namespace Grid;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int Ls = 12;
+  Coordinate latt4 = GridDefaultLatt();
+
+  GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
+  GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+
+  std::vector seeds4({1,2,3,4});
+  std::vector seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG RNG5F(FGridF); RNG5F.SeedFixedIntegers(seeds5);
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionD field_d(FGridD), tmp_d(FGridD);
+  random(RNG5,field_d);
+  RealD norm2_field_d = norm2(field_d);
+
+  LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
+  random(RNG5F,field_d2);
+  RealD norm2_field_d2 = norm2(field_d2);
+
+  LatticeFermionF field_f(FGridF);
+
+  //Test original implementation
+  {
+    std::cout << GridLogMessage << "Testing original implementation" << std::endl;
+    field_f = Zero();
+    precisionChangeOrig(field_f,field_d);
+    RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
+    std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
+    tmp_d = Zero();
+    precisionChangeOrig(tmp_d, field_f);
+    Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
"!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation with pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl; + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + field_f = Zero(); + precisionChange(field_f,field_d,wk_dp_to_sp); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f,wk_sp_to_dp); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation without pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl; + field_f = Zero(); + precisionChange(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test fast implementation + { + std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl; + field_f = Zero(); + precisionChangeFast(field_f,field_d2); + RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d2 = Zero(); + precisionChangeFast(tmp_d2, field_f); + Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + std::cout << "Done" << std::endl; + + Grid_finalize(); +} diff --git a/tests/solver/Test_dwf_relupcg_prec.cc b/tests/solver/Test_dwf_relupcg_prec.cc new file mode 100644 index 00000000..1d8c022a --- /dev/null +++ b/tests/solver/Test_dwf_relupcg_prec.cc @@ -0,0 +1,143 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_dwf_relupcg_prec.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include
+
+using namespace std;
+using namespace Grid;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  double relup_delta = 0.2;
+  for(int i=1;i> relup_delta;
+      std::cout << GridLogMessage << "Set reliable update Delta to " << relup_delta << std::endl;
+    }
+  }
+
+  const int Ls=12;
+
+  {
+    GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+    GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
+    GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
+    GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
+
+    std::vector seeds4({1,2,3,4});
+    std::vector seeds5({5,6,7,8});
+    GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+    GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+
+    LatticeFermionD src(FGrid); random(RNG5,src);
+    LatticeFermionD result(FGrid); result=Zero();
+    LatticeGaugeFieldD Umu(UGrid);
+    LatticeGaugeFieldF Umu_f(UGrid_f);
+
+    SU::HotConfiguration(RNG4,Umu);
+
+    precisionChange(Umu_f,Umu);
+
+    RealD mass=0.1;
+    RealD M5=1.8;
+    DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+    DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5);
+
+    LatticeFermionD src_o(FrbGrid);
+    LatticeFermionD result_o(FrbGrid);
+    LatticeFermionD result_o_2(FrbGrid);
+    pickCheckerboard(Odd,src_o,src);
+    result_o.Checkerboard() = Odd;
+    result_o = Zero();
+    result_o_2.Checkerboard() = Odd;
+    result_o_2 = Zero();
+
+    SchurDiagMooeeOperator HermOpEO(Ddwf);
+    SchurDiagMooeeOperator HermOpEO_f(Ddwf_f);
+
+    std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
+    ConjugateGradientReliableUpdate mCG(1e-8, 10000, relup_delta, FrbGrid_f, HermOpEO_f, HermOpEO);
+    double t1,t2,flops;
+    double MdagMsiteflops = 1452; // Mobius (real coeffs)
+    // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
+    double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
+    std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000);
+    for(int i=0;i<1;i++){
+      result_o_2 = Zero();
+      t1=usecond();
+      CG(HermOpEO,src_o,result_o_2);
+      t2=usecond();
+      iters = CG.IterationsToComplete;
+      flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
+      flops+= CGsiteflops*FrbGrid->gSites()*iters;
+
+      std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<
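
Usage note (illustrative sketch, not code from the patch): the intended pattern for the new API introduced above is to build a precisionChangeWorkspace once from the output and input Grids and reuse it for repeated conversions; the workspace-free overload generates a temporary mapping internally, and when source and destination live on the same Grid (e.g. a double2 field) the call dispatches to precisionChangeFast automatically. Field and grid names below (field_d, field_f, FGridD, FGridF) are assumed to be set up as in the tests above.

  // Assumed setup: FGridD is a double-precision Grid, FGridF the matching single-precision Grid
  LatticeFermionD field_d(FGridD);   // double-precision field
  LatticeFermionF field_f(FGridF);   // single-precision field

  // One-off setup: build the device-resident site/lane map; constructor arguments are (out_grid, in_grid)
  precisionChangeWorkspace wk_dp_to_sp(FGridF, FGridD);  // double -> single
  precisionChangeWorkspace wk_sp_to_dp(FGridD, FGridF);  // single -> double

  // Repeated conversions (e.g. inside a solver loop) reuse the precomputed workspace
  precisionChange(field_f, field_d, wk_dp_to_sp);
  precisionChange(field_d, field_f, wk_sp_to_dp);

  // Convenience overload: generates a workspace on-the-fly (slower, but still faster than precisionChangeOrig)
  precisionChange(field_f, field_d);

  // If out and in share the same Grid (e.g. LatticeFermionD2 vs LatticeFermionF),
  // the same call forwards to precisionChangeFast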