diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index de1cfe01..a89a1e4a 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -130,6 +130,9 @@ public: GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); GridBase *DoublePrecGrid = src_d.Grid(); + precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid); + precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid); + //////////////////////////////////////////////////////////////////////// // Convenience references to the info stored in "MultiShiftFunction" //////////////////////////////////////////////////////////////////////// @@ -200,10 +203,10 @@ public: r_d = p_d; //MdagM+m[0] - precisionChangeFast(p_f,p_d); + precisionChange(p_f, p_d, pc_wk_d_to_s); Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) - precisionChangeFast(tmp_d,mmp_f); + precisionChange(tmp_d, mmp_f, pc_wk_s_to_d); Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) tmp_d = tmp_d - mmp_d; std::cout << " Testing operators match "< &Linop_f; LinearOperatorBase &Linop_d; GridBase* SinglePrecGrid; - RealD Delta; //reliable update parameter + RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single LinearOperatorBase *Linop_fallback; @@ -65,7 +65,9 @@ public: ErrorOnNoConverge(err_on_no_conv), DoFinalCleanup(true), Linop_fallback(NULL) - {}; + { + assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); + }; void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ Linop_fallback = &_Linop_fallback; @@ -116,9 +118,12 @@ public: } //Single prec initialization + precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid()); + FieldF r_f(SinglePrecGrid); r_f.Checkerboard() = r.Checkerboard(); - precisionChange(r_f, r); + precisionChange(r_f, r, pc_wk_dp_to_sp); FieldF psi_f(r_f); psi_f = Zero(); @@ -134,7 +139,8 @@ public: GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; - + GridStopWatch PrecChangeTimer; + SolverTimer.Start(); int k = 0; int l = 0; @@ -173,7 +179,9 @@ public: // Stopping condition if (cp <= rsq) { //Although not written in the paper, I assume that I have to add on the final solution - precisionChange(mmp, psi_f); + PrecChangeTimer.Start(); + precisionChange(mmp, psi_f, pc_wk_sp_to_dp); + PrecChangeTimer.Stop(); psi = psi + mmp; @@ -194,7 +202,10 @@ public: std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < &in, Lattice &out) }); } +//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field) template void precisionChangeFast(Lattice &out, const Lattice &in) { @@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice &out, const Lattice &in) precisionChange(vout,vin,N); }); } -//Convert a Lattice from one precision to another +//Convert a Lattice from one precision to another (original, slow implementation) template -void precisionChange(Lattice &out, const Lattice &in) +void precisionChangeOrig(Lattice &out, const Lattice &in) { assert(out.Grid()->Nd() == in.Grid()->Nd()); for(int d=0;dNd();d++){ @@ -1145,6 +1146,128 @@ void precisionChange(Lattice &out, const Lattice &in) }); } +//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls +class precisionChangeWorkspace{ + std::pair* fmap_device; //device pointer + //maintain grids for checking + GridBase* _out_grid; + GridBase* _in_grid; +public: + precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){ + //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device + assert(out_grid->Nd() == in_grid->Nd()); + for(int d=0;dNd();d++){ + assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); + } + int Nsimd_out = out_grid->Nsimd(); + + std::vector out_icorrs(out_grid->Nsimd()); //reuse these + for(int lane=0; lane < out_grid->Nsimd(); lane++) + out_grid->iCoorFromIindex(out_icorrs[lane], lane); + + std::vector > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd + thread_for(out_oidx,out_grid->oSites(),{ + Coordinate out_ocorr; + out_grid->oCoorFromOindex(out_ocorr, out_oidx); + + Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate) + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr); + + //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr); + //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice + //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity + int in_oidx = 0, in_lane = 0; + for(int d=0;d_ndimension;d++){ + in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] ); + in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] ); + } + fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair( in_oidx, in_lane ); + } + }); + + //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines) + size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair); + fmap_device = (std::pair*)acceleratorAllocDevice(fmap_bytes); + acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); + } + + //Prevent moving or copying + precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete; + precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete; + + std::pair const* getMap() const{ return fmap_device; } + + void checkGrids(GridBase* out, GridBase* in) const{ + conformable(out, _out_grid); + conformable(in, _in_grid); + } + + ~precisionChangeWorkspace(){ + acceleratorFreeDevice(fmap_device); + } +}; + + +//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check) +//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery +template +auto _precisionChangeFastWrap(Lattice &out, const Lattice &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){ + if(out.Grid() == in.Grid()){ + precisionChangeFast(out,in); + return 1; + }else{ + return 0; + } +} +template +int _precisionChangeFastWrap(Lattice &out, const Lattice &in, long dummy){ //note long here is intentional; it means the above is preferred if available + return 0; +} + + +//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace +//which contains the mapping data. +template +void precisionChange(Lattice &out, const Lattice &in, const precisionChangeWorkspace &workspace){ + if(_precisionChangeFastWrap(out,in,0)) return; + + static_assert( std::is_same::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + + out.Checkerboard() = in.Checkerboard(); + constexpr int Nsimd_out = VobjOut::Nsimd(); + + workspace.checkGrids(out.Grid(),in.Grid()); + std::pair const* fmap_device = workspace.getMap(); + + //Do the copy/precision change + autoView( out_v , out, AcceleratorWrite); + autoView( in_v , in, AcceleratorRead); + + accelerator_for(out_oidx, out.Grid()->oSites(), 1,{ + std::pair const* fmap_osite = fmap_device + out_oidx*Nsimd_out; + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + int in_oidx = fmap_osite[out_lane].first; + int in_lane = fmap_osite[out_lane].second; + copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane); + } + }); +} + +//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast +//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device +template +void precisionChange(Lattice &out, const Lattice &in){ + if(_precisionChangeFastWrap(out,in,0)) return; + precisionChangeWorkspace workspace(out.Grid(), in.Grid()); + precisionChange(out, in, workspace); +} + + + + //////////////////////////////////////////////////////////////////////////////// // Communicate between grids //////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index 87572faf..f1407d1f 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -226,7 +226,7 @@ template accelerator_inline void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in) { - static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same typedef typename vobjOut::vector_type ovector_type; typedef typename vobjIn::vector_type ivector_type; @@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest ovector_type * __restrict__ op = (ovector_type *)&vecOut; ivector_type * __restrict__ ip = (ivector_type *)&vecIn; for(int w=0;w +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls = 12; + Coordinate latt4 = GridDefaultLatt(); + + GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD); + GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionD field_d(FGridD), tmp_d(FGridD); + random(RNG5,field_d); tmp_d = field_d; + + LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF); + precisionChange(field_d2, field_d); tmp_d2 = field_d2; + + LatticeFermionF field_f(FGridF), tmp_f(FGridF); + precisionChange(field_f, field_d); tmp_f = field_f; + + int N = 500; + + double time_ds = 0, time_sd = 0; + + std::cout<double original implementation (fields initially device-resident)" << std::endl; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + std::cout<double with pregenerated workspace(fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + std::cout<double with workspace generated on-the-fly (fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + std::cout<double2 (fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + std::cout<double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + Grid_finalize(); +} diff --git a/tests/core/Test_prec_change.cc b/tests/core/Test_prec_change.cc new file mode 100644 index 00000000..06b9ae5c --- /dev/null +++ b/tests/core/Test_prec_change.cc @@ -0,0 +1,124 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/core/Test_prec_change.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls = 12; + Coordinate latt4 = GridDefaultLatt(); + + GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD); + GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG5F(FGridF); RNG5F.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionD field_d(FGridD), tmp_d(FGridD); + random(RNG5,field_d); + RealD norm2_field_d = norm2(field_d); + + LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF); + random(RNG5F,field_d2); + RealD norm2_field_d2 = norm2(field_d2); + + LatticeFermionF field_f(FGridF); + + //Test original implementation + { + std::cout << GridLogMessage << "Testing original implementation" << std::endl; + field_f = Zero(); + precisionChangeOrig(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChangeOrig(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation with pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl; + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + field_f = Zero(); + precisionChange(field_f,field_d,wk_dp_to_sp); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f,wk_sp_to_dp); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation without pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl; + field_f = Zero(); + precisionChange(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test fast implementation + { + std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl; + field_f = Zero(); + precisionChangeFast(field_f,field_d2); + RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d2 = Zero(); + precisionChangeFast(tmp_d2, field_f); + Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + std::cout << "Done" << std::endl; + + Grid_finalize(); +} diff --git a/tests/solver/Test_dwf_relupcg_prec.cc b/tests/solver/Test_dwf_relupcg_prec.cc new file mode 100644 index 00000000..1d8c022a --- /dev/null +++ b/tests/solver/Test_dwf_relupcg_prec.cc @@ -0,0 +1,143 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_dwf_relupcg_prec.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + double relup_delta = 0.2; + for(int i=1;i> relup_delta; + std::cout << GridLogMessage << "Set reliable update Delta to " << relup_delta << std::endl; + } + } + + const int Ls=12; + + { + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); + GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); + GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD result(FGrid); result=Zero(); + LatticeGaugeFieldD Umu(UGrid); + LatticeGaugeFieldF Umu_f(UGrid_f); + + SU::HotConfiguration(RNG4,Umu); + + precisionChange(Umu_f,Umu); + + RealD mass=0.1; + RealD M5=1.8; + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5); + + LatticeFermionD src_o(FrbGrid); + LatticeFermionD result_o(FrbGrid); + LatticeFermionD result_o_2(FrbGrid); + pickCheckerboard(Odd,src_o,src); + result_o.Checkerboard() = Odd; + result_o = Zero(); + result_o_2.Checkerboard() = Odd; + result_o_2 = Zero(); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + ConjugateGradientReliableUpdate mCG(1e-8, 10000, relup_delta, FrbGrid_f, HermOpEO_f, HermOpEO); + double t1,t2,flops; + double MdagMsiteflops = 1452; // Mobius (real coeffs) + // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of) + double CGsiteflops = (8+4+8+4+4)*Nc*Ns ; + std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000); + for(int i=0;i<1;i++){ + result_o_2 = Zero(); + t1=usecond(); + CG(HermOpEO,src_o,result_o_2); + t2=usecond(); + iters = CG.IterationsToComplete; + flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; + flops+= CGsiteflops*FrbGrid->gSites()*iters; + + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<