mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 13:57:07 +01:00

Merge branch 'develop' into feature/gpu-port

This commit is contained in:
Peter Boyle
2018-12-13 05:11:34 +00:00
647 changed files with 49155 additions and 11160 deletions

View File

@@ -114,7 +114,7 @@ int main (int argc, char ** argv)
{
FGrid->Barrier();
ScidacWriter _ScidacWriter;
ScidacWriter _ScidacWriter(FGrid->IsBoss());
_ScidacWriter.open(file);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Writing out gauge field "<<std::endl;
@@ -144,7 +144,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::stringstream filefn; filefn << filef << "."<< n;
ScidacWriter _ScidacWriter;
ScidacWriter _ScidacWriter(FGrid->IsBoss());
_ScidacWriter.open(filefn.str());
_ScidacWriter.writeScidacFieldRecord(src[n],record);
_ScidacWriter.close();
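For reference, a minimal sketch of the boss-flagged writer pattern above. The names (FGrid, Umu, record, file) are assumed from the surrounding test; the point of the change is that only the boss rank opens and writes the file, so the ranks no longer race on a single output stream.

FGrid->Barrier();                                  // synchronise all ranks before I/O
ScidacWriter _ScidacWriter(FGrid->IsBoss());       // I/O performed by the boss rank only
_ScidacWriter.open(file);
_ScidacWriter.writeScidacFieldRecord(Umu,record);  // field plus metadata record, as in the test
_ScidacWriter.close();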

View File

@@ -38,6 +38,7 @@ int main (int argc, char ** argv)
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
double stp=1.0e-5;
const int Ls=4;
Grid_init(&argc,&argv);
@@ -197,7 +198,7 @@ int main (int argc, char ** argv)
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
ConjugateGradient<FermionField> CG((1.0e-2),10000);
ConjugateGradient<FermionField> CG((stp),10000);
s_res = Zero();
CG(HermOp,s_src,s_res);
@@ -227,5 +228,11 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<<" resid["<<n<<"] "<< norm2(tmp)/norm2(src[n])<<std::endl;
}
for(int s=0;s<nrhs;s++) result[s]=zero;
int blockDim = 0;//not used for BlockCGVec
BlockConjugateGradient<FermionField> BCGV (BlockCGVec,blockDim,stp,10000);
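// assumed semantics: report the block residual every 10 iterations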
BCGV.PrintInterval=10;
BCGV(HermOpCk,src,result);
Grid_finalize();
}

View File

@@ -0,0 +1,220 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename MobiusFermionR::FermionField FermionField;
typedef typename MobiusFermionR::ComplexField ComplexField;
typename MobiusFermionR::ImplParams params;
const int Ls=12;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (mpi_layout.size(),1);
std::vector<int> split_coor (mpi_layout.size(),1);
std::vector<int> split_dim (mpi_layout.size(),1);
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
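// antiperiodic fermion boundary condition in time, periodic in the spatial directions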
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators
/////////////////////////////////////////////
for(int i=0;i<argc;i++){
if(std::string(argv[i]) == "--split"){
for(int k=0;k<mpi_layout.size();k++){
std::stringstream ss;
ss << argv[i+1+k];
ss >> mpi_split[k];
}
break;
}
}
double stp = 1.e-8;
int nrhs = 1;
int me;
for(int i=0;i<mpi_layout.size();i++){
// split_dim[i] = (mpi_layout[i]/mpi_split[i]);
nrhs *= (mpi_layout[i]/mpi_split[i]);
// split_coor[i] = FGrid._processor_coor[i]/mpi_split[i];
}
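// nrhs = number of split sub-grids = prod_i mpi_layout[i]/mpi_split[i].
// Hypothetical example: mpi_layout {2,2,2,2} split as {1,1,1,1} yields 16
// single-rank communicators, i.e. 16 right-hand sides solved concurrently.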
std::cout << GridLogMessage << "Creating split grids " <<std::endl;
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
mpi_split,
*UGrid,me);
std::cout << GridLogMessage <<"Creating split ferm grids " <<std::endl;
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
std::cout << GridLogMessage <<"Creating split rb grids " <<std::endl;
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
std::cout << GridLogMessage <<"Creating split ferm rb grids " <<std::endl;
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);
std::cout << GridLogMessage << "Made the grids"<<std::endl;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG5,src[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
if(0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " "<<file<<" successfully read" <<std::endl;
} else {
GridParallelRNG pRNG(UGrid );
std::cout << GridLogMessage << "Intialising 4D RNG "<<std::endl;
pRNG.SeedFixedIntegers(seeds);
std::cout << GridLogMessage << "Intialised 4D RNG "<<std::endl;
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
std::cout << " Site zero "<< Umu._odata[0] <<std::endl;
}
/////////////////
// MPI only sends
/////////////////
LatticeGaugeField s_Umu(SGrid);
FermionField s_src(SFGrid);
FermionField s_tmp(SFGrid);
FermionField s_res(SFGrid);
std::cout << GridLogMessage << "Made the split grid fields"<<std::endl;
///////////////////////////////////////////////////////////////
// split the source out using MPI instead of I/O
///////////////////////////////////////////////////////////////
Grid_split (Umu,s_Umu);
Grid_split (src,s_src);
std::cout << GridLogMessage << " split rank " <<me << " s_src "<<norm2(s_src)<<std::endl;
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
// RealD mass=0.00107;
RealD mass=0.1;
RealD M5=1.8;
RealD mobius_factor=32./12.;
RealD mobius_b=0.5*(mobius_factor+1.);
RealD mobius_c=0.5*(mobius_factor-1.);
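// Mobius kernel scale alpha = mobius_factor: with b=0.5*(alpha+1) and
// c=0.5*(alpha-1), b-c = 1 (Shamir-normalised kernel) and b+c = 32/12 ~ 2.67.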
MobiusFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,mobius_b,mobius_c,params);
MobiusFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,mobius_b,mobius_c,params);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<MobiusFermionR,FermionField> HermOp(Ddwf);
MdagMLinearOperator<MobiusFermionR,FermionField> HermOpCk(Dchk);
ConjugateGradient<FermionField> CG((stp),100000);
s_res = zero;
CG(HermOp,s_src,s_res);
std::cout << GridLogMessage << " split residual norm "<<norm2(s_res)<<std::endl;
/////////////////////////////////////////////////////////////
// Report how long they all took
/////////////////////////////////////////////////////////////
std::vector<uint32_t> iterations(nrhs,0);
iterations[me] = CG.IterationsToComplete;
for(int n=0;n<nrhs;n++){
UGrid->GlobalSum(iterations[n]);
std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
}
/////////////////////////////////////////////////////////////
// Gather and residual check on the results
/////////////////////////////////////////////////////////////
std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
Grid_unsplit(result,s_res);
std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
for(int n=0;n<nrhs;n++){
std::cout << GridLogMessage<< " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
std::cout << GridLogMessage<<" resid["<<n<<"] "<< std::sqrt(norm2(tmp)/norm2(src[n]))<<std::endl;
}
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
{
BCGV(HermOpCk,src,result);
}
Grid_finalize();
}

View File

@@ -0,0 +1,144 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename DomainWallFermionR::FermionField FermionField;
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
const int Ls=16;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
double stp = 1.e-8;
int nrhs = 2;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG5,src[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
int conf = 0;
if(conf==0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
} else if (conf==1){
GridParallelRNG pRNG(UGrid );
pRNG.SeedFixedIntegers(seeds);
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
} else {
SU3::ColdConfiguration(Umu);
std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
}
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
ConjugateGradient<FermionField> CG((stp),100000);
for(int rhs=0;rhs<1;rhs++){
result[rhs] = zero;
CG(HermOp,src[rhs],result[rhs]);
}
for(int rhs=0;rhs<1;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
{
BCGV(HermOp,src,result);
}
for(int rhs=0;rhs<nrhs;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
Grid_finalize();
}

View File

@@ -0,0 +1,148 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename DomainWallFermionR::FermionField FermionField;
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
const int Ls=16;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
double stp = 1.e-8;
int nrhs = 2;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src4(nrhs,UGrid);
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG4(UGrid); pRNG4.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG4,src4[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
int conf = 0;
if(conf==0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
} else if (conf==1){
GridParallelRNG pRNG(UGrid );
pRNG.SeedFixedIntegers(seeds);
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
} else {
SU3::ColdConfiguration(Umu);
std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
}
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
for(int s=0;s<nrhs;s++) {
Ddwf.ImportPhysicalFermionSource(src4[s],src[s]);
}
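// ImportPhysicalFermionSource (assumed role): promotes each 4d physical source
// to a 5d domain-wall source, with the two chiral halves placed on opposite
// s-walls of the fifth dimension.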
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
ConjugateGradient<FermionField> CG((stp),100000);
for(int rhs=0;rhs<1;rhs++){
result[rhs] = zero;
// CG(HermOp,src[rhs],result[rhs]);
}
for(int rhs=0;rhs<1;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
{
BCGV(HermOp,src,result);
}
for(int rhs=0;rhs<nrhs;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
Grid_finalize();
}

View File

@@ -0,0 +1,147 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_mrhs_cg.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
typedef typename DomainWallFermionR::FermionField FermionField;
typedef typename DomainWallFermionR::ComplexField ComplexField;
typename DomainWallFermionR::ImplParams params;
const int Ls=16;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<ComplexD> boundary_phases(Nd,1.);
boundary_phases[Nd-1]=-1.;
params.boundary_phases = boundary_phases;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
double stp = 1.e-8;
int nrhs = 4;
///////////////////////////////////////////////
// Set up the problem as a 4d spreadout job
///////////////////////////////////////////////
std::vector<int> seeds({1,2,3,4});
std::vector<FermionField> src(nrhs,FGrid);
std::vector<FermionField> src_chk(nrhs,FGrid);
std::vector<FermionField> result(nrhs,FGrid);
FermionField tmp(FGrid);
std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
for(int s=0;s<nrhs;s++) result[s]=zero;
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
for(int s=0;s<nrhs;s++) {
random(pRNG5,src[s]);
std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
}
std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
LatticeGaugeField Umu(UGrid);
int conf = 2;
if(conf==0) {
FieldMetaData header;
std::string file("./lat.in");
NerscIO::readConfiguration(Umu,header,file);
std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
} else if (conf==1){
GridParallelRNG pRNG(UGrid );
pRNG.SeedFixedIntegers(seeds);
SU3::HotConfiguration(pRNG,Umu);
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
} else {
SU3::ColdConfiguration(Umu);
std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
}
///////////////////////////////////////////////////////////////
// Set up N-solvers as trivially parallel
///////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Building the solvers"<<std::endl;
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
ConjugateGradient<FermionField> CG((stp),100000);
for(int rhs=0;rhs<1;rhs++){
result[rhs] = zero;
CG(HermOp,src[rhs],result[rhs]);
}
for(int rhs=0;rhs<1;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
/////////////////////////////////////////////////////////////
// Try block CG
/////////////////////////////////////////////////////////////
int blockDim = 0;//not used for BlockCGVec
for(int s=0;s<nrhs;s++){
result[s]=zero;
}
{
BlockConjugateGradient<FermionField> BCGV (BlockCGrQVec,blockDim,stp,100000);
SchurRedBlackDiagTwoSolve<FermionField> SchurSolver(BCGV);
SchurSolver(Ddwf,src,result);
}
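// Unlike the unpreconditioned MdagM solves above, SchurRedBlackDiagTwoSolve
// runs the block solver on one checkerboard of the red-black (Schur) system
// and reconstructs the full solution afterwards.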
for(int rhs=0;rhs<nrhs;rhs++){
std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
}
Grid_finalize();
}

View File

@@ -67,34 +67,70 @@ int main (int argc, char ** argv)
GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds);
GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds);
FermionField src(FGrid); random(pRNG5,src);
FermionField src(FGrid);
FermionField tt(FGrid);
#if 1
random(pRNG5,src);
#else
src=zero;
ComplexField coor(FGrid);
LatticeCoordinate(coor,0);
for(int ss=0;ss<FGrid->oSites();ss++){
src._odata[ss]()()(0)=coor._odata[ss]()()();
}
LatticeCoordinate(coor,1);
for(int ss=0;ss<FGrid->oSites();ss++){
src._odata[ss]()()(0)+=coor._odata[ss]()()();
}
#endif
FermionField src_o(FrbGrid); pickCheckerboard(Odd,src_o,src);
FermionField result_o(FrbGrid); result_o=Zero();
RealD nrm = norm2(src);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
RealD mass=0.003;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
int blockDim = 0;
BlockConjugateGradient<FermionField> BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> BCG (BlockCG,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> BCG (BlockCGrQ,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> BCGv (BlockCGrQVec,blockDim,1.0e-8,10000);
BlockConjugateGradient<FermionField> mCG (CGmultiRHS,blockDim,1.0e-8,10000);
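// Solver variants timed below (assumed semantics): BlockCGrQ is thin-QR
// stabilised block CG over the Ls block index of one 5d field; BlockCGrQVec is
// the same algorithm acting on a std::vector of fields; CGmultiRHS runs an
// independent CG per right-hand side with no block coupling.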
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d);
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
FermionField result4d_o(UrbGrid);
result4d_o=Zero();
CG(HermOp4d,src4d_o,result4d_o);
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // 1146 flops per site for one Deo+Doe application; breakdown below
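// Breakdown: one SU(3) matrix-vector multiply is 3*(6+8+8)=66 flops (each of
// the 3 rows: one complex multiply plus two complex multiply-adds); 16 matvecs
// (8 one-link + 8 Naik directions) give 1056, and summing the 16 colour
// vectors costs 15*3*2=90, hence 1146 flops per site.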
{
double t1=usecond();
CG(HermOp4d,src4d_o,result4d_o);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp4d.Report();
}
Ds4d.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -103,7 +139,17 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=Zero();
CG(HermOp,src_o,result_o);
{
double t1=usecond();
CG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=CG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -112,7 +158,37 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=Zero();
mCG(HermOp,src_o,result_o);
{
double t1=usecond();
mCG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=mCG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling Block CGrQ for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=Zero();
{
double t1=usecond();
BCGrQ(HermOp,src_o,result_o);
double t2=usecond();
double ncall=BCGrQ.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -120,11 +196,45 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=Zero();
BCGrQ(HermOp,src_o,result_o);
result_o=zero;
{
double t1=usecond();
BCG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=BCG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling BCGvec "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::vector<FermionField> src_v (Ls,UrbGrid);
std::vector<FermionField> result_v(Ls,UrbGrid);
for(int s=0;s<Ls;s++) result_v[s] = zero;
for(int s=0;s<Ls;s++) {
FermionField src4(UGrid);
ExtractSlice(src4,src,s,0);
pickCheckerboard(Odd,src_v[s],src4);
}
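// Each s-slice of the 5d source becomes an independent 4d right-hand side:
// ExtractSlice pulls slice s along dimension 0 (the s-direction of the 5d
// grid) and pickCheckerboard keeps its odd sites for the checkerboarded solve.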
{
double t1=usecond();
BCGv(HermOp4d,src_v,result_v);
double t2=usecond();
double ncall=BCGv.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
// HermOp4d.Report();
}
Grid_finalize();
}

View File

@@ -74,7 +74,16 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
RealD mass=0.003;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
@@ -86,11 +95,23 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d);
FermionField result4d(UGrid); result4d=Zero();
CG(HermOp4d,src4d,result4d);
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // 16 SU(3) matvecs (16*66) + 15 vector adds (90) = 1146 flops per site
{
double t1=usecond();
CG(HermOp4d,src4d,result4d);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -98,9 +119,18 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=Zero();
{
Ds.ZeroCounters();
double t1=usecond();
CG(HermOp,src,result);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
Ds.Report();
}
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -108,7 +138,16 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=Zero();
Ds.ZeroCounters();
{
double t1=usecond();
mCG(HermOp,src,result);
double t2=usecond();
double ncall=mCG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -117,7 +156,16 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=Zero();
Ds.ZeroCounters();
{
double t1=usecond();
BCGrQ(HermOp,src,result);
double t2=usecond();
double ncall=BCGrQ.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

View File

@@ -71,7 +71,10 @@ int main (int argc, char ** argv)
}
RealD mass=0.003;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
FermionField res_o(&RBGrid);
FermionField src_o(&RBGrid);
@@ -80,7 +83,19 @@ int main (int argc, char ** argv)
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
double t1=usecond();
CG(HermOpEO,src_o,res_o);
double t2=usecond();
// Schur solver: uses DeoDoe => volume * 1146
double ncall=CG.IterationsToComplete;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 16 SU(3) matvecs (16*66) + 15 vector adds (90) = 1146 flops per site per iteration
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
FermionField tmp(&RBGrid);

View File

@@ -65,7 +65,10 @@ int main (int argc, char ** argv)
FermionField resid(&Grid);
RealD mass=0.1;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);

View File

@@ -73,7 +73,10 @@ int main (int argc, char ** argv)
}
RealD mass=0.1;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-6,10000);

View File

@@ -0,0 +1,121 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_wilson_cg_unprec.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
typename ImprovedStaggeredFermionR::ImplParams params;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
////////////////////////////////////////
// sqrt
////////////////////////////////////////
double lo=0.001;
double hi=1.0;
int precision=64;
int degree=10;
AlgRemez remez(lo,hi,precision);
remez.generateApprox(degree,1,2);
MultiShiftFunction Sqrt(remez,1.0e-6,false);
std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
////////////////////////////////////////////
// Setup staggered
////////////////////////////////////////////
RealD mass=0.003;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
FermionField src(&Grid); random(pRNG,src);
FermionField src_o(&RBGrid);
pickCheckerboard(Odd,src_o,src);
/////////////////////////////////
//Multishift CG
/////////////////////////////////
std::vector<FermionField> result(degree,&RBGrid);
ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
double deodoe_flops=(1205+15*degree)*volume; // estimate: ~1205 flops per site for Dslash plus CG linear algebra, plus 15 per shift
double t1=usecond();
MSCG(HermOpEO,src_o,result);
double t2=usecond();
double ncall=MSCG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
// HermOpEO.Report();
Grid_finalize();
}