Merge branch 'develop' of github.com:paboyle/Grid into develop

2025-10-13 20:54:43 +01:00 · 2017-12-01 19:44:31 +00:00
parent 2427a21428 28ceacec45
commit 2fd4989029
7 changed files with 190 additions and 166 deletions
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -308,32 +308,34 @@ namespace Grid {
    public:
      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+	GridLogIterative.TimingMode(1);
+	std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
 	n2 = Mpc(in,out);
+	std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
 	ComplexD dot= innerProduct(in,out);
+	std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
 	n1 = real(dot);
      }
      virtual void HermOp(const Field &in, Field &out){
+	std::cout << GridLogIterative << " HermOp "<<std::endl;
 	Mpc(in,out);
      }
      virtual  RealD Mpc      (const Field &in, Field &out) {
 	Field tmp(in._grid);
 	Field tmp2(in._grid);

+	std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
 	_Mat.Mooee(in,out);
 	_Mat.Mooee(out,tmp);
+	std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;

 	_Mat.Meooe(in,out);
 	_Mat.Meooe(out,tmp2);
+	std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;

-	return axpy_norm(out,-1.0,tmp2,tmp);
-#if 0
-	//... much prefer conventional Schur norm
-	_Mat.Meooe(in,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-	_Mat.Mooee(in,out);
-        return axpy_norm(out,-1.0,tmp,out);
-#endif
+	RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
+	std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
+	return nn;
      }
      virtual  RealD MpcDag   (const Field &in, Field &out){
 	return Mpc(in,out);
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -123,11 +123,14 @@ namespace Grid {
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
-
+      
+      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
+
+      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
    
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe MeeInv source_e)
@@ -144,6 +147,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
@@ -152,15 +156,16 @@ namespace Grid {
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
     
+      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
+      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;

      // Verify the unprec residual
      _Matrix.M(out,resid); 
      resid = resid-in;
      RealD ns = norm2(in);
      RealD nr = norm2(resid);
-
      std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }     
  };
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -134,8 +134,18 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) 
 {
  _ndimension = processors.size();
-  assert(_ndimension = parent._ndimension);
-  
+
+  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  std::vector<int> parent_processor_coor(_ndimension,0);
+  std::vector<int> parent_processors    (_ndimension,1);
+
+  // Can make 5d grid from 4d etc...
+  int pad = _ndimension-parent_ndimension;
+  for(int d=0;d<parent_ndimension;d++){
+    parent_processor_coor[pad+d]=parent._processor_coor[d];
+    parent_processors    [pad+d]=parent._processors[d];
+  }
+
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,9 +164,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  std::vector<int> ssize(_ndimension); // coor of split within parent

  for(int d=0;d<_ndimension;d++){
-    ccoor[d] = parent._processor_coor[d] % processors[d];
-    scoor[d] = parent._processor_coor[d] / processors[d];
-    ssize[d] = parent._processors[d]     / processors[d];
+    ccoor[d] = parent_processor_coor[d] % processors[d];
+    scoor[d] = parent_processor_coor[d] / processors[d];
+    ssize[d] = parent_processors[d]     / processors[d];
  }
  int crank;  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  // Mpi uses the reverse Lexico convention to us
@@ -166,38 +176,36 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  MPI_Comm comm_split;
  if ( Nchild > 1 ) { 

-    /*
-    std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
-    std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
-    for(int d=0;d<parent._processors.size();d++)  std::cout << parent._processors[d] << " ";
-    std::cout<<std::endl;
-
-    std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
-    for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
-    std::cout<<std::endl;
-
-    std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< _ndimension <<"]    ";
-    for(int d=0;d<processors.size();d++)  std::cout << parent._processor_coor[d] << " ";
-    std::cout<<std::endl;
-
-    std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
-    for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
-    std::cout<<std::endl;
-
-    std::cout << GridLogMessage<<" new coor ["<< _ndimension <<"]    ";
-    for(int d=0;d<processors.size();d++)  std::cout << parent._processor_coor[d] << " ";
-    std::cout<<std::endl;
-    */
+    if(0){
+      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
+      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
+      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
+      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
+      std::cout<<std::endl;
+    }

    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Declare victory
    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    /*
-    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
-	      << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
-    */
+    //    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
+    //	      << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
  } else {
    comm_split=parent.communicator;
    srank = 0;
@@ -207,6 +215,17 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // Set up from the new split communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  InitFromMPICommunicator(processors,comm_split);
+
+  if(0){ 
+    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
+    for(int d=0;d<processors.size();d++){
+      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
+    }
+  }
+  for(int d=0;d<processors.size();d++){
+    assert(_processor_coor[d] == ccoor[d] );
+  }
+
 }

 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -231,7 +250,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

-  if ( communicator_base != communicator_world ) {
+  if ( 0 && (communicator_base != communicator_world) ) {
    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
    
    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -606,7 +606,7 @@ CartesianCommunicator::~CartesianCommunicator()
  MPI_Finalized(&MPI_is_finalised);
  if (communicator && !MPI_is_finalised) {
    MPI_Comm_free(&communicator);
-    for(int i=0;i<  communicator_halo.size();i++){
+    for(int i=0;i<communicator_halo.size();i++){
      MPI_Comm_free(&communicator_halo[i]);
    }
  }  
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -50,26 +50,22 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
-    int ssh=0;
-    //parallel_for
-    for(int ss=0;ss<full._grid->oSites();ss++){
-      std::vector<int> coor;
+
+    parallel_for(int ss=0;ss<full._grid->oSites();ss++){
      int cbos;
-      
+      std::vector<int> coor;
      full._grid->oCoorFromOindex(coor,ss);
      cbos=half._grid->CheckerBoard(coor);
      
      if (cbos==cb) {
+	int ssh=half._grid->oIndex(coor);
 	half._odata[ssh] = full._odata[ss];
-	ssh++;
      }
    }
  }
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
-    int ssh=0;
-    //parallel_for
-    for(int ss=0;ss<full._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;

@@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
      cbos=half._grid->CheckerBoard(coor);
      
      if (cbos==cb) {
+	int ssh=half._grid->oIndex(coor);
 	full._odata[ss]=half._odata[ssh];
-	ssh++;
      }
    }
  }
@@ -698,30 +694,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
-//
-// All to all plan
-//
-// Subvolume on fine grid is v.    Vectors a,b,c,d 
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// SIMPLEST CASE:
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Mesh of nodes (2) ; subdivide to  1 subdivisions
-//
-// Lex ord:   
-//          N0 va0 vb0  N1 va1 vb1 
-//
-// For each dimension do an all to all
-//
-// full AllToAll(0)
-//          N0 va0 va1    N1 vb0 vb1
-//
-// REARRANGE
-//          N0 va01       N1 vb01
-//
-// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
-// NB: Easiest to programme if keep in lex order.
-//
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // SIMPLE CASE:
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -755,9 +727,17 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 //
 // Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
 // NB: Easiest to programme if keep in lex order.
-//
-/////////////////////////////////////////////////////////
-
+/*
+ *  Let chunk = (fvol*nvec)/sP be size of a chunk.         ( Divide lexico vol * nvec into fP/sP = M chunks )
+ *  
+ *  2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
+ * 
+ *     node 0     1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1)..   data chunk x M x sP = fL / sP * M * sP = fL * M growth
+ *     node 1     1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
+ *     node 2     1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
+ *     node 3     1st chunk of node 3M..(4M-1); 2nd chunk of node 3M..(4M-1)..
+ *  etc...
+ */
 template<class Vobj>
 void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 {
@@ -816,57 +796,58 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)

  int nvec = nvector; // Counts down to 1 as we collapse dims
  std::vector<int> ldims = full_grid->_ldimensions;
-  std::vector<int> lcoor(ndim);

  for(int d=ndim-1;d>=0;d--){

    if ( ratio[d] != 1 ) {

      full_grid ->AllToAll(d,alldata,tmpdata);
-      //      std::cout << GridLogMessage << "Grid_split: dim " <<d<<" ratio "<<ratio[d]<<" nvec "<<nvec<<" procs "<<split_grid->_processors[d]<<std::endl;
-      //      for(int v=0;v<nvec;v++){
-      //	std::cout << "Grid_split: alldata["<<v<<"] " << alldata[v] <<std::endl;
-      //	std::cout << "Grid_split: tmpdata["<<v<<"] " << tmpdata[v] <<std::endl;
-      //      }
-      //////////////////////////////////////////
-      //Local volume for this dimension is expanded by ratio of processor extents
-      // Number of vectors is decreased by same factor
-      // Rearrange to lexico for bigger volume
-      //////////////////////////////////////////
-      nvec    /= ratio[d];
+      if ( split_grid->_processors[d] > 1 ) {
+	alldata=tmpdata;
+	split_grid->AllToAll(d,alldata,tmpdata);
+      }

-      auto rdims = ldims; rdims[d]  *=   ratio[d];
-      auto rsites= lsites*ratio[d];
-      for(int v=0;v<nvec;v++){
+      auto rdims = ldims; 
+      auto     M = ratio[d];
+      auto rsites= lsites*M;// increases rsites by M
+      nvec      /= M;       // Reduce nvec by subdivision factor
+      rdims[d]  *= M;       // increase local dim by same factor

-	// For loop over each site within old subvol
-	for(int lsite=0;lsite<lsites;lsite++){
+      int sP =   split_grid->_processors[d];
+      int fP =    full_grid->_processors[d];

-	  Lexicographic::CoorFromIndex(lcoor, lsite, ldims);	  
+      int fvol   = lsites;
+      
+      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);

-	  for(int r=0;r<ratio[d];r++){ // ratio*nvec terms
+      // Loop over reordered data post A2A
+      parallel_for(int c=0;c<chunk;c++){
+	for(int m=0;m<M;m++){
+	  for(int s=0;s<sP;s++){
+	    
+	    // addressing; use lexico
+	    int lex_r;
+	    uint64_t lex_c        = c+chunk*m+chunk*M*s;
+	    uint64_t lex_fvol_vec = c+chunk*s;
+	    uint64_t lex_fvol     = lex_fvol_vec%fvol;
+	    uint64_t lex_vec      = lex_fvol_vec/fvol;

-	    auto rcoor = lcoor;	    rcoor[d]  += r*ldims[d];
+	    // which node sets an adder to the coordinate
+	    std::vector<int> coor(ndim);
+	    Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);	  
+	    coor[d] += m*ldims[d];
+	    Lexicographic::IndexFromCoor(coor, lex_r, rdims);	  
+	    lex_r += lex_vec * rsites;

-	    int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);	  
-	    rsite += v * rsites;
+	    // LexicoFind coordinate & vector number within split lattice
+	    alldata[lex_r] = tmpdata[lex_c];

-	    int rmul=nvec*lsites;
-	    int vmul=     lsites;
-	    alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
-	    //	    if ( lsite==0 ) {
-	    //	      std::cout << "Grid_split: grow alldata["<<rsite<<"] " << alldata[rsite] << " <- tmpdata["<< lsite+r*rmul+v*vmul<<"] "<<tmpdata[lsite+r*rmul+v*vmul]  <<std::endl;
-	    //	    }	      
 	  }
 	}
      }
      ldims[d]*= ratio[d];
      lsites  *= ratio[d];

-      if ( split_grid->_processors[d] > 1 ) {
-	tmpdata = alldata;
-	split_grid->AllToAll(d,tmpdata,alldata);
-      }
    }
  }
  vectorizeFromLexOrdArray(alldata,split);    
@@ -937,59 +918,61 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  /////////////////////////////////////////////////////////////////
  // Start from split grid and work towards full grid
  /////////////////////////////////////////////////////////////////
-  std::vector<int> lcoor(ndim);
-  std::vector<int> rcoor(ndim);

  int nvec = 1;
-  lsites = split_grid->lSites();
-  std::vector<int> ldims = split_grid->_ldimensions;
+  uint64_t rsites        = split_grid->lSites();
+  std::vector<int> rdims = split_grid->_ldimensions;

-  //  for(int d=ndim-1;d>=0;d--){
  for(int d=0;d<ndim;d++){

    if ( ratio[d] != 1 ) {

+      auto     M = ratio[d];

-      if ( split_grid->_processors[d] > 1 ) {
-	tmpdata = alldata;
-	split_grid->AllToAll(d,tmpdata,alldata);
-      }
-
-      //////////////////////////////////////////
-      //Local volume for this dimension is expanded by ratio of processor extents
-      // Number of vectors is decreased by same factor
-      // Rearrange to lexico for bigger volume
-      //////////////////////////////////////////
-      auto rsites= lsites/ratio[d];
-      auto rdims = ldims; rdims[d]/=ratio[d];
-
-      for(int v=0;v<nvec;v++){
-
-	// rsite, rcoor --> smaller local volume
-	// lsite, lcoor --> bigger original (single node?) volume
-	// For loop over each site within smaller subvol
-	for(int rsite=0;rsite<rsites;rsite++){
-
-	  Lexicographic::CoorFromIndex(rcoor, rsite, rdims);	  
-	  int lsite;
-
-	  for(int r=0;r<ratio[d];r++){ 
-
-	    lcoor = rcoor; lcoor[d] += r*rdims[d];
-	    Lexicographic::IndexFromCoor(lcoor, lsite, ldims); lsite += v * lsites;
-
-	    int rmul=nvec*rsites;
-	    int vmul=     rsites;
-	    tmpdata[rsite+r*rmul+v*vmul]=alldata[lsite];
-
+      int sP =   split_grid->_processors[d];
+      int fP =    full_grid->_processors[d];
+      
+      auto ldims = rdims;  ldims[d]  /= M;  // Decrease local dims by same factor
+      auto lsites= rsites/M;                // Decreases rsites by M
+      
+      int fvol   = lsites;
+      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
+	
+      {
+	// Loop over reordered data post A2A
+	for(int c=0;c<chunk;c++){
+	  for(int m=0;m<M;m++){
+	    for(int s=0;s<sP;s++){
+	      
+	      // addressing; use lexico
+	      int lex_r;
+	      uint64_t lex_c = c+chunk*m+chunk*M*s;
+	      uint64_t lex_fvol_vec = c+chunk*s;
+	      uint64_t lex_fvol     = lex_fvol_vec%fvol;
+	      uint64_t lex_vec      = lex_fvol_vec/fvol;
+	      
+	      // which node sets an adder to the coordinate
+	      std::vector<int> coor(ndim);
+	      Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);	  
+	      coor[d] += m*ldims[d];
+	      Lexicographic::IndexFromCoor(coor, lex_r, rdims);	  
+	      lex_r += lex_vec * rsites;
+	      
+	      // LexicoFind coordinate & vector number within split lattice
+	      tmpdata[lex_c] = alldata[lex_r];
+	    }
 	  }
 	}
      }
-      nvec   *= ratio[d];
-      ldims[d]=rdims[d];
-      lsites  =rsites;

+      if ( split_grid->_processors[d] > 1 ) {
+	split_grid->AllToAll(d,tmpdata,alldata);
+	tmpdata=alldata;
+      }
      full_grid ->AllToAll(d,tmpdata,alldata);
+      rdims[d]/= M;
+      rsites  /= M;
+      nvec    *= M;       // Increase nvec by subdivision factor
    }
  }

@@ -997,12 +980,12 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  for(int v=0;v<nvector;v++){
    assert(v<full.size());
    parallel_for(int site=0;site<lsites;site++){
+      assert(v*lsites+site < alldata.size());
      scalardata[site] = alldata[v*lsites+site];
    }
    vectorizeFromLexOrdArray(scalardata,full[v]);    
  }
 }

- 
 }
 #endif
--- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc
+++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc
@@ -95,7 +95,7 @@ int main (int argc, char ** argv)
  FermionField tmp(FGrid);

  for(int s=0;s<nrhs;s++) result[s]=zero;
-#undef LEXICO_TEST
+#define LEXICO_TEST
 #ifdef LEXICO_TEST
  {
    LatticeFermion lex(FGrid);  lex = zero;
@@ -121,12 +121,12 @@ int main (int argc, char ** argv)
    random(pRNG5,src[s]);
    tmp = 100.0*s;
    src[s] = (src[s] * 0.1) + tmp;
-    std::cout << " src ]"<<s<<"] "<<norm2(src[s])<<std::endl;
+    std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
  }
 #endif

  for(int n =0 ; n< nrhs ; n++) { 
-    std::cout << " src"<<n<<"\n"<< src[n] <<std::endl;
+    //    std::cout << " src"<<n<<"\n"<< src[n] <<std::endl;
  }

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
@@ -144,8 +144,8 @@ int main (int argc, char ** argv)
  ///////////////////////////////////////////////////////////////
  Grid_split  (Umu,s_Umu);
  Grid_split  (src,s_src);
-  std::cout << " split rank  " <<me << " s_src "<<norm2(s_src)<<std::endl;
-  std::cout << " s_src\n "<< s_src <<std::endl;
+  std::cout << GridLogMessage << " split rank  " <<me << " s_src "<<norm2(s_src)<<std::endl;
+  //  std::cout << " s_src\n "<< s_src <<std::endl;

 #ifdef LEXICO_TEST
  FermionField s_src_tmp(SFGrid);
@@ -168,11 +168,12 @@ int main (int argc, char ** argv)
    s_src_tmp = s_src_tmp + ftmp;
  }
  s_src_diff = s_src_tmp - s_src;
-  std::cout << " s_src_diff " << norm2(s_src_diff)<<std::endl;
+  std::cout << GridLogMessage <<" LEXICO test:  s_src_diff " << norm2(s_src_diff)<<std::endl;

-  std::cout << " s_src \n" << s_src << std::endl;
-  std::cout << " s_src_tmp \n" << s_src_tmp << std::endl;
-  std::cout << " s_src_diff \n" << s_src_diff << std::endl;
+  //  std::cout << " s_src \n" << s_src << std::endl;
+  //  std::cout << " s_src_tmp \n" << s_src_tmp << std::endl;
+  //  std::cout << " s_src_diff \n" << s_src_diff << std::endl;
+  //  exit(0);
 #endif

  ///////////////////////////////////////////////////////////////
@@ -189,11 +190,11 @@ int main (int argc, char ** argv)

  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
-  ConjugateGradient<FermionField> CG((1.0e-5),10000);
+  ConjugateGradient<FermionField> CG((1.0e-2),10000);
  s_res = zero;
  CG(HermOp,s_src,s_res);

-  std::cout << " s_res norm "<<norm2(s_res)<<std::endl;
+  std::cout << GridLogMessage << " split residual norm "<<norm2(s_res)<<std::endl;
  /////////////////////////////////////////////////////////////
  // Report how long they all took
  /////////////////////////////////////////////////////////////
@@ -214,7 +215,7 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
  for(int n=0;n<nrhs;n++){
-    std::cout << " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
+    std::cout << GridLogMessage<< " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
    std::cout << GridLogMessage<<" resid["<<n<<"]  "<< norm2(tmp)/norm2(src[n])<<std::endl;
  }
--- a/tests/solver/Test_staggered_cg_schur.cc
+++ b/tests/solver/Test_staggered_cg_schur.cc
@@ -70,7 +70,21 @@ int main (int argc, char ** argv)
  ConjugateGradient<FermionField> CG(1.0e-8,10000);
  SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);

+  double volume=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+  double t1=usecond();
  SchurSolver(Ds,src,result);
+  double t2=usecond();
+
+  // Schur solver: uses DeoDoe => volume * 1146
+  double ncall=CG.IterationsToComplete;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146
+
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flop/s  =   "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
  
  Grid_finalize();
 }