From 2db7e6f8ab20da54cbe4c42b9d8889e003cafc9c Mon Sep 17 00:00:00 2001
From: Yong-Chull Jang <integration.field@gmail.com>
Date: Tue, 24 Mar 2020 01:03:24 -0400
Subject: [PATCH 1/5] merge manually Block Lanczos files from Chulwoo's update
 (last state = commit 731a05 + untracked files) to develop branch; namespace
 QCD is removed; FIXME: multiple starting vectors result in nan after initial
 orthogonalization

---
 .../ImplicitlyRestartedBlockLanczos.h         | 1348 +++++++++++++++++
 Grid/util/Init.cc                             |    7 +
 Grid/util/Init.h                              |    4 +-
 tests/lanczos/Test_dwf_block_lanczos.cc       |  398 +++++
 4 files changed, 1756 insertions(+), 1 deletion(-)
 create mode 100644 Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
 create mode 100644 tests/lanczos/Test_dwf_block_lanczos.cc
diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
new file mode 100644
index 00000000..e3afe43c
--- /dev/null
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
@@ -0,0 +1,1348 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung
+Author: Yong-Chull Jang <ypj@quark.phy.bnl.gov> 
+Author: Guido Cossu
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_IRBL_H
+#define GRID_IRBL_H
+
+#include <string.h> //memset
+#ifdef USE_LAPACK
+#include <mkl_lapack.h>
+#endif
+
+#undef USE_LAPACK // NOTE: from here on every `#ifdef USE_LAPACK` section in this header is compiled out
+#define Glog std::cout << GridLogMessage  // logging shorthand used throughout this header
+
+namespace Grid {
+
+////////////////////////////////////////////////////////////////////////////////
+// Helper class for sorting the evalues AND evectors by Field
+// Use pointer swizzle on vectors SHOULD GET RID OF IT SOON!
+////////////////////////////////////////////////////////////////////////////////
+template<class Field>
+class SortEigen {
+ private:
+  typedef std::pair<RealD,Field const*> EigenPair; // (eigenvalue, pointer to its vector)
+  // Both comparators order by DESCENDING value, so a partial sort puts the largest first.
+  static bool less_lmd(RealD left,RealD right){ return left > right; }
+  // Takes const references: a sort comparator must be callable on const
+  // elements and must never modify them.
+  static bool less_pair(const EigenPair& left,const EigenPair& right){
+    return left.first > right.first;
+  }
+ public:
+  // Reorder the eigenpairs so that the N largest values occupy lmd[0..N) in
+  // descending order, with evec[0..N) permuted to match.  Entries at index
+  // >= N are left untouched.  Requires 0 <= N <= lmd.size() <= evec.size().
+  void push(std::vector<RealD>& lmd,std::vector<Field>& evec,int N) {
+    const size_t Ntot = lmd.size();
+    assert( N >= 0 && size_t(N) <= Ntot && Ntot <= evec.size() );
+    if ( Ntot == 0 ) return; // nothing to sort; also avoids touching evec[0]
+    ////////////////////////////////////////////////////////////////////////
+    // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
+    //    : The vector reorder should be done by pointer swizzle somehow
+    ////////////////////////////////////////////////////////////////////////
+    std::vector<Field> cpy(Ntot,evec[0].Grid());
+    for(size_t i=0;i<Ntot;++i) cpy[i] = evec[i];
+    std::vector<EigenPair> emod(Ntot);
+    for(size_t i=0;i<Ntot;++i) emod[i] = EigenPair(lmd[i],&cpy[i]);
+    std::partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
+    for(int i=0;i<N;++i){
+      lmd[i]  = emod[i].first;
+      evec[i] = *(emod[i].second);
+    }
+  }
+  // Values-only variant: the N largest entries, descending, end up in lmd[0..N).
+  void push(std::vector<RealD>& lmd,int N) {
+    assert( N >= 0 && size_t(N) <= lmd.size() );
+    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
+  }
+  // True when |lmd| exceeds |thrs|.
+  bool saturated(RealD lmd, RealD thrs) { return fabs(lmd) > fabs(thrs); }
+};
+
+enum class LanczosType { irbl, rbl }; // selects calc_irbl() or calc_rbl() in ImplicitlyRestartedBlockLanczos::calc()
+
+/////////////////////////////////////////////////////////////
+// Implicitly restarted block lanczos
+/////////////////////////////////////////////////////////////
+template<class Field> 
+class ImplicitlyRestartedBlockLanczos {
+
+private:       
+  
+  std::string cname = std::string("ImplicitlyRestartedBlockLanczos"); // class name, used to build log prefixes
+  int MaxIter;   // Upper bound on the number of restart iterations
+  int Nstop;     // Iteration stops once at least this many vectors have converged
+  int Nu;        // Number of vectors in the unit block
+  int Nk;        // Number of eigenvectors sought
+  int Nm;        // Total number of basis vectors
+  int Nblock_k;    // Nk/Nu
+  int Nblock_m;    // Nm/Nu
+  int Nconv_test_interval; // Stride between the Ritz vectors tested for convergence in calc_rbl()
+  RealD eresid;  // Residual tolerance; a pair is accepted when resid^2 < eresid^2
+  IRLdiagonalisation diagonalisation; // Which dense eigensolver backend to use
+  int split_test; // 0 until blockwiseStep() has compared split and non-split results once, then 1
+  ////////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////////
+           SortEigen<Field> _sort;
+  LinearOperatorBase<Field> &_Linop;  // operator used without the split grid and in the convergence test
+  LinearOperatorBase<Field> &_SLinop; // operator handed to _poly on the split grid
+  OperatorFunction<Field> &_poly;     // polynomial applied to the operator in every block step
+  GridRedBlackCartesian * f_grid;     // grid of the vectors passed to Grid_split / Grid_unsplit
+  GridRedBlackCartesian * sf_grid;    // grid of the split-side input/output fields
+  int mrhs;                           // vectors bundled per Grid_split call; Nu must be a multiple of it
+
+  /////////////////////////
+  // Constructor
+  /////////////////////////
+public:       
+ ImplicitlyRestartedBlockLanczos(LinearOperatorBase<Field> &Linop,  // operator used without the split grid
+                                 LinearOperatorBase<Field> &SLinop, // operator used on the split grid
+                                 GridRedBlackCartesian * FrbGrid,   // grid of the unsplit vectors
+                                 GridRedBlackCartesian * SFrbGrid,  // grid of the split fields
+                                 int _mrhs,  // vectors bundled per split-grid application
+                                 OperatorFunction<Field> & poly,   // polynomial
+                                 int _Nstop, // really sought vecs
+                                 int _Nconv_test_interval, // conv check interval
+                                 int _Nu,    // vecs in the unit block
+                                 int _Nk,    // sought vecs
+                                 int _Nm,    // total vecs
+                                 RealD _eresid, // resid in lmd deficit 
+                                 int _MaxIter,  // Max iterations
+                                 IRLdiagonalisation _diagonalisation = IRLdiagonaliseWithEigen)
+   // Initialisers are listed in member declaration order, which is the order
+   // in which C++ actually runs them (silences -Wreorder, no misleading reads).
+   : MaxIter(_MaxIter), Nstop(_Nstop), Nu(_Nu), Nk(_Nk), Nm(_Nm),
+     Nblock_k(_Nu>0 ? _Nk/_Nu : 0), Nblock_m(_Nu>0 ? _Nm/_Nu : 0), // no divide-by-zero ahead of the assert
+     Nconv_test_interval(_Nconv_test_interval), eresid(_eresid),
+     diagonalisation(_diagonalisation), split_test(0),
+     _Linop(Linop), _SLinop(SLinop), _poly(poly), f_grid(FrbGrid), sf_grid(SFrbGrid), mrhs(_mrhs)
+  { assert( (Nu>0) && (Nk%Nu==0) && (Nm%Nu==0) ); }
+
+  ////////////////////////////////
+  // Helpers
+  ////////////////////////////////
+  // Rescale v in place to unit 2-norm and return the norm it had on entry.
+  // A zero-norm v is not special-cased: the 1.0/0 scale factor is applied
+  // as it stands.
+  // if_print is accepted but unused; the diagnostic that read it is disabled.
+  static RealD normalize(Field& v, int if_print=0) 
+  {
+    const RealD len = sqrt(norm2(v));
+    const RealD scale = 1.0/len;
+    v = v * scale;
+    return len;
+  }
+  
+  // Project out of w its components along evec[0..k), one vector at a time,
+  // then normalise w.  With if_print set, overlaps with
+  // norm(ip)/norm2(w) > 1e-14 are logged before and after each projection.
+  void orthogonalize(Field& w, std::vector<Field>& evec, int k, int if_print=0)
+  {
+    typedef typename Field::scalar_type MyComplex;
+    ComplexD overlap;
+    const bool verbose = (if_print != 0);
+    for(int j=0; j<k; ++j){
+      overlap = innerProduct(evec[j],w);
+      if( verbose && norm(overlap)/norm2(w) > 1e-14 )
+        Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< overlap <<std::endl;
+      w = w - overlap * evec[j];
+      if( !verbose ) continue;
+      overlap = innerProduct(evec[j],w);
+      if( norm(overlap)/norm2(w) > 1e-14 )
+        Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< overlap <<std::endl;
+    }
+    normalize(w,if_print);
+  }
+  void reorthogonalize(Field& w, std::vector<Field>& evec, int k) // orthogonalize() with its diagnostics switched on
+  {
+     orthogonalize(w, evec, k,1); // if_print=1: log overlaps above the 1e-14 threshold
+  }
+
+  // Block variant: for every j in [0,k) subtract from each of w[0.._Nu) its
+  // component along evec[j], then normalise each w[i].  The vectors in w are
+  // NOT orthogonalised against one another here.
+  // if_print is only forwarded to normalize(); the per-overlap diagnostics
+  // of the single-vector overload are compiled out in this one.
+  // NOTE(review): the overlap is held in Field::scalar_type here but in
+  // ComplexD in the single-vector overload -- confirm which one is intended
+  // for single-precision fields.
+  void orthogonalize(std::vector<Field>& w, int _Nu, std::vector<Field>& evec, int k, int if_print=0)
+  {
+    typedef typename Field::scalar_type MyComplex;
+    MyComplex overlap;
+
+    for(int j=0; j<k; ++j){
+      const Field& basis = evec[j];
+      for(int i=0; i<_Nu; ++i){
+        Field& target = w[i];
+        overlap = innerProduct(basis,target);
+        target = target - overlap * basis;
+      }
+    }
+
+    for(int i=0; i<_Nu; ++i){
+      normalize(w[i],if_print);
+    }
+  }
+
+
+#if 0
+void innerProductD (std::vector<ComplexD> &inner, std::vector<Field>& lhs, int llhs, std::vector<Field>& rhs, int lrhs)
+{
+  typedef typename Field:vector_object vobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  GridBase *grid = lhs[0]._grid;
+  assert(grid == rhs[0]._grid;
+  const int pad = 8;
+  int total = llhs*lrhs;
+  assert(inner.size()==total);
+  int sum_size=grid->SumArraySize();
+
+//  std::vector<ComplexD> inner(total);
+  Vector<ComplexD> sumarray(sum_size*pad*total);
+
+  parallel_for(int thr=0;thr<sum_size;thr++){
+    int nwork, mywork, myoff;
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+    
+    std::vector< decltype(innerProductD(lhs[0]._odata[0],rhs[0]._odata[0])) > vinner(total,zero); // private to thread; sub summation
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+    for(int i=0; i<llhs; ++i){
+    for(int j=0; j<lrhs; ++j){
+      vinner[i*k+j] += innerProductD(lhs[i]._odata[ss],rhs[j]._odata[ss]);
+    }}
+    }
+    // All threads sum across SIMD; reduce serial work at end
+    // one write per cacheline with streaming store
+    for(int i=0; i<total; ++i){
+    ComplexD tmp = Reduce(TensorRemove(vinner[i])) ;
+    vstream(sumarray[(i*sum_size+thr)*pad],tmp);
+    }
+  }
+  
+for( int i =0;i<total;i++){
+  inner[i]=0.0;
+  for(int j=0;j<sum_size;j++){
+    inner[i] += sumarray[(i*sum_size+j)*pad];
+  } 
+}
+  for( int i =0;i<total;i++){
+    ComplexD tmp=inner[i];
+    evec[0]._grid->GlobalSum(tmp);
+    inner[i]=tmp;
+  }
+//  return inner;
+}
+#endif
+
+  
+  // Orthogonalise w against the first vector of each of the first k blocks,
+  // i.e. evec[0], evec[Nu], ..., evec[(k-1)*Nu], then normalise w.
+  void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
+  {
+    typedef typename Field::scalar_type MyComplex;
+    for(int blk=0, head=0; blk<k; ++blk, head+=Nu){
+      const MyComplex overlap = innerProduct(evec[head],w);
+      w = w - overlap * evec[head];
+    }
+    normalize(w);
+  }
+  
+  // Entry point: run the eigensolver variant selected by Impl.  Arguments are
+  // forwarded unchanged; an Impl value matching neither enumerator does nothing.
+  void calc(std::vector<RealD>& eval,  
+            std::vector<Field>& evec, 
+            const std::vector<Field>& src, int& Nconv, LanczosType Impl)
+  {
+    if ( Impl == LanczosType::irbl ) {
+      calc_irbl(eval,evec,src,Nconv);
+      return;
+    }
+    if ( Impl == LanczosType::rbl ) {
+      calc_rbl(eval,evec,src,Nconv);
+    }
+  }
+
+  // Implicitly restarted block Lanczos.
+  //  eval, evec : must both have size Nm on entry.  On convergence they are
+  //               resized to Nconv and hold the converged pairs sorted by
+  //               descending eigenvalue; otherwise eval is not written and
+  //               evec keeps the working basis.
+  //  src        : the Nu starting vectors; copied and orthonormalised.
+  //  Nconv      : out, number of Ritz pairs with resid^2 < eresid^2.
+  void calc_irbl(std::vector<RealD>& eval,  
+                 std::vector<Field>& evec, 
+                 const std::vector<Field>& src, int& Nconv)
+  {
+    std::string fname = std::string(cname+"::calc_irbl()"); 
+    GridBase *grid = evec[0].Grid();
+    // Was `assert( Nu = src.size() )`: an assignment that overwrote Nu instead of checking it.
+    assert( Nu == int(src.size()) );
+    assert(grid == src[0].Grid());
+    Glog << std::string(74,'*') << std::endl;
+    Glog << fname + " starting iteration 0 /  "<< MaxIter<< std::endl;
+    Glog << std::string(74,'*') << std::endl;
+    Glog <<" -- seek   Nk    = "<< Nk    <<" vectors"<< std::endl;
+    Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl;
+    Glog <<" -- total  Nm    = "<< Nm    <<" vectors"<< std::endl;
+    Glog <<" -- size of eval = "<< eval.size() << std::endl;
+    Glog <<" -- size of evec = "<< evec.size() << std::endl;
+    if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      Glog << "Diagonalisation is Eigen "<< std::endl;
+#ifdef USE_LAPACK
+    } else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { 
+      Glog << "Diagonalisation is LAPACK "<< std::endl;
+#endif
+    } else {
+      abort();
+    }
+    Glog << std::string(74,'*') << std::endl;
+    assert(Nm == evec.size() && Nm == eval.size());
+
+    std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<std::vector<ComplexD>> lmd2(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<std::vector<ComplexD>> lme2(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<RealD> eval2(Nm);
+    std::vector<RealD> resid(Nk);
+
+    Eigen::MatrixXcd    Qt = Eigen::MatrixXcd::Zero(Nm,Nm);
+    Eigen::MatrixXcd    Q = Eigen::MatrixXcd::Zero(Nm,Nm);
+
+    std::vector<int>   Iconv(Nm);
+    std::vector<Field>  B(Nm,grid); // waste of space replicating
+
+    std::vector<Field> f(Nu,grid);
+    std::vector<Field> f_copy(Nu,grid);
+    Field v(grid);
+    Nconv = 0;
+
+    // set initial vector
+    for (int i=0; i<Nu; ++i) {
+      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
+      evec[i] = src[i];
+      orthogonalize(evec[i],evec,i);
+      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
+    }
+
+    // initial Nblock_k steps
+    for(int b=0; b<Nblock_k; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
+
+    // restarting loop begins
+    int iter;
+    for(iter = 0; iter<MaxIter; ++iter){
+      Glog <<"#Restart iteration = "<< iter << std::endl;
+      // additional (Nblock_m - Nblock_k) steps
+      for(int b=Nblock_k; b<Nblock_m; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
+
+      // getting eigenvalues
+      for(int u=0; u<Nu; ++u){
+        for(int k=0; k<Nm; ++k){
+          lmd2[u][k] = lmd[u][k];
+          lme2[u][k] = lme[u][k];
+        }
+      }
+      Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
+      diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
+      _sort.push(eval2,Nm);
+      Glog << "#Ritz value before shift: "<< std::endl;
+      for(int i=0; i<Nm; ++i){
+        std::cout.precision(13);
+        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+      }
+
+      //----------------------------------------------------------------------
+      if ( Nm>Nk ) {
+        Glog <<" #Apply shifted QR transformations "<<std::endl;
+        int k2 = Nk;
+
+        Eigen::MatrixXcd BTDM = Eigen::MatrixXcd::Identity(Nm,Nm);
+        Q = Eigen::MatrixXcd::Identity(Nm,Nm);
+
+        unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
+
+        for(int ip=Nk; ip<Nm; ++ip){ 
+          shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
+        }
+
+        packHermitBlockTriDiagMatfromEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
+
+        for(int i=0; i<k2; ++i) B[i] = 0.0;
+        for(int j=0; j<k2; ++j){
+          for(int k=0; k<Nm; ++k){
+            B[j].Checkerboard() = evec[k].Checkerboard();
+            B[j] += evec[k]*Q(k,j);
+          }
+        }
+        for(int i=0; i<k2; ++i) evec[i] = B[i];
+
+        // reconstruct initial vector for additional pole space
+        blockwiseStep(lmd,lme,evec,f,f_copy,Nblock_k-1);
+
+        // getting eigenvalues
+        for(int u=0; u<Nu; ++u){
+          for(int k=0; k<Nm; ++k){
+            lmd2[u][k] = lmd[u][k];
+            lme2[u][k] = lme[u][k];
+          }
+        }
+        Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
+        diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
+        _sort.push(eval2,Nk);
+        Glog << "#Ritz value after shift: "<< std::endl;
+        for(int i=0; i<Nk; ++i){
+          std::cout.precision(13);
+          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+          std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+        }
+      }
+      //----------------------------------------------------------------------
+
+      // Convergence test
+      Glog <<" #Convergence test: "<<std::endl;
+      for(int k = 0; k<Nk; ++k) B[k]=0.0;
+      for(int j = 0; j<Nk; ++j){
+        for(int k = 0; k<Nk; ++k){
+          B[j].Checkerboard() = evec[k].Checkerboard();
+          B[j] += evec[k]*Qt(k,j);
+        }
+      }
+
+      Nconv = 0;
+      for(int i=0; i<Nk; ++i){
+
+        _Linop.HermOp(B[i],v);
+        RealD vnum = real(innerProduct(B[i],v)); // HermOp.
+        RealD vden = norm2(B[i]);
+        eval2[i] = vnum/vden;
+        v -= eval2[i]*B[i];
+        RealD vv = norm2(v);
+        resid[i] = vv;
+
+        std::cout.precision(13);
+        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+        std::cout << "eval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i];
+        std::cout << "   resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
+
+        // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+        //if( (vv<eresid*eresid) && (i == Nconv) ){
+        if (vv<eresid*eresid) {
+          Iconv[Nconv] = i;
+          ++Nconv;
+        }
+
+      }  // i-loop end
+
+      Glog <<" #modes converged: "<<Nconv<<std::endl;
+      for(int i=0; i<Nconv; ++i){
+        std::cout.precision(13);
+        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<Iconv[i]<<"] ";
+        std::cout << "eval_conv = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[Iconv[i]];
+        std::cout << "   resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< resid[Iconv[i]]<< std::endl;
+      } 
+
+      if ( Nconv>=Nstop ) break;
+
+    } // end of iter loop
+
+    Glog << std::string(74,'*') << std::endl;
+    if ( Nconv<Nstop ) {
+      Glog << fname + " NOT converged ; Summary :\n";
+    } else {
+      Glog << fname + " CONVERGED ; Summary :\n";
+      // Sort converged eigenpairs.
+      eval.resize(Nconv);
+      evec.resize(Nconv,grid);
+      for(int i=0; i<Nconv; ++i){
+        eval[i] = eval2[Iconv[i]];
+        evec[i] = B[Iconv[i]];
+      }
+      _sort.push(eval,evec,Nconv);
+    }
+    Glog << std::string(74,'*') << std::endl;
+    Glog << " -- Iterations  = "<< iter   << "\n";
+    Glog << " -- Nconv       = "<< Nconv  << "\n";
+    Glog << std::string(74,'*') << std::endl;
+
+  }
+  
+  
+  // Variant without shifted-QR restarts: every iteration extends the basis
+  // by Nblock_p further blocks, starting from Nblock_k blocks.
+  //  eval, evec : size Nm on entry; on convergence resized to Nconv and
+  //               filled with the converged check-point pairs, sorted descending.
+  //  src        : the Nu starting vectors; copied and orthonormalised.
+  //  Nconv      : out, number of leading consecutive converged check points.
+  void calc_rbl(std::vector<RealD>& eval,  
+                 std::vector<Field>& evec, 
+                 const std::vector<Field>& src, int& Nconv)
+  {
+    std::string fname = std::string(cname+"::calc_rbl()"); 
+    GridBase *grid = evec[0].Grid();
+    // Was `assert( Nu = src.size() )`: an assignment that overwrote Nu instead of checking it.
+    assert( Nu == int(src.size()) );
+    assert(grid == src[0].Grid());
+
+    int Np = (Nm-Nk);
+    if (Np > 0 && MaxIter > 1) Np /= MaxIter;
+    int Nblock_p = Np/Nu;
+
+    Glog << std::string(74,'*') << std::endl;
+    Glog << fname + " starting iteration 0 /  "<< MaxIter<< std::endl;
+    Glog << std::string(74,'*') << std::endl;
+    Glog <<" -- seek (min) Nk    = "<< Nk    <<" vectors"<< std::endl;
+    Glog <<" -- seek (inc) Np    = "<< Np <<" vectors"<< std::endl;
+    Glog <<" -- seek (max) Nm    = "<< Nm    <<" vectors"<< std::endl;
+    Glog <<" -- accept Nstop     = "<< Nstop <<" vectors"<< std::endl;
+    Glog <<" -- size of eval     = "<< eval.size() << std::endl;
+    Glog <<" -- size of evec     = "<< evec.size() << std::endl;
+    if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      Glog << "Diagonalisation is Eigen "<< std::endl;
+#ifdef USE_LAPACK
+    } else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { 
+      Glog << "Diagonalisation is LAPACK "<< std::endl;
+#endif
+    } else {
+      abort();
+    }
+    Glog << std::string(74,'*') << std::endl;
+
+    assert(Nm == evec.size() && Nm == eval.size());
+
+    std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<std::vector<ComplexD>> lmd2(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<std::vector<ComplexD>> lme2(Nu,std::vector<ComplexD>(Nm,0.0));  
+    std::vector<RealD> eval2(Nk);
+    std::vector<RealD> resid(Nm);
+
+    Eigen::MatrixXcd    Qt = Eigen::MatrixXcd::Zero(Nm,Nm);
+    Eigen::MatrixXcd    Q = Eigen::MatrixXcd::Zero(Nm,Nm);
+
+    std::vector<int>   Iconv(Nm);
+    std::vector<Field>  B(Nm,grid); // waste of space replicating
+
+    std::vector<Field> f(Nu,grid);
+    std::vector<Field> f_copy(Nu,grid);
+    Field v(grid);
+
+    Nconv = 0;
+
+    // set initial vector
+    for (int i=0; i<Nu; ++i) {
+      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
+      evec[i] = src[i];
+      orthogonalize(evec[i],evec,i);
+      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
+    }
+
+    // initial Nblock_k steps
+    for(int b=0; b<Nblock_k; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
+
+    // restarting loop begins
+    int iter;
+    int Nblock_l, Nblock_r;
+    int Nr;
+    int Nconv_guess = 0;
+
+    for(iter = 0; iter<MaxIter; ++iter){
+      Glog <<"#Restart iteration = "<< iter << std::endl;
+
+      Nblock_l = Nblock_k + iter*Nblock_p;
+      Nblock_r = Nblock_l + Nblock_p;
+      Nr = Nblock_r*Nu;
+      eval2.resize(Nr);
+
+      // additional Nblock_p steps
+      for(int b=Nblock_l; b<Nblock_r; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
+      // getting eigenvalues
+      for(int u=0; u<Nu; ++u){
+        for(int k=0; k<Nr; ++k){
+          lmd2[u][k] = lmd[u][k];
+          lme2[u][k] = lme[u][k];
+        }
+      }
+      Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
+      diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
+      _sort.push(eval2,Nr);
+      Glog << "#Ritz value: "<< std::endl;
+      for(int i=0; i<Nr; ++i){
+        std::cout.precision(13);
+        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+      }
+
+      // Convergence test
+      Glog <<" #Convergence test: "<<std::endl;
+      Nconv = 0;
+      for(int k = 0; k<Nr; ++k) B[k]=0.0;
+      for(int j = 0; j<Nr; j+=Nconv_test_interval){
+        if ( j/Nconv_test_interval == Nconv ) {
+          Glog <<" #rotation for next check point evec" 
+               << std::setw(4)<< std::setiosflags(std::ios_base::right) 
+               << "["<< j <<"]" <<std::endl;
+          for(int k = 0; k<Nr; ++k){
+            B[j].Checkerboard() = evec[k].Checkerboard();
+            B[j] += evec[k]*Qt(k,j);
+          }
+
+          _Linop.HermOp(B[j],v);
+          RealD vnum = real(innerProduct(B[j],v)); // HermOp.
+          RealD vden = norm2(B[j]);
+          eval2[j] = vnum/vden;
+          v -= eval2[j]*B[j];
+          RealD vv = norm2(v);
+          resid[j] = vv;
+
+          std::cout.precision(13);
+          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<j<<"] ";
+          std::cout << "eval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[j];
+          std::cout << "   resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
+
+          // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+          //if( (vv<eresid*eresid) && (i == Nconv) ){
+          if (vv<eresid*eresid) {
+            Iconv[Nconv] = j;
+            ++Nconv;
+          }
+        } else {
+          break;
+        }
+      }  // j-loop end
+
+      Glog <<" #modes converged: "<<Nconv<<std::endl;
+      for(int i=0; i<Nconv; ++i){
+        std::cout.precision(13);
+        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<Iconv[i]<<"] ";
+        std::cout << "eval_conv = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[Iconv[i]];
+        std::cout << "   resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< resid[Iconv[i]]<< std::endl;
+      } 
+
+      Nconv_guess = (Nconv > 0) ? 1 + (Nconv-1)*Nconv_test_interval : 0;
+      if ( Nconv_guess >= Nstop ) break;
+
+    } // end of iter loop
+
+    Glog << std::string(74,'*') << std::endl;
+    if ( Nconv_guess < Nstop ) {
+      Glog << fname + " NOT converged ; Summary :\n";
+    } else {
+      Glog << fname + " CONVERGED ; Summary :\n";
+      // Sort converged eigenpairs.
+      eval.resize(Nconv);
+      evec.resize(Nconv,grid);
+      for(int i=0; i<Nconv; ++i){
+        eval[i] = eval2[Iconv[i]];
+        evec[i] = B[Iconv[i]];
+      }
+      _sort.push(eval,evec,Nconv);
+    }
+    Glog << std::string(74,'*') << std::endl;
+    Glog << " -- Iterations    = "<< iter   << "\n";
+    Glog << " -- Nconv         = "<< Nconv  << "\n";
+    Glog << " -- Nconv (guess) = "<< Nconv_guess  << "\n";
+    Glog << std::string(74,'*') << std::endl;
+
+  }
+
+private:
+  void blockwiseStep(std::vector<std::vector<ComplexD>>& lmd, // in/out: entries lmd[u][k], k in block b, are written (alpha block)
+	             std::vector<std::vector<ComplexD>>& lme, // in/out: entries lme[u][k], k in block b, are written (beta block)
+	             std::vector<Field>& evec, // in/out: block b is read; block b+1 is written unless b is the last block
+	             std::vector<Field>& w, // scratch of size Nu; holds the new orthonormalised block on return
+	             std::vector<Field>& w_copy, // scratch of size Nu
+                     int b) // index of the block being extended; must satisfy b < Nm/Nu
+  {
+    const RealD tiny = 1.0e-20; // not referenced anywhere in this function
+    
+    int Nu = w.size(); // NOTE: Nu and Nm below are locals that shadow the class members of the same name
+    int Nm = evec.size();
+    assert( b < Nm/Nu );
+    // One block Lanczos step: apply _poly to block b, project, orthonormalise, store as block b+1.
+    
+    // converts block index to full indices for an interval [L,R)
+    int L = Nu*b;
+    int R = Nu*(b+1);
+
+    Real beta; // not used (only mentioned in a commented-out line further down)
+
+    Glog << "Using split grid"<< std::endl;
+   // The block is processed mrhs vectors at a time, so Nu must be a multiple of mrhs.
+   assert((Nu%mrhs)==0);
+   std::vector<Field>   in(mrhs,f_grid);
+     
+   Field s_in(sf_grid);
+   Field s_out(sf_grid);
+   // `in` is a staging buffer: an unnecessary copy. Can or should it be avoided?
+int k_start = 0;
+while ( k_start < Nu) {
+   Glog << "k_start= "<<k_start<< std::endl;
+   for (int u=0; u<mrhs; ++u) in[u] = evec[L+k_start+u];
+Glog << "Split "<< std::endl;
+   Grid_split(in, s_in);
+Glog << "Split done "<< std::endl;
+      _poly(_SLinop,s_in,s_out);
+Glog << "Unsplit "<< std::endl;
+   Grid_unsplit(in,s_out);
+Glog << "Unsplit done "<< std::endl;
+   for (int u=0; u<mrhs; ++u) w[k_start+u] = in[u];
+   k_start +=mrhs;
+}
+    Glog << "Using split grid done "<< std::endl;
+    
+// One-off self test (first call only): recompute without the split grid and log the difference.
+if(!split_test){
+    Glog << "Not using split grid"<< std::endl;
+    // 3. w_k := A v_k - beta_k v_{k-1}   (only the A v_k part is evaluated in this loop)
+    for (int k=L, u=0; k<R; ++k, ++u) {
+      _poly(_Linop,evec[k],w_copy[u]);      
+    }
+    Glog << "Not using split grid done"<< std::endl;
+   for (int u=0; u<Nu; ++u) {
+	 w_copy[u] -= w[u];
+    Glog << "diff(split - non_split) "<<u<<" " << norm2(w_copy[u]) << std::endl;
+   }
+   split_test=1;
+}
+    Glog << "Poly done"<< std::endl;
+    Glog << "LinAlg "<< std::endl;
+    // Subtract the coupling to the previous block (skipped for the first block).
+    
+    if (b>0) {
+      for (int u=0; u<Nu; ++u) {
+        //for (int k=L-Nu; k<L; ++k) {
+        for (int k=L-Nu+u; k<L; ++k) {
+          w[u] = w[u] - evec[k] * conjugate(lme[u][k]);
+        }
+      }
+    }
+    
+    // 4. alpha_k := (v_k, w_k)
+    //for (int u=0; u<Nu; ++u) {
+    //  for (int k=L; k<R; ++k) {
+    //    lmd[u][k] = innerProduct(evec[k],w[u]);  // lmd = transpose of alpha
+    //  }
+    //  lmd[u][L+u] = real(lmd[u][L+u]);  // force diagonal to be real
+    //}
+    for (int u=0; u<Nu; ++u) {
+      for (int k=L+u; k<R; ++k) {
+        lmd[u][k] = innerProduct(evec[k],w[u]);  // lmd = transpose of alpha
+//        Glog <<"lmd "<<u<<" "<<k<<lmd[u][k] -conjugate(innerProduct(evec[u+L],w[k-L]))<<std::endl;
+        lmd[k-L][u+L] = conjugate(lmd[u][k]);     // force hermicity
+      }
+      lmd[u][L+u] = real(lmd[u][L+u]);  // force diagonal to be real
+    }
+    
+    // 5. w_k := w_k - alpha_k v_k
+    for (int u=0; u<Nu; ++u) {
+      for (int k=L; k<R; ++k) {
+        w[u] = w[u] - evec[k]*lmd[u][k];
+      }
+      w_copy[u] = w[u];
+    }
+    Glog << "LinAlg done"<< std::endl;
+    
+    // In the block version, steps 6 and 7 of the Lanczos construction are
+    // replaced by the QR decomposition of the new basis block.
+    // It yields the block version of beta and an orthonormal block basis.
+    // Here, the QR decomposition is done by using Gram-Schmidt.
+    for (int u=0; u<Nu; ++u) {
+      for (int k=L; k<R; ++k) {
+        lme[u][k] = 0.0;
+      }
+    }
+
+    Glog << "Gram Schmidt"<< std::endl;
+    // re-orthogonalization against evec[0..R) for numerical stability
+#if 0
+    for (int u=0; u<Nu; ++u) {
+      orthogonalize(w[u],evec,R);
+    }
+#else
+      orthogonalize(w,Nu,evec,R);
+#endif
+    // QR part: orthonormalise the vectors of the new block against each other
+    for (int u=1; u<Nu; ++u) {
+      orthogonalize(w[u],w,u);
+    }
+    Glog << "Gram Schmidt done "<< std::endl;
+    
+    Glog << "LinAlg "<< std::endl;
+    for (int u=0; u<Nu; ++u) {
+      //for (int v=0; v<Nu; ++v) {
+      for (int v=u; v<Nu; ++v) {
+        lme[u][L+v] = innerProduct(w[u],w_copy[v]);
+      }
+      lme[u][L+u] = real(lme[u][L+u]);  // force diagonal to be real
+    }
+    //lme[0][L] = beta;
+    
+    for (int u=0; u<Nu; ++u) {
+      Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
+      assert (!isnan(norm2(w[u])));
+      for (int k=L+u; k<R; ++k) {
+        Glog <<" In block "<< b << ","; 
+        std::cout <<" beta[" << u << "," << k-L << "] = ";
+        std::cout << lme[u][k] << std::endl;
+      }
+    }
+    Glog << "LinAlg done "<< std::endl;
+#if 0    
+    Glog << "Gram Schmidt "<< std::endl;
+    // re-orthogonalization for numerical stability
+    if (b>0) {
+      for (int u=0; u<Nu; ++u) {
+        orthogonalize(w[u],evec,R);
+      }
+      for (int u=1; u<Nu; ++u) {
+        orthogonalize(w[u],w,u);
+      }
+    }
+    //if (b>0) {
+    //  orthogonalize_blockhead(w[0],evec,b,Nu);
+    //  for (int u=1; u<Nu; ++u) {
+    //    orthogonalize(w[u],w,u);
+    //  }
+    //}
+    Glog << "Gram Schmidt done "<< std::endl;
+#endif
+
+    if (b < Nm/Nu-1) {
+      for (int u=0; u<Nu; ++u) {
+        evec[R+u] = w[u];
+      }
+    }
+
+  }
+  
+    
+  // Diagonalise the leading Nk x Nk part of the Hermitian block-tridiagonal
+  // matrix stored in (lmd,lme) with Eigen's self-adjoint solver.
+  //   eval[0..Nk)      <- eigenvalues in DESCENDING order
+  //   Qt(0..Nk,0..Nk)  <- eigenvectors as columns, column i pairing with eval[i]
+  // grid is not used here.
+  void diagonalize_Eigen(std::vector<RealD>& eval, 
+                         std::vector<std::vector<ComplexD>>& lmd,
+                         std::vector<std::vector<ComplexD>>& lme, 
+                         int Nu, int Nk, int Nm,
+                         Eigen::MatrixXcd & Qt, // Nm x Nm
+                         GridBase *grid)
+  {
+    assert( Nk%Nu == 0 && Nm%Nu == 0 );
+    assert( Nk <= Nm );
+    assert( int(eval.size()) >= Nk && Qt.rows() >= Nk && Qt.cols() >= Nk );
+    Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
+
+    for ( int u=0; u<Nu; ++u ) {
+      // diagonal blocks: row k, column u of the block that contains k
+      for (int k=0; k<Nk; ++k ) {
+        BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
+      }
+      // off-diagonal blocks together with their Hermitian conjugates
+      for (int k=Nu; k<Nk; ++k ) {
+        BlockTriDiag(k-Nu,u+(k/Nu)*Nu) = conjugate(lme[u][k-Nu]);
+        BlockTriDiag(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
+      }
+    }
+
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(BlockTriDiag);
+    // The solver status used to be ignored; a failed decomposition went unnoticed.
+    assert( eigensolver.info() == Eigen::Success );
+    // Eigen returns the eigenvalues in increasing order; store them reversed.
+    for (int i = 0; i < Nk; i++) {
+      eval[Nk-1-i] = eigensolver.eigenvalues()(i);
+      for (int j = 0; j < Nk; j++) Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i);
+    }
+  }
+
+#ifdef USE_LAPACK
+  void diagonalize_lapack(std::vector<RealD>& eval, 
+                         std::vector<std::vector<ComplexD>>& lmd,
+                         std::vector<std::vector<ComplexD>>& lme, 
+			 int Nu, int Nk, int Nm,
+			 Eigen::MatrixXcd & Qt, // Nm x Nm
+			 GridBase *grid)
+  {
+    // LAPACK (MKL zheevr) counterpart of diagonalize_Eigen.
+    //
+    // Parameters:
+    //   eval     receives the Nk eigenvalues;
+    //   lmd,lme  packed block-tridiagonal matrix, indexed [u][k] with u<Nu;
+    //   Nu,Nk,Nm block size, active dimension, dimension of Qt;
+    //   Qt       receives the eigenvectors in its leading Nk columns;
+    //   grid     supplies rank count, rank id and the global sum.
+    //
+    // Steps:
+    //  1. unpack lmd (diagonal blocks) and lme (sub-diagonal blocks) into
+    //     a dense Nk x Nk Hermitian matrix;
+    //  2. each rank asks zheevr for its own index window [il,iu] of the
+    //     spectrum, moves the results to their global positions in
+    //     otherwise zero buffers, and GlobalSumVector merges the windows;
+    //  3. eval and the leading Nk columns of Qt are filled in reversed order.
+    //
+    // This function is only compiled when USE_LAPACK is defined.
+
+    Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<Nm<<std::endl;
+    assert( Nk%Nu == 0 && Nm%Nu == 0 );
+    assert( Nk <= Nm );
+
+    Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
+    
+    for ( int u=0; u<Nu; ++u ) {
+      for (int k=0; k<Nk; ++k ) {
+        BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
+      }
+    }
+    
+    for ( int u=0; u<Nu; ++u ) {
+      for (int k=Nu; k<Nk; ++k ) {
+        BlockTriDiag(k-Nu,u+(k/Nu)*Nu) = conjugate(lme[u][k-Nu]);
+        BlockTriDiag(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
+      }
+    }
+
+    // Work buffers are std::vector (value-initialised to zero) instead of
+    // malloc'ed blocks and variable-length arrays: VLAs are not standard
+    // C++ and can overflow the stack for large Nk, and the vectors release
+    // themselves on every exit path.
+    MKL_INT NN = Nk;
+    std::vector<double>        evals_tmp(NN);
+    std::vector<MKL_Complex16> evec_tmp(NN*NN); // eigenvector i occupies [i*NN,(i+1)*NN)
+    std::vector<MKL_Complex16> DD(NN*NN);
+    // NOTE(review): DD[i*NN+j] holds element (i,j); if zheevr reads DD as
+    // column-major this is the transpose (= complex conjugate) of
+    // BlockTriDiag, which would conjugate the eigenvectors -- confirm.
+    for (int i = 0; i< NN; i++) {
+      for (int j = 0; j <NN ; j++) {
+        DD[i*NN+j].real=BlockTriDiag(i,j).real();
+        DD[i*NN+j].imag=BlockTriDiag(i,j).imag();
+      }
+    }
+
+    MKL_INT evals_found = 0;
+    MKL_INT lwork  = (3*NN);
+    MKL_INT lrwork = (24*NN);
+    MKL_INT liwork =  NN*10 ;
+    std::vector<MKL_Complex16> work(lwork);
+    std::vector<double>        rwork(lrwork);
+    std::vector<MKL_INT>       iwork(liwork);
+    std::vector<MKL_INT>       isuppz(2*NN);
+    char jobz  = 'V'; // calculate evals & evecs
+    char range = 'I'; // only the eigenpairs with indices il..iu
+    char uplo  = 'U'; // refer to upper half of original matrix
+    MKL_INT info = 0;
+
+    // Split the index range 1..NN into equal windows, one per rank.
+    // The +1 guarantees total*interval >= NN; trailing ranks may end up
+    // with an empty window (il > NN) and then skip the solve.
+    int total = grid->_Nprocessors;
+    int node  = grid->_processor;
+    int interval = (NN/total)+1;
+    double vl = 0.0, vu = 0.0;
+    MKL_INT il = interval*node+1 , iu = interval*(node+1);
+    if (iu > NN)  iu=NN;
+    Glog << "node "<<node<<"il "<<il<<"iu "<<iu<<std::endl;
+    double tol = 0.0;
+
+    if ( il <= NN){
+      zheevr(&jobz, &range, &uplo, &NN,
+	     DD.data(),  &NN,
+	     &vl, &vu, &il, &iu, // vl,vu are not referenced for range 'I'
+	     &tol, // tolerance
+	     &evals_found, evals_tmp.data(), evec_tmp.data(), &NN,
+	     isuppz.data(),
+	     work.data(), &lwork, 
+	     rwork.data(), &lrwork, 
+	     iwork.data(), &liwork,
+	     &info);
+      // Previously info was ignored, so a failed solve fed garbage into
+      // the restart; stop instead.
+      assert( info == 0 );
+
+      // zheevr stores this rank's results at the front of the buffers.
+      // Move them to global positions il-1..iu-1 (highest first) and zero
+      // the vacated entries so that the global sum below is a plain merge.
+      for (int i = iu-1; i>= il-1; i--){
+	evals_tmp[i] = evals_tmp[i - (il-1)];
+	if (il>1) evals_tmp[i-(il-1)]=0.;
+	for (int j = 0; j< NN; j++){
+	  evec_tmp[i*NN+j] = evec_tmp[(i - (il-1))*NN+j];
+	  if (il>1) {
+	    evec_tmp[(i-(il-1))*NN+j].imag=0.;
+	    evec_tmp[(i-(il-1))*NN+j].real=0.;
+	  }
+	}
+      }
+    }
+
+    // Each rank holds zeros outside its own window, so summing over ranks
+    // assembles the complete set of eigenvalues and eigenvectors everywhere.
+    grid->GlobalSumVector(evals_tmp.data(),NN);
+    grid->GlobalSumVector((double*)evec_tmp.data(),2*NN*NN);
+
+    // Safer to sort instead of just reversing it, 
+    // but the document of the routine says evals are sorted in increasing order. 
+    // qr gives evals in decreasing order.
+    for (int i = 0; i < Nk; i++) 
+      eval[Nk-1-i] = evals_tmp[i];
+    for (int i = 0; i < Nk; i++) {
+      for (int j = 0; j < Nk; j++) {
+	Qt(j,Nk-1-i)=std::complex<double>  
+	( evec_tmp[i*Nk+j].real,
+	 evec_tmp[i*Nk+j].imag);
+      }
+    }
+
+    // Debug cross-check against Eigen's dense solver: log both sets of
+    // eigenvalues and the norms of the first 5x5 eigenvector components.
+    if (1){
+      Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(BlockTriDiag);
+
+      for (int i = 0; i < Nk; i++) {
+        Glog << "eval = "<<i<<" " <<eval[Nk-1-i] <<" "<< eigensolver.eigenvalues()(i) <<std::endl;
+      }
+      for (int i = 0; i < Nk; i++) {
+        for (int j = 0; j < Nk; j++) {
+          if ( (i<5)&& (j<5))
+            Glog<<"Qt "<<j<<" "<<Nk-1-i<<" = " << norm(Qt(j,Nk-1-i))<<" "<<
+              norm(eigensolver.eigenvectors()(j,i)) <<std::endl;
+        }
+      }
+    }
+  }
+#endif
+  // Entry point for the diagonalisation step: resets Qt to the Nm x Nm
+  // identity and dispatches on the configured diagonalisation choice.
+  void diagonalize(std::vector<RealD>& eval, 
+                   std::vector<std::vector<ComplexD>>& lmd, 
+                   std::vector<std::vector<ComplexD>>& lme, 
+		   int Nu, int Nk, int Nm,   
+		   Eigen::MatrixXcd & Qt,
+		   GridBase *grid)
+  {
+    Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
+    if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
+#ifdef USE_LAPACK
+    } else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { 
+      diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
+#endif
+    } else { 
+      assert(0);
+    }
+  }
+  
+
+  void unpackHermitBlockTriDiagMatToEigen(
+         std::vector<std::vector<ComplexD>>& lmd,  
+         std::vector<std::vector<ComplexD>>& lme,
+         int Nu, int Nb, int Nk, int Nm,
+         Eigen::MatrixXcd& M)
+  {
+    // Expand the packed Hermitian block-tridiagonal matrix into the dense
+    // Nk x Nk matrix M (M is overwritten). lmd[u][k] supplies the diagonal
+    // blocks, lme[u][k] the sub-diagonal blocks; the super-diagonal blocks
+    // are their conjugates. Nb is not used.
+    assert( Nk%Nu == 0 && Nm%Nu == 0 );
+    assert( Nk <= Nm );
+    M = Eigen::MatrixXcd::Zero(Nk,Nk);
+
+    for (int k=0; k<Nk; ++k) {
+      const int blockStart = (k/Nu)*Nu; // first column of the block holding k
+      for (int u=0; u<Nu; ++u) {
+        const int col = blockStart+u;
+        M(k,col) = lmd[u][k];
+        if (k < Nu) continue;           // no off-diagonal entry for the first Nu indices
+        const ComplexD sub = lme[u][k-Nu];
+        M(k-Nu,col) = conjugate(sub);
+        M(col,k-Nu) = sub;
+      }
+    }
+  }
+ 
+
+  void packHermitBlockTriDiagMatfromEigen(
+         std::vector<std::vector<ComplexD>>& lmd,
+         std::vector<std::vector<ComplexD>>& lme,
+         int Nu, int Nb, int Nk, int Nm,
+         Eigen::MatrixXcd& M)
+  {
+    // Inverse of unpackHermitBlockTriDiagMatToEigen: copy the diagonal
+    // blocks of the dense matrix M into lmd[u][k] and the blocks below the
+    // diagonal into lme[u][k-Nu]. The blocks above the diagonal are not
+    // read; Nb is not used.
+    assert( Nk%Nu == 0 && Nm%Nu == 0 );
+    assert( Nk <= Nm );
+
+    for (int k=0; k<Nk; ++k) {
+      const int blockStart = (k/Nu)*Nu;
+      for (int u=0; u<Nu; ++u) {
+        const int idx = blockStart+u;
+        lmd[u][k] = M(k,idx);
+        if (k >= Nu) {
+          lme[u][k-Nu] = M(idx,k-Nu);
+        }
+      }
+    }
+  }
+
+
+  // assume the input matrix M is a band matrix
+  void shiftedQRDecompEigen(Eigen::MatrixXcd& M, int Nu, int Nm,
+		            RealD Dsh,
+		            Eigen::MatrixXcd& Qprod)
+  {
+    //Glog << "shiftedQRDecompEigen() begin" << '\n'; 
+    // One shifted QR step on a Hermitian band matrix:
+    //
+    //     M - Dsh*I = Q R,    M <- R Q + Dsh*I,    Qprod <- Qprod Q
+    //
+    // Parameters:
+    //   M      Nm x Nm matrix with entries only within Nu of the diagonal
+    //          (not checked); replaced by the transformed matrix.
+    //   Nu     block size, used as the half bandwidth of M.
+    //   Nm     dimension of M and Qprod.
+    //   Dsh    shift.
+    //   Qprod  accumulated transformation; multiplied by Q from the right.
+    //
+    // Both products skip terms that the band structure makes zero: the
+    // code treats Q(k,j) as zero for k > j+Nu, and only the band
+    // |i-j| <= Nu of R Q is computed.
+    Eigen::MatrixXcd Mtmp = M;
+    for (int i=0; i<Nm; ++i ) {
+      Mtmp(i,i) = M(i,i) - Dsh;
+    }
+    
+    Eigen::HouseholderQR<Eigen::MatrixXcd> QRD(Mtmp);
+    Eigen::MatrixXcd Q = QRD.householderQ();
+    Eigen::MatrixXcd R = QRD.matrixQR(); // upper triangular part is the R matrix.
+                                         // lower triangular part used to represent series
+                                         // of Q sequence.
+
+    // Qprod *= Q, restricted to the rows k < Nu+1+j of column j.
+    //
+    // The row limit is clamped to Nm for every column. The previous code
+    // used two loops split at column Nm-(Nu+1); that bound is negative
+    // when Nm <= Nu and then indexed columns below zero. For Nm > Nu the
+    // same terms are added in the same order as before.
+    Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
+    for (int j=0; j<Nm; ++j) {
+      int kmax = (Nu+1)+j;
+      if (kmax > Nm) kmax = Nm;
+      for (int i=0; i<Nm; ++i) {
+        for (int k=0; k<kmax; ++k) {
+          Mtmp(i,j) += Qprod(i,k)*Q(k,j);
+        }
+      }
+    }
+    
+    //static int ntimes = 2;
+    //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
+    //  for (int i=ntimes*Nu+j; i<Nm; ++i) {
+    //    Mtmp(i,j) = 0.0;
+    //  }
+    //}
+    //ntimes++;
+
+    Qprod = Mtmp;
+     
+    // equivalent operation of M = Q.adjoint()*(M*Q)
+    //
+    // Only the lower band (i = j+a, 0 <= a <= Nu) of R*Q is evaluated; the
+    // upper band is filled in by Hermitian conjugation. The sum over k
+    // starts at k = i because only the upper triangle of R is used.
+    Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
+    
+    for (int a=0, i=0, kmax=0; a<Nu+1; ++a) {
+      for (int j=0; j<Nm-a; ++j) {
+        i = j+a;
+        kmax = (Nu+1)+j;
+        if (kmax > Nm) kmax = Nm;
+        for (int k=i; k<kmax; ++k) { 
+          Mtmp(i,j) += R(i,k)*Q(k,j);
+        }
+        Mtmp(j,i) = conj(Mtmp(i,j));
+      }
+    }
+
+    // Undo the shift; only the real part of each diagonal entry is kept,
+    // so imaginary round-off on the diagonal is dropped.
+    for (int i=0; i<Nm; ++i) {
+      Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
+    }
+    
+    M = Mtmp;
+
+    // Dense reference form of the update above (kept for debugging):
+    //M = Q.adjoint()*(M*Q);
+    //for (int i=0; i<Nm; ++i) {
+    //  for (int j=0; j<Nm; ++j) {
+    //    if (i==j) M(i,i) = real(M(i,i));
+    //    if (j>i)  M(i,j) = conj(M(j,i));
+    //    if (i-j > Nu || j-i > Nu) M(i,j) = 0.;
+    //  }
+    //}
+    
+    //Glog << "shiftedQRDecompEigen() end" << endl; 
+  }
+  // Demo/debug helper: logs Eigen's QR factors of a fixed 3x3 real matrix.
+  void exampleQRDecompEigen(void)
+  {
+    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(3,3);
+    Eigen::MatrixXd Q = Eigen::MatrixXd::Zero(3,3);
+    Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3,3);
+    Eigen::MatrixXd P = Eigen::MatrixXd::Zero(3,3);
+    // fixed 3x3 input matrix
+    A(0,0) = 12.0;
+    A(0,1) = -51.0;
+    A(0,2) = 4.0;
+    A(1,0) = 6.0;
+    A(1,1) = 167.0;
+    A(1,2) = -68.0;
+    A(2,0) = -4.0;
+    A(2,1) = 24.0;
+    A(2,2) = -41.0;
+    
+    Glog << "matrix A before ColPivHouseholder" << std::endl;
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    // first pass: QR with column pivoting
+    Eigen::ColPivHouseholderQR<Eigen::MatrixXd> QRD(A);
+    
+    Glog << "matrix A after ColPivHouseholder" << std::endl;
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    
+    Glog << "HouseholderQ with sequence lenth = nonzeroPiviots" << std::endl;
+    Q = QRD.householderQ().setLength(QRD.nonzeroPivots());
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    
+    Glog << "HouseholderQ with sequence lenth = 1" << std::endl;
+    Q = QRD.householderQ().setLength(1);
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    
+    Glog << "HouseholderQ with sequence lenth = 2" << std::endl;
+    Q = QRD.householderQ().setLength(2);
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    
+    Glog << "matrixR" << std::endl;
+    R = QRD.matrixR();
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "R[" << i << "," << j << "] = " << R(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+
+    Glog << "rank = " << QRD.rank() << std::endl;
+    Glog << "threshold = " << QRD.threshold() << std::endl;
+    
+    Glog << "matrixP" << std::endl;
+    P = QRD.colsPermutation();
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "P[" << i << "," << j << "] = " << P(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+
+    // second pass: the same entries are assigned to A again, then factorised without pivoting
+    Glog << "QR decomposition without column pivoting" << std::endl;
+    
+    A(0,0) = 12.0;
+    A(0,1) = -51.0;
+    A(0,2) = 4.0;
+    A(1,0) = 6.0;
+    A(1,1) = 167.0;
+    A(1,2) = -68.0;
+    A(2,0) = -4.0;
+    A(2,1) = 24.0;
+    A(2,2) = -41.0;
+    
+    Glog << "matrix A before Householder" << std::endl;
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    
+    Eigen::HouseholderQR<Eigen::MatrixXd> QRDplain(A);
+    
+    Glog << "HouseholderQ" << std::endl;
+    Q = QRDplain.householderQ();
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+    
+    Glog << "matrix A after Householder" << std::endl;
+    for ( int i=0; i<3; i++ ) {
+      for ( int j=0; j<3; j++ ) {
+        Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
+      }
+    }
+    Glog << std::endl;
+  }
+
+ };
+}
+#undef Glog
+#undef USE_LAPACK
+#endif
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 472013f4..4611b209 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -162,6 +162,13 @@ void GridCmdOptionInt(std::string &str,int & val)
   return;
 }
 
+// ypj [add]: floating-point counterpart of GridCmdOptionInt.
+void GridCmdOptionFloat(std::string &str,double & val)
+{
+  // Stream extraction: val receives the number at the start of str.
+  std::istringstream parser(str);
+  parser >> val;
+}
 
 void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
diff --git a/Grid/util/Init.h b/Grid/util/Init.h
index f7f032ba..cb8b6c49 100644
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -56,7 +56,9 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
-
+// ypj [add]
+void GridCmdOptionInt(std::string &str,int & val);
+void GridCmdOptionFloat(std::string &str,double & val);
 
 void GridParseLayout(char **argv,int argc,
 		     std::vector<int> &latt,
diff --git a/tests/lanczos/Test_dwf_block_lanczos.cc b/tests/lanczos/Test_dwf_block_lanczos.cc
new file mode 100644
index 00000000..2aa83be1
--- /dev/null
+++ b/tests/lanczos/Test_dwf_block_lanczos.cc
@@ -0,0 +1,398 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_block_lanczos.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/util/Init.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
+
+using namespace std;
+using namespace Grid;
+//using namespace Grid::QCD;
+
+//typedef typename GparityDomainWallFermionR::FermionField FermionField;
+typedef typename ZMobiusFermionR::FermionField FermionField;
+
+RealD AllZero(RealD x) { (void)x; return RealD(0.0); } // identically zero, whatever x is
+
+class CmdJobParams 
+{
+  // Test parameters: defaults from the constructor, overrides from Parse().
+  public:
+    std::string gaugefile;
+    int Ls;
+    double mass;
+    double M5;
+    double mob_b;
+    std::vector<ComplexD> omega;
+    std::vector<Complex> boundary_phase;
+    std::vector<int> mpi_split;
+    
+    LanczosType Impl;
+    int Nu;
+    int Nk;
+    int Np;
+    int Nm;
+    int Nstop;
+    int Ntest;
+    int MaxIter;
+    double resid;
+    
+    double low;
+    double high;
+    int order;
+
+    CmdJobParams()
+      : gaugefile("Hot"),
+        Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
+        mpi_split(4,1), Impl(LanczosType::irbl),
+        Nu(4), Nk(200), Np(200), Nm(Nk+Np), // initialisers follow declaration order
+        Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8), 
+        low(0.2), high(5.5), order(11) {}
+    
+    void Parse(char **argv, int argc);
+};
+// Fill the parameters from the command line (and from the files named by
+// --phase / --omega); world rank 0 then prints the resulting values.
+void CmdJobParams::Parse(char **argv,int argc)
+{
+  std::string arg;
+  std::vector<int> vi;
+  double re,im;
+  int expect, idx;
+  std::string vstr;
+  std::ifstream pfile;
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
+    gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
+    pfile.open(arg);
+    assert(pfile);
+    expect = 0;
+    while( pfile >> vstr ) {
+      if ( vstr.compare("boundary_phase") == 0 ) {
+        pfile >> vstr;
+        GridCmdOptionInt(vstr,idx);
+        assert(expect==idx);
+        pfile >> vstr;
+        GridCmdOptionFloat(vstr,re);
+        pfile >> vstr;
+        GridCmdOptionFloat(vstr,im);
+        boundary_phase.push_back({re,im});
+        expect++;
+      }
+    }
+    pfile.close();
+  } else {
+    for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
+    pfile.open(arg);
+    assert(pfile);
+    Ls = 0;
+    while( pfile >> vstr ) {
+      if ( vstr.compare("omega") == 0 ) {
+        pfile >> vstr;
+        GridCmdOptionInt(vstr,idx);
+        assert(Ls==idx);
+        pfile >> vstr;
+        GridCmdOptionFloat(vstr,re);
+        pfile >> vstr;
+        GridCmdOptionFloat(vstr,im);
+        omega.push_back({re,im});
+        Ls++;
+      }
+    }
+    pfile.close();
+  } else {
+    if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
+      arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
+      GridCmdOptionInt(arg,Ls);
+    }
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
+    GridCmdOptionFloat(arg,mass);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
+    GridCmdOptionFloat(arg,M5);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
+    GridCmdOptionFloat(arg,mob_b);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
+    GridCmdOptionIntVector(arg,vi);
+    assert(vi.size() >= 5); // Nu,Nk,Np,Nstop,MaxIter are all read below
+    Nu = vi[0];
+    Nk = vi[1];
+    Np = vi[2];
+    Nstop = vi[3];
+    MaxIter = vi[4];
+    // ypj[fixme] mode overriding message is needed.
+    Impl = LanczosType::irbl;
+    Nm = Nk+Np;
+  }
+  
+  // block Lanczos with explicit extension of its dimensions
+  if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
+    GridCmdOptionIntVector(arg,vi);
+    assert(vi.size() >= 5); // Nu,Nk,Np,Nstop,MaxIter are all read below
+    Nu = vi[0];
+    Nk = vi[1];
+    Np = vi[2]; // vector space is enlarged by adding Np vectors
+    Nstop = vi[3];
+    MaxIter = vi[4];
+    // ypj[fixme] mode overriding message is needed.
+    Impl = LanczosType::rbl;
+    Nm = Nk+Np*MaxIter;
+  }
+  
+  // one integer per entry of mpi_split
+  if( GridCmdOptionExists(argv,argv+argc,"--split") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--split");
+    GridCmdOptionIntVector(arg,vi);
+    assert(vi.size() >= mpi_split.size()); // guards the vi[i] reads below
+    for(std::size_t i=0;i<mpi_split.size();i++) mpi_split[i] = vi[i];
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
+    GridCmdOptionInt(arg,Ntest);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
+    GridCmdOptionFloat(arg,resid);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
+    GridCmdOptionFloat(arg,low);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
+    GridCmdOptionFloat(arg,high);
+  }
+  
+  if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
+    arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
+    GridCmdOptionInt(arg,order);
+  }
+  
+  if ( CartesianCommunicator::RankWorld() == 0 ) {
+    std::streamsize ss = std::cout.precision();
+    std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
+    std::cout.precision(15);
+    for ( std::size_t i=0; i<boundary_phase.size(); ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n'; // bounded by the vector: a phase file may hold fewer than 4 entries
+    std::cout.precision(ss);
+    std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
+    std::cout << GridLogMessage <<" mass "<< mass << '\n';
+    std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
+    std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
+    std::cout.precision(15);
+    for ( std::size_t i=0; i<omega.size(); ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n'; // omega is empty without --omega, so do not index up to Ls
+    std::cout.precision(ss);
+    std::cout << GridLogMessage <<" Nu "<< Nu << '\n'; 
+    std::cout << GridLogMessage <<" Nk "<< Nk << '\n'; 
+    std::cout << GridLogMessage <<" Np "<< Np << '\n'; 
+    std::cout << GridLogMessage <<" Nm "<< Nm << '\n'; 
+    std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n'; 
+    std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n'; 
+    std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n'; 
+    std::cout << GridLogMessage <<" resid "<< resid << '\n'; 
+    std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl; 
+  }
+}
+// Test driver: parses the job parameters, builds grids, gauge field and the
+// ZMobius operators, then runs ImplicitlyRestartedBlockLanczos::calc.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  
+  CmdJobParams JP;
+  JP.Parse(argv,argc);
+  // 4d gauge grids and 5d (Ls) fermion grids, full and red-black
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
+//  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  // ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
+  GridParallelRNG          RNG5rb(FrbGrid);  RNG5rb.SeedFixedIntegers(seeds5);
+  // gauge field: hot start ("Hot") or a NERSC configuration file
+  LatticeGaugeField Umu(UGrid); 
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  
+  if ( JP.gaugefile.compare("Hot") == 0 ) {
+    SU3::HotConfiguration(RNG4, Umu);
+  } else {
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu,header,JP.gaugefile);
+    // ypj [fixme] additional checks for the loaded configuration?
+  }
+  
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass = JP.mass;
+  RealD M5 = JP.M5;
+
+// ypj [fixme] flexible support for a various Fermions
+//  RealD mob_b = JP.mob_b;      // Gparity
+//  std::vector<ComplexD> omega; // ZMobius
+  
+//  GparityMobiusFermionD ::ImplParams params;
+//  std::vector<int> twists({1,1,1,0});
+//  params.twists = twists;
+//  GparityMobiusFermionR  Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
+//  SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
+
+
+//  int mrhs = JP.Nu;
+  int Ndir=4;
+  auto mpi_layout  = GridDefaultMpi();
+  std::vector<int> mpi_split (Ndir,1);
+#if 0
+    int tmp=mrhs, dir=0;
+    std::cout << GridLogMessage  << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
+    while ( tmp> 1) {
+    if ((mpi_split[dir]*2) <= mpi_layout[dir]){
+        mpi_split[dir] *=2;
+        tmp = tmp/2;
+    }
+    std::cout << GridLogMessage  << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
+        dir = (dir+1)%Ndir;
+    }
+#endif
+    int mrhs=1;
+    for(int i =0;i<Ndir;i++){
+      mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
+      mrhs *= JP.mpi_split[i];
+    }
+    std::cout << GridLogMessage  << "mpi_layout= " << mpi_layout << std::endl;
+    std::cout << GridLogMessage  << "mpi_split= " << mpi_split << std::endl;
+    std::cout << GridLogMessage  << "mrhs= " << mrhs << std::endl;
+//    assert(JP.Nu==tmp);
+
+  /////////////////////////////////////////////
+  // Split into 1^4 mpi communicators
+  /////////////////////////////////////////////
+  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
+                                                    GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                    mpi_split,
+                                                    *UGrid);
+
+  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
+  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
+  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
+
+  LatticeGaugeField s_Umu(SGrid);
+  Grid_split  (Umu,s_Umu);
+
+  //WilsonFermionR::ImplParams params;
+  ZMobiusFermionR::ImplParams params;
+  params.overlapCommsCompute = true;
+  params.boundary_phases = JP.boundary_phase;
+  ZMobiusFermionR  Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,JP.omega,1.,0.,params);
+//  SchurDiagTwoOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
+  SchurDiagOneOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
+  ZMobiusFermionR  Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
+//  SchurDiagTwoOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
+  SchurDiagOneOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
+
+  //std::vector<double> Coeffs { 0.,-1.}; 
+  // ypj [note] this may not be supported by some compilers
+  std::vector<double> Coeffs({ 0.,-1.}); 
+  Polynomial<FermionField> PolyX(Coeffs);
+  //Chebyshev<FermionField> Cheb(0.2,5.5,11);
+  Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
+//  Cheb.csv(std::cout);
+  ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
+						     FrbGrid,SFrbGrid,mrhs,
+                                                     Cheb,
+                                                     JP.Nstop, JP.Ntest,
+                                                     JP.Nu, JP.Nk, JP.Nm,
+                                                     JP.resid,
+                                                     JP.MaxIter,
+						     IRLdiagonaliseWithEigen);
+//						     IRLdiagonaliseWithLAPACK);
+  
+  std::vector<RealD> eval(JP.Nm);
+  
+  std::vector<FermionField> src(JP.Nu,FrbGrid);
+if (1)
+{
+  std::cout << GridLogMessage << "Using RNG5"<<std::endl;
+  FermionField src_tmp(FGrid);
+  for ( int i=0; i<JP.Nu; ++i ){
+//    gaussian(RNG5,src_tmp);
+     ComplexD rnd;
+     RealD re;
+     fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
+    std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" rnd "<< rnd  << std::endl;
+    src_tmp=re;
+    pickCheckerboard(Odd,src[i],src_tmp);
+  }
+  RNG5.Report();
+} else {
+  std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
+  for ( int i=0; i<JP.Nu; ++i )
+    gaussian(RNG5rb,src[i]);
+  RNG5rb.Report();
+
+}
+  
+  std::vector<FermionField> evec(JP.Nm,FrbGrid);
+  for(int i=0;i<1;++i){
+    std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
+  };
+
+  int Nconv;
+  IRBL.calc(eval,evec,src,Nconv,JP.Impl);
+
+
+  Grid_finalize();
+}

From 9266b89ad8c8cb62e9f70c527b8f0f28a7cd94e5 Mon Sep 17 00:00:00 2001
From: Yong-Chull Jang <integration.field@gmail.com>
Date: Wed, 25 Mar 2020 15:45:50 -0400
Subject: [PATCH 2/5] fix rngs issue; block Lanczos is working

---
 Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h | 2 +-
 tests/lanczos/Test_dwf_block_lanczos.cc                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
index e3afe43c..5076a527 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
@@ -180,7 +180,7 @@ public:
           Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
       }
     }
-    normalize(w,if_print);
+    assert(normalize(w,if_print) != 0);
   }
   void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
   {
diff --git a/tests/lanczos/Test_dwf_block_lanczos.cc b/tests/lanczos/Test_dwf_block_lanczos.cc
index 2aa83be1..a55c06a4 100644
--- a/tests/lanczos/Test_dwf_block_lanczos.cc
+++ b/tests/lanczos/Test_dwf_block_lanczos.cc
@@ -363,7 +363,7 @@ int main (int argc, char ** argv)
   std::vector<RealD> eval(JP.Nm);
   
   std::vector<FermionField> src(JP.Nu,FrbGrid);
-if (1)
+if (0)
 {
   std::cout << GridLogMessage << "Using RNG5"<<std::endl;
   FermionField src_tmp(FGrid);

From 02edbe624f782e9ca4912e5d203d4913fb3ef6ec Mon Sep 17 00:00:00 2001
From: Yong-Chull Jang <integration.field@gmail.com>
Date: Mon, 30 Mar 2020 18:36:21 -0400
Subject: [PATCH 3/5] first working version of Gram Schmidt using cublas gemm;
 explicit data type and site vector size has to be removed

---
 .gitignore                                    |   2 +
 .../ImplicitlyRestartedBlockLanczos.h         | 150 ++++++++++++++++--
 tests/lanczos/Test_dwf_block_lanczos.cc       |   4 +-
 3 files changed, 144 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5338acb9..13efd67c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@
 *~
 *#
 *.sublime-*
+.ctags
+tags
 
 # Precompiled Headers #
 #######################
diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
index 5076a527..7cc11653 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
@@ -39,6 +39,10 @@ Author: Guido Cossu
 #undef USE_LAPACK
 #define Glog std::cout << GridLogMessage 
 
+#ifdef GRID_NVCC
+#include "cublas_v2.h"
+#endif
+
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -89,6 +93,12 @@ class SortEigen {
 
 enum class LanczosType { irbl, rbl };
 
+enum IRBLdiagonalisation {   // type of the block Lanczos class's diagonalisation member
+  IRBLdiagonaliseWithDSTEGR,
+  IRBLdiagonaliseWithQR,
+  IRBLdiagonaliseWithEigen   // default value of the constructor argument
+};
+
 /////////////////////////////////////////////////////////////
 // Implicitly restarted block lanczos
 /////////////////////////////////////////////////////////////
@@ -107,7 +117,7 @@ private:
   int Nblock_m;    // Nm/Nu
   int Nconv_test_interval; // Number of skipped vectors when checking a convergence
   RealD eresid;
-  IRLdiagonalisation diagonalisation;
+  IRBLdiagonalisation diagonalisation;
   int split_test; //test split in the first iteration
   ////////////////////////////////////
   // Embedded objects
@@ -137,7 +147,7 @@ public:
                                  int _Nm,    // total vecs
                                  RealD _eresid, // resid in lmd deficit 
                                  int _MaxIter,  // Max iterations
-                                 IRLdiagonalisation _diagonalisation = IRLdiagonaliseWithEigen)
+                                 IRBLdiagonalisation _diagonalisation = IRBLdiagonaliseWithEigen)
    : _Linop(Linop),   _SLinop(SLinop),  _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid),
       Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs),
       Nu(_Nu), Nk(_Nk), Nm(_Nm), 
@@ -211,7 +221,126 @@ public:
 #endif
     }}
     for(int i=0; i<_Nu; ++i)
-    normalize(w[i],if_print);
+    assert(normalize(w[i],if_print) !=0);
+  }
+
+
+  void orthogonalize_blas(std::vector<Field>& w, int _Nu, std::vector<Field>& evec, int _R, int _print=0) 
+  { 
+#ifdef GRID_NVCC
+    Glog << "cuBLAS orthogonalize" << std::endl;
+    
+    typedef typename Field::vector_object vobj;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+    
+    typedef typename Field::scalar_type MyComplex;
+    
+    GridBase *grid = w[0].Grid();
+    //grid->show_decomposition();
+    //const uint64_t nsimd = grid->Nsimd();
+    const uint64_t sites = grid->lSites();
+      
+    //auto w_v = w[0].View();
+    //cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v._odata[0]);
+    //cuDoubleComplex *z = w_v._odata._internal;
+    //thread_for(ss,w_v.size(),{
+    //    Glog << w_v[ss] << std::endl;
+    //});
+    //w_v[0]
+    //exit(0);
+    //scalar_type *z = (scalar_type *)&w_v[0]; // OK
+    //cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex *>(&w_v[0]); // OK
+  
+    cudaError_t cudaStat;
+    
+    cuDoubleComplex *w_acc, *evec_acc, *c_acc;
+    
+    cudaStat = cudaMallocManaged((void **)&w_acc, _Nu*sites*12*sizeof(cuDoubleComplex));
+    Glog << cudaStat << std::endl;
+    cudaStat = cudaMallocManaged((void **)&evec_acc, _R*sites*12*sizeof(cuDoubleComplex));
+    Glog << cudaStat << std::endl;
+    cudaStat = cudaMallocManaged((void **)&c_acc, _Nu*_R*12*sizeof(cuDoubleComplex));
+    Glog << cudaStat << std::endl;
+    
+    Glog << "cuBLAS prepare array"<< std::endl;
+#if 0 // a trivial test
+    for (int col=0; col<_Nu; ++col) {
+      for (size_t row=0; row<sites*12; ++row) {
+        w_acc[col*sites*12+row].x = 1.0;
+        w_acc[col*sites*12+row].y = 0.0;
+      }
+    }
+   
+    for (int col=0; col<_R; ++col) {
+      for (size_t row=0; row<sites*12; ++row) {
+        evec_acc[col*sites*12+row].x = 1.0;
+        evec_acc[col*sites*12+row].y = 0.0;
+      }
+    }
+#else 
+    for (int col=0; col<_Nu; ++col) {
+      auto w_v = w[col].View();
+      cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
+      for (size_t row=0; row<sites*12; ++row) {
+        //w_acc[col*sites*12+row].x = z[2*row];
+        //w_acc[col*sites*12+row].y = z[2*row+1];
+        w_acc[col*sites*12+row] = z[row];
+      }
+    }
+   
+    for (int col=0; col<_R; ++col) {
+      auto evec_v = evec[col].View();
+      cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&evec_v[0]);
+      for (size_t row=0; row<sites*12; ++row) {
+        //evec_acc[col*sites*12+row].x = z[2*row];
+        //evec_acc[col*sites*12+row].y = z[2*row+1];
+        evec_acc[col*sites*12+row] = z[row];
+      }
+    }
+#endif 
+    Glog << "cuBLAS prepare array done"<< std::endl;
+    
+    Glog << "cuBLAS Zgemm"<< std::endl;
+    
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    
+    stat = cublasCreate(&handle);
+    cuDoubleComplex alpha = make_cuDoubleComplex(1.0,0.0);
+    cuDoubleComplex beta = make_cuDoubleComplex(0.0,0.0);
+    stat = cublasZgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N, _R, _Nu, 12*sites,
+                       &alpha, evec_acc, 12*sites, w_acc, 12*sites,  &beta, c_acc, _R);
+    Glog << stat << std::endl;
+    
+    grid->GlobalSumVector((double*)c_acc,2*_Nu*_R);
+
+    cublasDestroy(handle);
+    
+    Glog << "cuBLAS Zgemm done"<< std::endl;
+    
+    for (int i=0; i<_Nu; ++i) {
+      for (size_t j=0; j<_R; ++j) {
+        cuDoubleComplex z = c_acc[i*_R+j];
+        MyComplex ip(z.x,z.y);
+        if (_print) {
+          Glog << "<evec,w>[" << j << "," << i << "] = " 
+               << z.x << " + i " << z.y << std::endl;
+        }
+        w[i] = w[i] - ip * evec[j];
+      }
+      assert(normalize(w[i],_print)!=0);
+    }
+    
+    cudaFree(w_acc);
+    cudaFree(evec_acc);
+    cudaFree(c_acc);
+
+    Glog << "cuBLAS orthogonalize done" << std::endl;
+#else
+    Glog << "BLAS wrapper is not implemented" << std::endl;
+    exit(1);
+#endif
   }
 
 
@@ -310,10 +439,10 @@ for( int i =0;i<total;i++){
     Glog <<" -- total  Nm    = "<< Nm    <<" vectors"<< std::endl;
     Glog <<" -- size of eval = "<< eval.size() << std::endl;
     Glog <<" -- size of evec = "<< evec.size() << std::endl;
-    if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+    if ( diagonalisation == IRBLdiagonaliseWithEigen ) { 
       Glog << "Diagonalisation is Eigen "<< std::endl;
 #ifdef USE_LAPACK
-    } else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { 
+    } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { 
       Glog << "Diagonalisation is LAPACK "<< std::endl;
 #endif
     } else {
@@ -520,10 +649,10 @@ for( int i =0;i<total;i++){
     Glog <<" -- accept Nstop     = "<< Nstop <<" vectors"<< std::endl;
     Glog <<" -- size of eval     = "<< eval.size() << std::endl;
     Glog <<" -- size of evec     = "<< evec.size() << std::endl;
-    if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+    if ( diagonalisation == IRBLdiagonaliseWithEigen ) { 
       Glog << "Diagonalisation is Eigen "<< std::endl;
 #ifdef USE_LAPACK
-    } else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { 
+    } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { 
       Glog << "Diagonalisation is LAPACK "<< std::endl;
 #endif
     } else {
@@ -790,7 +919,8 @@ if(!split_test){
       orthogonalize(w[u],evec,R);
     }
 #else
-      orthogonalize(w,Nu,evec,R);
+      //orthogonalize(w,Nu,evec,R);
+      orthogonalize_blas(w,Nu,evec,R);
 #endif
     // QR part
     for (int u=1; u<Nu; ++u) {
@@ -1052,10 +1182,10 @@ if (1){
 		   GridBase *grid)
   {
     Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
-    if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+    if ( diagonalisation == IRBLdiagonaliseWithEigen ) { 
       diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
 #ifdef USE_LAPACK
-    } else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { 
+    } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { 
       diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
 #endif
     } else { 
diff --git a/tests/lanczos/Test_dwf_block_lanczos.cc b/tests/lanczos/Test_dwf_block_lanczos.cc
index a55c06a4..660d0cdc 100644
--- a/tests/lanczos/Test_dwf_block_lanczos.cc
+++ b/tests/lanczos/Test_dwf_block_lanczos.cc
@@ -357,8 +357,8 @@ int main (int argc, char ** argv)
                                                      JP.Nu, JP.Nk, JP.Nm,
                                                      JP.resid,
                                                      JP.MaxIter,
-						     IRLdiagonaliseWithEigen);
-//						     IRLdiagonaliseWithLAPACK);
+						     IRBLdiagonaliseWithEigen);
+//						     IRBLdiagonaliseWithLAPACK);
   
   std::vector<RealD> eval(JP.Nm);
   

From ac7090e6d3af4e9d468323bd2a03e2f2437b813f Mon Sep 17 00:00:00 2001
From: Yong-Chull Jang <integration.field@gmail.com>
Date: Mon, 30 Mar 2020 22:25:50 -0400
Subject: [PATCH 4/5] block Lanczos cublas buffer is set at the initial step;
 buffer width is fixed to the block size then cublas Zgemm is called multiple
 times

---
 .../ImplicitlyRestartedBlockLanczos.h         | 173 ++++++++++--------
 1 file changed, 92 insertions(+), 81 deletions(-)

diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
index 7cc11653..cd6bd682 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
@@ -110,7 +110,7 @@ private:
   std::string cname = std::string("ImplicitlyRestartedBlockLanczos");
   int MaxIter;   // Max iterations
   int Nstop;     // Number of evecs checked for convergence
-  int Nu;        // Numbeer of vecs in the unit block
+  int Nu;        // Number of vecs in the unit block
   int Nk;        // Number of converged sought
   int Nm;        // total number of vectors
   int Nblock_k;    // Nk/Nu
@@ -129,7 +129,15 @@ private:
   GridRedBlackCartesian * f_grid;
   GridRedBlackCartesian * sf_grid;
   int mrhs;
-
+  /////////////////////////
+  // BLAS objects
+  /////////////////////////
+#ifdef GRID_NVCC
+  cudaError_t cudaStat;
+  cuDoubleComplex *w_acc, *evec_acc, *c_acc;
+#endif
+  int Nevec_acc; // Number of eigenvectors stored in the buffer evec_acc
+  
   /////////////////////////
   // Constructor
   /////////////////////////
@@ -154,7 +162,8 @@ public:
       Nblock_m(_Nm/_Nu), Nblock_k(_Nk/_Nu),
       //eresid(_eresid),  MaxIter(10),
       eresid(_eresid),  MaxIter(_MaxIter),
-      diagonalisation(_diagonalisation),split_test(0)
+      diagonalisation(_diagonalisation),split_test(0),
+      Nevec_acc(_Nu)
   { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
 
   ////////////////////////////////
@@ -225,7 +234,7 @@ public:
   }
 
 
-  void orthogonalize_blas(std::vector<Field>& w, int _Nu, std::vector<Field>& evec, int _R, int _print=0) 
+  void orthogonalize_blas(std::vector<Field>& w, std::vector<Field>& evec, int R, int do_print=0) 
   { 
 #ifdef GRID_NVCC
     Glog << "cuBLAS orthogonalize" << std::endl;
@@ -237,105 +246,86 @@ public:
     typedef typename Field::scalar_type MyComplex;
     
     GridBase *grid = w[0].Grid();
-    //grid->show_decomposition();
-    //const uint64_t nsimd = grid->Nsimd();
     const uint64_t sites = grid->lSites();
-      
-    //auto w_v = w[0].View();
-    //cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v._odata[0]);
-    //cuDoubleComplex *z = w_v._odata._internal;
-    //thread_for(ss,w_v.size(),{
-    //    Glog << w_v[ss] << std::endl;
-    //});
-    //w_v[0]
-    //exit(0);
-    //scalar_type *z = (scalar_type *)&w_v[0]; // OK
-    //cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex *>(&w_v[0]); // OK
-  
-    cudaError_t cudaStat;
+
+    int Nbatch = R/Nevec_acc;
+    assert( R%Nevec_acc == 0 );
+    Glog << "nBatch, Nevec_acc, R, Nu = " 
+         << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
     
-    cuDoubleComplex *w_acc, *evec_acc, *c_acc;
-    
-    cudaStat = cudaMallocManaged((void **)&w_acc, _Nu*sites*12*sizeof(cuDoubleComplex));
-    Glog << cudaStat << std::endl;
-    cudaStat = cudaMallocManaged((void **)&evec_acc, _R*sites*12*sizeof(cuDoubleComplex));
-    Glog << cudaStat << std::endl;
-    cudaStat = cudaMallocManaged((void **)&c_acc, _Nu*_R*12*sizeof(cuDoubleComplex));
-    Glog << cudaStat << std::endl;
-    
-    Glog << "cuBLAS prepare array"<< std::endl;
 #if 0 // a trivial test
-    for (int col=0; col<_Nu; ++col) {
+    for (int col=0; col<Nu; ++col) {
       for (size_t row=0; row<sites*12; ++row) {
         w_acc[col*sites*12+row].x = 1.0;
         w_acc[col*sites*12+row].y = 0.0;
       }
     }
-   
-    for (int col=0; col<_R; ++col) {
-      for (size_t row=0; row<sites*12; ++row) {
-        evec_acc[col*sites*12+row].x = 1.0;
-        evec_acc[col*sites*12+row].y = 0.0;
-      }
-    }
 #else 
-    for (int col=0; col<_Nu; ++col) {
+    for (int col=0; col<Nu; ++col) {
       auto w_v = w[col].View();
       cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
       for (size_t row=0; row<sites*12; ++row) {
-        //w_acc[col*sites*12+row].x = z[2*row];
-        //w_acc[col*sites*12+row].y = z[2*row+1];
         w_acc[col*sites*12+row] = z[row];
       }
     }
-   
-    for (int col=0; col<_R; ++col) {
-      auto evec_v = evec[col].View();
-      cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&evec_v[0]);
-      for (size_t row=0; row<sites*12; ++row) {
-        //evec_acc[col*sites*12+row].x = z[2*row];
-        //evec_acc[col*sites*12+row].y = z[2*row+1];
-        evec_acc[col*sites*12+row] = z[row];
-      }
-    }
-#endif 
-    Glog << "cuBLAS prepare array done"<< std::endl;
-    
-    Glog << "cuBLAS Zgemm"<< std::endl;
-    
+#endif
     cublasHandle_t handle;
     cublasStatus_t stat;
     
     stat = cublasCreate(&handle);
-    cuDoubleComplex alpha = make_cuDoubleComplex(1.0,0.0);
-    cuDoubleComplex beta = make_cuDoubleComplex(0.0,0.0);
-    stat = cublasZgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N, _R, _Nu, 12*sites,
-                       &alpha, evec_acc, 12*sites, w_acc, 12*sites,  &beta, c_acc, _R);
-    Glog << stat << std::endl;
-    
-    grid->GlobalSumVector((double*)c_acc,2*_Nu*_R);
 
-    cublasDestroy(handle);
+    Glog << "cuBLAS Zgemm"<< std::endl;
+    
+    for (int b=0; b<Nbatch; ++b) {
+#if 0 // a trivial test
+      for (int col=0; col<Nevec_acc; ++col) {
+        for (size_t row=0; row<sites*12; ++row) {
+          evec_acc[col*sites*12+row].x = 1.0;
+          evec_acc[col*sites*12+row].y = 0.0;
+        }
+      }
+#else 
+      for (int col=0; col<Nevec_acc; ++col) {
+        auto evec_v = evec[b*Nevec_acc+col].View();
+        cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&evec_v[0]);
+        for (size_t row=0; row<sites*12; ++row) {
+          evec_acc[col*sites*12+row] = z[row];
+        }
+      }
+#endif 
+      cuDoubleComplex alpha = make_cuDoubleComplex(1.0,0.0);
+      cuDoubleComplex beta = make_cuDoubleComplex(0.0,0.0);
+      stat = cublasZgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N, Nevec_acc, Nu, 12*sites,
+                         &alpha, 
+                         evec_acc, 12*sites, w_acc, 12*sites,  
+                         &beta, 
+                         c_acc, Nevec_acc);
+      //Glog << stat << std::endl;
+      
+      grid->GlobalSumVector((double*)c_acc,2*Nu*Nevec_acc);
+      
+      for (int i=0; i<Nu; ++i) {
+        for (size_t j=0; j<Nevec_acc; ++j) {
+          cuDoubleComplex z = c_acc[i*Nevec_acc+j];
+          MyComplex ip(z.x,z.y);
+          if (do_print) {
+            Glog << "<evec,w>[" << j << "," << i << "] = " 
+                 << z.x << " + i " << z.y << std::endl;
+          }
+          w[i] = w[i] - ip * evec[b*Nevec_acc+j];
+        }
+        //assert(normalize(w[i],do_print)!=0);
+      }
+    }
+
+    for (int i=0; i<Nu; ++i) {
+      assert(normalize(w[i],do_print)!=0);
+    }
     
     Glog << "cuBLAS Zgemm done"<< std::endl;
     
-    for (int i=0; i<_Nu; ++i) {
-      for (size_t j=0; j<_R; ++j) {
-        cuDoubleComplex z = c_acc[i*_R+j];
-        MyComplex ip(z.x,z.y);
-        if (_print) {
-          Glog << "<evec,w>[" << j << "," << i << "] = " 
-               << z.x << " + i " << z.y << std::endl;
-        }
-        w[i] = w[i] - ip * evec[j];
-      }
-      assert(normalize(w[i],_print)!=0);
-    }
+    cublasDestroy(handle);
     
-    cudaFree(w_acc);
-    cudaFree(evec_acc);
-    cudaFree(c_acc);
-
     Glog << "cuBLAS orthogonalize done" << std::endl;
 #else
     Glog << "BLAS wrapper is not implemented" << std::endl;
@@ -411,6 +401,21 @@ for( int i =0;i<total;i++){
             std::vector<Field>& evec, 
             const std::vector<Field>& src, int& Nconv, LanczosType Impl)
   {
+#ifdef GRID_NVCC
+    GridBase *grid = src[0].Grid();
+    grid->show_decomposition();
+    
+    // set eigenvector buffers for the cuBLAS calls
+    //const uint64_t nsimd = grid->Nsimd();
+    const uint64_t sites = grid->lSites();
+    
+    cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(cuDoubleComplex));
+    //Glog << cudaStat << std::endl;
+    cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(cuDoubleComplex));
+    //Glog << cudaStat << std::endl;
+    cudaStat = cudaMallocManaged((void **)&c_acc, Nu*Nevec_acc*sizeof(cuDoubleComplex));
+    //Glog << cudaStat << std::endl;
+#endif
     switch (Impl) {
       case LanczosType::irbl: 
         calc_irbl(eval,evec,src,Nconv);
@@ -420,6 +425,12 @@ for( int i =0;i<total;i++){
         calc_rbl(eval,evec,src,Nconv);
         break;
     }
+#ifdef GRID_NVCC
+    // free eigenvector buffers for the cuBLAS calls
+    cudaFree(w_acc);
+    cudaFree(evec_acc);
+    cudaFree(c_acc);
+#endif
   }
 
   void calc_irbl(std::vector<RealD>& eval,  
@@ -919,8 +930,8 @@ if(!split_test){
       orthogonalize(w[u],evec,R);
     }
 #else
-      //orthogonalize(w,Nu,evec,R);
-      orthogonalize_blas(w,Nu,evec,R);
+    //orthogonalize(w,Nu,evec,R);
+    orthogonalize_blas(w,evec,R);
 #endif
     // QR part
     for (int u=1; u<Nu; ++u) {

From b89b1280d5d384ce10ea8fc77eae1c804d2e9569 Mon Sep 17 00:00:00 2001
From: Yong-Chull Jang <integration.field@gmail.com>
Date: Tue, 31 Mar 2020 05:39:31 -0400
Subject: [PATCH 5/5] use gemm twice to complete the Gram Schmidt

---
 .../ImplicitlyRestartedBlockLanczos.h         | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
index cd6bd682..95fedbca 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
@@ -303,7 +303,7 @@ public:
       //Glog << stat << std::endl;
       
       grid->GlobalSumVector((double*)c_acc,2*Nu*Nevec_acc);
-      
+#if 0      
       for (int i=0; i<Nu; ++i) {
         for (size_t j=0; j<Nevec_acc; ++j) {
           cuDoubleComplex z = c_acc[i*Nevec_acc+j];
@@ -314,10 +314,27 @@ public:
           }
           w[i] = w[i] - ip * evec[b*Nevec_acc+j];
         }
-        //assert(normalize(w[i],do_print)!=0);
+      }
+#else
+      alpha = make_cuDoubleComplex(-1.0,0.0);
+      beta = make_cuDoubleComplex(1.0,0.0);
+      stat = cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 12*sites, Nu, Nevec_acc,
+                         &alpha, 
+                         evec_acc, 12*sites, c_acc, Nevec_acc,  
+                         &beta, 
+                         w_acc, 12*sites);
+      //Glog << stat << std::endl;
+#endif
+    }
+#if 1    
+    for (int col=0; col<Nu; ++col) {
+      auto w_v = w[col].View();
+      cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
+      for (size_t row=0; row<sites*12; ++row) {
+        z[row] = w_acc[col*sites*12+row];
       }
     }
-
+#endif
     for (int i=0; i<Nu; ++i) {
       assert(normalize(w[i],do_print)!=0);
     }