Merge remote-tracking branch 'origin/chulwoo-dec12-2015'

Merge Chulwoo's Lanczos related improvements. Merge Nd!=4 fixes for pure gauge HMC from Evan.
2025-11-10 00:29:32 +00:00 · 2016-03-08 09:55:14 +00:00
parent 2dce9c3cff 1e72bd8b8c
commit 090e7aa930
27 changed files with 4429 additions and 3388 deletions
--- a/7243
+++ b/7243
--- a/configure.ac
+++ b/configure.ac
@@ -227,6 +227,26 @@ esac

 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])

+#
+# Lapack
+#
+AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+
+case ${ac_LAPACK} in
+     yes)
+       echo Enabling lapack
+     ;;
+     no)
+       echo Disabling lapack
+     ;;
+     *)
+       echo Enabling lapack at ${ac_LAPACK}
+     ;;
+esac
+
+AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
+AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
+
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
@@ -270,6 +290,7 @@ The following features are enabled:
 - communications type           : ${ac_COMMS}
 - default precision             : ${ac_PRECISION}
 - RNG choice                    : ${ac_RNG} 
+- LAPACK	                : ${ac_LAPACK} 


 "
--- a/lib/.dirstamp
+++ b/lib/.dirstamp
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -30,6 +30,15 @@
 /* GRID_DEFAULT_PRECISION is SINGLE */
 #undef GRID_DEFAULT_PRECISION_SINGLE

+/* Support Altivec instructions */
+#undef HAVE_ALTIVEC
+
+/* Support AVX (Advanced Vector Extensions) instructions */
+#undef HAVE_AVX
+
+/* Support AVX2 (Advanced Vector Extensions 2) instructions */
+#undef HAVE_AVX2
+
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #undef HAVE_DECL_BE64TOH
@@ -44,6 +53,9 @@
 /* Define to 1 if you have the <execinfo.h> header file. */
 #undef HAVE_EXECINFO_H

+/* Support FMA3 (Fused Multiply-Add) instructions */
+#undef HAVE_FMA
+
 /* Define to 1 if you have the `gettimeofday' function. */
 #undef HAVE_GETTIMEOFDAY

@@ -62,9 +74,30 @@
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H

+/* Support mmx instructions */
+#undef HAVE_MMX
+
 /* Define to 1 if you have the <mm_malloc.h> header file. */
 #undef HAVE_MM_MALLOC_H

+/* Support SSE (Streaming SIMD Extensions) instructions */
+#undef HAVE_SSE
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#undef HAVE_SSE2
+
+/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
+#undef HAVE_SSE3
+
+/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
+#undef HAVE_SSE4_1
+
+/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
+#undef HAVE_SSE4_2
+
+/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
+#undef HAVE_SSSE3
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H

@@ -107,6 +140,9 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME

+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION

--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -222,6 +222,7 @@ namespace Grid {
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
      virtual  RealD Mpc      (const Field &in, Field &out) {
 	Field tmp(in._grid);
+//	std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
@@ -251,10 +252,10 @@ namespace Grid {
      virtual  RealD Mpc      (const Field &in, Field &out) {
 	Field tmp(in._grid);

-	_Mat.Meooe(in,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-	_Mat.MooeeInv(tmp,out);
+	_Mat.Meooe(in,out);
+	_Mat.MooeeInv(out,tmp);
+	_Mat.Meooe(tmp,out);
+	_Mat.MooeeInv(out,tmp);

 	return axpy_norm(out,-1.0,tmp,in);
      }
@@ -270,6 +271,35 @@ namespace Grid {
      }
    };

+    template<class Matrix,class Field>
+      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
+    protected:
+      Matrix &_Mat;
+    public:
+      SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
+
+      virtual  RealD Mpc      (const Field &in, Field &out) {
+	Field tmp(in._grid);
+
+	_Mat.MooeeInv(in,out);
+	_Mat.Meooe(out,tmp);
+	_Mat.MooeeInv(tmp,out);
+	_Mat.Meooe(out,tmp);
+
+	return axpy_norm(out,-1.0,tmp,in);
+      }
+      virtual  RealD MpcDag   (const Field &in, Field &out){
+	Field tmp(in._grid);
+
+	_Mat.MeooeDag(in,out);
+	_Mat.MooeeInvDag(out,tmp);
+	_Mat.MeooeDag(tmp,out);
+	_Mat.MooeeInvDag(out,tmp);
+
+	return axpy_norm(out,-1.0,tmp,in);
+      }
+    };
+

    /////////////////////////////////////////////////////////////
    // Base classes for functions of operators
--- a/lib/algorithms/approx/.dirstamp
+++ b/lib/algorithms/approx/.dirstamp
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -58,13 +58,14 @@ namespace Grid {
      Field Mtmp(in._grid);
      AtoN = in;
      out = AtoN*Coeffs[0];
-      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
-      //      std::cout <<"0 " <<norm2(out)<<std::endl;
+//            std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
+//            std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
      for(int n=1;n<Coeffs.size();n++){
 	Mtmp = AtoN;
 	Linop.HermOp(Mtmp,AtoN);
 	out=out+AtoN*Coeffs[n];
-	//	std::cout << n<<" " <<norm2(out)<<std::endl;
+//            std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
+//		std::cout << n<<" " <<norm2(out)<<std::endl;
      }
    };
  };
@@ -82,7 +83,8 @@ namespace Grid {

  public:
    void csv(std::ostream &out){
-      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+	RealD diff = hi-lo;
+      for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
 	RealD f = approx(x);
 	out<< x<<" "<<f<<std::endl;
      }
@@ -99,10 +101,24 @@ namespace Grid {

    Chebyshev(){};
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
+    Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};

    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
    ////////////////////////////////////////////////////////////////////////////////////////////////////
+// CJ: the one we need for Lanczos
+    void Init(RealD _lo,RealD _hi,int _order)
+    {
+      lo=_lo;
+      hi=_hi;
+      order=_order;
+      
+      if(order < 2) exit(-1);
+      Coeffs.resize(order);
+      Coeffs.assign(0.,order);
+      Coeffs[order-1] = 1.;
+    };
+
    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
    {
      lo=_lo;
@@ -182,6 +198,8 @@ namespace Grid {
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

      GridBase *grid=in._grid;
+//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
+//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;

      int vol=grid->gSites();

--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -274,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-  assert(0);
+//  assert(0);
 }

  };
--- a/lib/algorithms/iterative/EigenSort.h
+++ b/lib/algorithms/iterative/EigenSort.h
@@ -38,6 +38,8 @@ template<class Field>
 class SortEigen {
 private:
  
+//hacking for testing for now
+#if 0
  static bool less_lmd(RealD left,RealD right){
    return fabs(left) < fabs(right);
  }  
@@ -45,6 +47,15 @@ class SortEigen {
 		 std::pair<RealD,Field>& right){
    return fabs(left.first) < fabs(right.first);
  }  
+#else
+  static bool less_lmd(RealD left,RealD right){
+    return left > right;
+  }  
+  static bool less_pair(std::pair<RealD,Field>& left,
+		 std::pair<RealD,Field>& right){
+    return left.first > (right.first);
+  }  
+#endif
  
 public:

--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -29,6 +29,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_IRL_H
 #define GRID_IRL_H

+#include <string.h> //memset
+#ifdef USE_LAPACK
+#include <lapacke.h>
+#endif
 #include <algorithms/iterative/DenseMatrix.h>
 #include <algorithms/iterative/EigenSort.h>

@@ -49,6 +53,7 @@ public:
    int Niter;
    int converged;

+    int Nstop;   // Number of evecs checked for convergence
    int Nk;      // Number of converged sought
    int Np;      // Np -- Number of spare vecs in kryloc space
    int Nm;      // Nm -- total number of vectors
@@ -57,6 +62,8 @@ public:

    SortEigen<Field> _sort;

+//    GridCartesian &_fgrid;
+
    LinearOperatorBase<Field> &_Linop;

    OperatorFunction<Field>   &_poly;
@@ -67,7 +74,27 @@ public:
    void init(void){};
    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);

-    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
+    ImplicitlyRestartedLanczos(
+				LinearOperatorBase<Field> &Linop, // op
+			       OperatorFunction<Field> & poly,   // polynmial
+			       int _Nstop, // sought vecs
+			       int _Nk, // sought vecs
+			       int _Nm, // spare vecs
+			       RealD _eresid, // resid in lmdue deficit 
+			       int _Niter) : // Max iterations
+      _Linop(Linop),
+      _poly(poly),
+      Nstop(_Nstop),
+      Nk(_Nk),
+      Nm(_Nm),
+      eresid(_eresid),
+      Niter(_Niter)
+    { 
+      Np = Nm-Nk; assert(Np>0);
+    };
+
+    ImplicitlyRestartedLanczos(
+				LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynmial
 			       int _Nk, // sought vecs
 			       int _Nm, // spare vecs
@@ -75,6 +102,7 @@ public:
 			       int _Niter) : // Max iterations
      _Linop(Linop),
      _poly(poly),
+      Nstop(_Nk),
      Nk(_Nk),
      Nm(_Nm),
      eresid(_eresid),
@@ -142,6 +170,7 @@ public:
      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
                                 // 7. vk+1 := wk/βk+1

+//	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
      const RealD tiny = 1.0e-20;
      if ( beta < tiny ) { 
 	std::cout << " beta is tiny "<<beta<<std::endl;
@@ -219,15 +248,122 @@ public:
      }
    }

+#ifdef USE_LAPACK
+    void diagonalize_lapack(DenseVector<RealD>& lmd,
+		     DenseVector<RealD>& lme, 
+		     int N1,
+		     int N2,
+		     DenseVector<RealD>& Qt,
+		     GridBase *grid){
+  const int size = Nm;
+//  tevals.resize(size);
+//  tevecs.resize(size);
+  int NN = N1;
+  double evals_tmp[NN];
+  double evec_tmp[NN][NN];
+  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+//  double AA[NN][NN];
+  double DD[NN];
+  double EE[NN];
+  for (int i = 0; i< NN; i++)
+    for (int j = i - 1; j <= i + 1; j++)
+      if ( j < NN && j >= 0 ) {
+        if (i==j) DD[i] = lmd[i];
+        if (i==j) evals_tmp[i] = lmd[i];
+        if (j==(i-1)) EE[j] = lme[j];
+      }
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+  int liwork =  3+NN*10 ;
+  int iwork[liwork];
+  double work[lwork];
+  int isuppz[2*NN];
+  char jobz = 'V'; // calculate evals & evecs
+  char range = 'I'; // calculate all evals
+  //    char range = 'A'; // calculate all evals
+  char uplo = 'U'; // refer to upper half of original matrix
+  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+  int ifail[NN];
+  int info;
+//  int total = QMP_get_number_of_nodes();
+//  int node = QMP_get_node_number();
+//  GridBase *grid = evec[0]._grid;
+  int total = grid->_Nprocessors;
+  int node = grid->_processor;
+  int interval = (NN/total)+1;
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1);
+  if (iu > NN)  iu=NN;
+  double tol = 0.0;
+    if (1) {
+      memset(evals_tmp,0,sizeof(double)*NN);
+      if ( il <= NN){
+        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
+        LAPACK_dstegr(&jobz, &range, &NN,
+            (double*)DD, (double*)EE,
+            &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+            &tol, // tolerance
+            &evals_found, evals_tmp, (double*)evec_tmp, &NN,
+            isuppz,
+            work, &lwork, iwork, &liwork,
+            &info);
+        for (int i = iu-1; i>= il-1; i--){
+          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
+          evals_tmp[i] = evals_tmp[i - (il-1)];
+          if (il>1) evals_tmp[i-(il-1)]=0.;
+          for (int j = 0; j< NN; j++){
+            evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+            if (il>1) evec_tmp[i-(il-1)][j]=0.;
+          }
+        }
+      }
+      {
+//        QMP_sum_double_array(evals_tmp,NN);
+//        QMP_sum_double_array((double *)evec_tmp,NN*NN);
+         grid->GlobalSumVector(evals_tmp,NN);
+         grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+      }
+    } 
+// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    for(int j=0;j<NN;j++)
+      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
+      lmd [NN-1-i]=evals_tmp[i];
+  }
+}
+#endif
+
+
    void diagonalize(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
-		     int Nm2,
-		     int Nm,
-		     DenseVector<RealD>& Qt)
+		     int N2,
+		     int N1,
+		     DenseVector<RealD>& Qt,
+		     GridBase *grid)
    {
-      int Niter = 100*Nm;
+
+#ifdef USE_LAPACK
+    const int check_lapack=0; // just use lapack if 0, check against lapack if 1
+
+    if(!check_lapack)
+	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
+
+	DenseVector <RealD> lmd2(N1);
+	DenseVector <RealD> lme2(N1);
+	DenseVector<RealD> Qt2(N1*N1);
+         for(int k=0; k<N1; ++k){
+	    lmd2[k] = lmd[k];
+	    lme2[k] = lme[k];
+	  }
+         for(int k=0; k<N1*N1; ++k)
+	Qt2[k] = Qt[k];
+
+//	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+#endif
+
+      int Niter = 100*N1;
      int kmin = 1;
-      int kmax = Nk;
+      int kmax = N2;
      // (this should be more sophisticated)

      for(int iter=0; iter<Niter; ++iter){
@@ -239,7 +375,7 @@ public:
 	// (Dsh: shift)
 	
 	// transformation
-	qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
+	qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
 	
 	// Convergence criterion (redef of kmin and kamx)
 	for(int j=kmax-1; j>= kmin; --j){
@@ -250,6 +386,23 @@ public:
 	  }
 	}
 	Niter = iter;
+#ifdef USE_LAPACK
+    if(check_lapack){
+	const double SMALL=1e-8;
+	diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
+	DenseVector <RealD> lmd3(N2);
+         for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
+        _sort.push(lmd3,N2);
+        _sort.push(lmd2,N2);
+         for(int k=0; k<N2; ++k){
+	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
+//	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
+	  }
+         for(int k=0; k<N1*N1; ++k){
+//	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
+	}
+    }
+#endif
 	return;

      continued:
@@ -265,6 +418,7 @@ public:
      abort();
    }

+#if 1
    static RealD normalise(Field& v) 
    {
      RealD nn = norm2(v);
@@ -326,6 +480,7 @@ until convergence
      {

 	GridBase *grid = evec[0]._grid;
+	assert(grid == src._grid);

 	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 	std::cout << " -- Nm = " << Nm << std::endl;
@@ -356,11 +511,21 @@ until convergence
 	// (uniform vector) Why not src??
 	//	evec[0] = 1.0;
 	evec[0] = src;
+	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
+// << src._grid  << std::endl;
 	normalise(evec[0]);
+	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
+// << evec[0]._grid << std::endl;
 	
 	// Initial Nk steps
 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+//	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
+//	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 	RitzMatrix(evec,Nk);
+	for(int k=0; k<Nk; ++k){
+//	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
+//	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
+	}

 	// Restarting loop begins
 	for(int iter = 0; iter<Niter; ++iter){
@@ -382,16 +547,19 @@ until convergence
 	    lme2[k] = lme[k+k1-1];
 	  }
 	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nm,Nm,Qt);
+	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid);

 	  // sorting
 	  _sort.push(eval2,Nm);
 	  
 	  // Implicitly shifted QR transformations
 	  setUnit_Qt(Nm,Qt);
-	  for(int ip=k2; ip<Nm; ++ip) 
+	  for(int ip=k2; ip<Nm; ++ip){ 
+	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
 		
+	}
+    
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 	  
 	  for(int j=k1-1; j<k2+1; ++j){
@@ -418,7 +586,7 @@ until convergence
 	    lme2[k] = lme[k];
 	  }
 	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nk,Nm,Qt);
+	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
 	  
 	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
 	  
@@ -426,13 +594,16 @@ until convergence
 	    for(int k = 0; k<Nk; ++k){
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
+//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 	  }
+//	_sort.push(eval2,B,Nk);

 	  Nconv = 0;
 	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 	  for(int i=0; i<Nk; ++i){

-	    _poly(_Linop,B[i],v);
+//	    _poly(_Linop,B[i],v);
+	    _Linop.HermOp(B[i],v);
 	    
 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 	    RealD vden = norm2(B[i]);
@@ -440,11 +611,13 @@ until convergence
 	    v -= eval2[i]*B[i];
 	    RealD vv = norm2(v);
 	    
+	    std::cout.precision(13);
 	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
 	    
-	    if(vv<eresid*eresid){
+	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+	    if((vv<eresid*eresid) && (i == Nconv) ){
 	      Iconv[Nconv] = i;
 	      ++Nconv;
 	    }
@@ -455,7 +628,7 @@ until convergence

 	  std::cout<<" #modes converged: "<<Nconv<<std::endl;

-	  if( Nconv>=Nk ){
+	  if( Nconv>=Nstop ){
 	    goto converged;
 	  }
 	} // end of iter loop
@@ -1025,6 +1198,7 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx

  }
 }
+#endif


 };
--- a/lib/communicator/.dirstamp
+++ b/lib/communicator/.dirstamp
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -32,7 +32,11 @@ namespace Grid {

  // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
+  int flag;
+  MPI_Initialized(&flag); // needed to coexist with other libs apparently
+  if ( !flag ) {
    MPI_Init(argc,argv);
+  }
 }

  int Rank(void) {
@@ -49,6 +53,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
+  std::cout << processors << std::endl;
  
  MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -90,7 +90,7 @@ namespace QCD {
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

-    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

    // Spin matrix
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -113,6 +113,8 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

+#define GparityFermOpTemplateInstantiate(A) 
+
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
@@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;

+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
+typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
+typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
+typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
+typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
+
+
  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
--- a/lib/qcd/action/fermion/.dirstamp
+++ b/lib/qcd/action/fermion/.dirstamp
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -527,6 +527,7 @@ namespace QCD {
  }

  FermOpTemplateInstantiate(CayleyFermion5D);
+  GparityFermOpTemplateInstantiate(CayleyFermion5D);

 }}

--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@@ -48,14 +48,16 @@ namespace Grid {
 			GridCartesian         &FourDimGrid,
 			GridRedBlackCartesian &FourDimRedBlackGrid,
 			RealD _mass,RealD _M5,
-			RealD scale) :
+//			RealD scale):
+			RealD scale,const ImplParams &p= ImplParams()) :
      
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
      MobiusFermion<Impl>(_Umu,
 		    FiveDimGrid,
 		    FiveDimRedBlackGrid,
 		    FourDimGrid,
-		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
+	FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
+//		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
      {
      }

--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -403,6 +403,7 @@ PARALLEL_FOR_LOOP

 
  FermOpTemplateInstantiate(WilsonFermion);
+  GparityFermOpTemplateInstantiate(WilsonFermion);


 }}
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -595,6 +595,7 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 }

 FermOpTemplateInstantiate(WilsonFermion5D);
+GparityFermOpTemplateInstantiate(WilsonFermion5D);

 }}

--- a/lib/qcd/hmc/.dirstamp
+++ b/lib/qcd/hmc/.dirstamp
--- a/lib/qcd/spin/.dirstamp
+++ b/lib/qcd/spin/.dirstamp
--- a/lib/qcd/utils/.dirstamp
+++ b/lib/qcd/utils/.dirstamp
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -83,7 +83,7 @@ public:
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static RealD sumPlaquette(const GaugeLorentz &Umu){
-    std::vector<GaugeMat> U(4,Umu._grid);
+    std::vector<GaugeMat> U(Nd,Umu._grid);

    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -111,7 +111,7 @@ public:
    return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
  }
  static RealD linkTrace(const GaugeLorentz &Umu){
-    std::vector<GaugeMat> U(4,Umu._grid);
+    std::vector<GaugeMat> U(Nd,Umu._grid);

    LatticeComplex Tr(Umu._grid); Tr=zero;
    for(int mu=0;mu<Nd;mu++){
@@ -124,7 +124,7 @@ public:

    double vol = Umu._grid->gSites();

-    return p.real()/vol/4.0/3.0;
+    return p.real()/vol/((double)(Nd*(Nd-1)));
  };
  //////////////////////////////////////////////////
  // the sum over all staples on each site
@@ -133,7 +133,7 @@ public:

    GridBase *grid = Umu._grid;

-    std::vector<GaugeMat> U(4,grid);
+    std::vector<GaugeMat> U(Nd,grid);
    for(int d=0;d<Nd;d++){
      U[d] = PeekIndex<LorentzIndex>(Umu,d);
    }
@@ -203,7 +203,7 @@ public:
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static RealD sumRectangle(const GaugeLorentz &Umu){
-    std::vector<GaugeMat> U(4,Umu._grid);
+    std::vector<GaugeMat> U(Nd,Umu._grid);

    for(int mu=0;mu<Nd;mu++){
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -329,7 +329,7 @@ public:
  static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
    GridBase *grid = Umu._grid;

-    std::vector<GaugeMat> U(4,grid);
+    std::vector<GaugeMat> U(Nd,grid);
    for(int d=0;d<Nd;d++){
      U[d] = PeekIndex<LorentzIndex>(Umu,d);
    }
--- a/lib/stamp-h1
+++ b/lib/stamp-h1
@@ -1 +0,0 @@
-timestamp for lib/Config.h
--- a/lib/stencil/.dirstamp
+++ b/lib/stencil/.dirstamp
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -8,6 +8,16 @@ endif
 AM_CXXFLAGS = -I$(top_srcdir)/lib
 AM_LDFLAGS = -L$(top_builddir)/lib

+if USE_LAPACK
+AM_CXXFLAGS += -DUSE_LAPACK
+if USE_LAPACK_LIB
+#if test "X${ac_LAPACK}X" != XyesX 
+AM_CXXFLAGS += -I$(ac_LAPACK)/include
+AM_LDFLAGS += -L$(ac_LAPACK)/lib
+#fi
+endif
+endif
+
 if BUILD_ZMM
  bin_PROGRAMS=Test_zmm
 endif
--- a/tests/Test_dwf_lanczos.cc
+++ b/tests/Test_dwf_lanczos.cc
@@ -31,6 +31,10 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

+typedef typename GparityDomainWallFermionR::FermionField FermionField;
+
+RealD AllZero(RealD x){ return 0.;}
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -41,41 +45,56 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);

-  LatticeFermion    src(FGrid); gaussian(RNG5,src);
  LatticeGaugeField Umu(UGrid); 
-  SU3::HotConfiguration(RNG4, Umu);
+  SU3::TepidConfiguration(RNG4, Umu);

  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  
-  RealD mass=0.1;
+  RealD mass=0.01;
  RealD M5=1.8;
-  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  RealD mob_b=1.5;
+//  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  GparityMobiusFermionD ::ImplParams params;
+  std::vector<int> twists({1,1,1,0});
+  params.twists = twists;
+  GparityMobiusFermionR  Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);

-  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
+//  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
+//  SchurDiagTwoOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
+  SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
+//  SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);

-  const int Nk = 30;
-  const int Np = 10;
+  const int Nstop = 30;
+  const int Nk = 40;
+  const int Np = 40;
  const int Nm = Nk+Np;
  const int MaxIt= 10000;
  RealD resid = 1.0e-8;

-  std::vector<double> Coeffs(1,1.0);
-  Polynomial<LatticeFermion> PolyX(Coeffs);
-  ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,Nk,Nm,resid,MaxIt);
+  std::vector<double> Coeffs { 0.,-1.};
+  Polynomial<FermionField> PolyX(Coeffs);
+  Chebyshev<FermionField> Cheb(0.2,5.,11);
+//  ChebyshevLanczos<LatticeFermion> Cheb(9.,1.,0.,20);
+//  Cheb.csv(std::cout);
+//  exit(-24);
+  ImplicitlyRestartedLanczos<FermionField> IRL(HermOp,Cheb,Nstop,Nk,Nm,resid,MaxIt);

  
  std::vector<RealD>          eval(Nm);
-  std::vector<LatticeFermion> evec(Nm,FGrid);
-  for(int i=0;i<Nm;i++){
+  FermionField    src(FrbGrid); gaussian(RNG5rb,src);
+  std::vector<FermionField> evec(Nm,FrbGrid);
+  for(int i=0;i<1;i++){
    std::cout << i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
  };