Merge branch 'master' of https://github.com/paboyle/Grid

2025-11-07 23:29:32 +00:00 · 2015-12-09 12:48:44 +00:00
parent 967be91692 26161addd0
commit a32a59fc43
24 changed files with 406 additions and 221 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=100;
+  int ncall=10000;
  {
    double t0=usecond();
    for(int i=0;i<ncall;i++){
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -137,9 +137,6 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
 /* Define to the home page for this package. */
 #undef PACKAGE_URL
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -17,7 +17,6 @@
 #define __X86_64
 #ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -5,22 +5,29 @@
 #include <ctime>
 #include <chrono>
 #include <string.h>
 #include <sys/ioctl.h>
 #include <sys/syscall.h>
 #include <linux/perf_event.h>
 #include <sys/ioctl.h>
 #ifdef __linux__
 #include <syscall.h>
 #include <linux/perf_event.h>
 #else
 #include <sys/syscall.h>
 #endif
 namespace Grid {
 #ifdef __linux__
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 			    int cpu, int group_fd, unsigned long flags)
 {
-  int ret;
+  int ret=0;
  ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
 		group_fd, flags);
  return ret;
 }
 #endif
 class PerformanceCounter {
@@ -63,7 +70,6 @@ public:
  int PCT;
  struct perf_event_attr pe;
  long long count;
  int fd;
  uint64_t elapsed;
@@ -74,15 +80,19 @@ public:
  }
  PerformanceCounter(int _pct) {
 #ifdef __linux__
    assert(_pct>=0);
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
    fd=-1;
    count=0;
    PCT =_pct;
    Open();
 #endif
  }
  void Open(void) 
  {
 #ifdef __linux__
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(struct perf_event_attr));
    pe.size = sizeof(struct perf_event_attr);
@@ -99,32 +109,48 @@ public:
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
 #endif
  }
  void Start(void)
  {
 #ifdef __linux__
    if ( fd!= -1) {
      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    }
    begin  =__rdtsc();
 #else
    begin = 0;
 #endif
  }
  void Stop(void) {
    count=0;
 #ifdef __linux__
    if ( fd!= -1) {
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::read(fd, &count, sizeof(long long));
    }
    elapsed = __rdtsc() - begin;
 #else
    elapsed = 0;
 #endif
  }
  void Report(void) {
 #ifdef __linux__
    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 #else
    printf("%llu cycles \n", elapsed );
 #endif
  }
  ~PerformanceCounter()
  {
 #ifdef __linux__
    close(fd);
 #endif
  }
 };
--- a/lib/Threads.h
+++ b/lib/Threads.h
@@ -44,7 +44,7 @@ class GridThread {
  };
  static void SetMaxThreads(void) { 
 #ifdef GRID_OMP
-    setenv("KMP_AFFINITY","balanced",1);
+    //    setenv("KMP_AFFINITY","balanced",1);
    _threads = omp_get_max_threads();
    omp_set_num_threads(_threads);
 #else 
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -264,6 +264,9 @@ PARALLEL_FOR_LOOP
      for(int i=0;i<nbasis;i++){
 	phi=Subspace.subspace[i];
 	std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
 	for(int p=0;p<geom.npoint;p++){ 
 	  int dir   = geom.directions[p];
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -166,7 +166,6 @@ namespace Grid {
      Field *Tn  = &T1;
      Field *Tnp = &T2;
      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
      // Tn=T1 = (xscale M + mscale)in
      RealD xscale = 2.0/(hi-lo);
      RealD mscale = -(hi+lo)/(hi-lo);
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -25,6 +25,9 @@ template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
  assert(N==M);
 }
 template<class T> void Resize(DenseVector<T > & mat, int N) { 
  mat.resize(N);
 }
 template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
  mat.resize(N);
  for(int i=0;i<N;i++){
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -1,4 +1,3 @@
 #if 0
 #ifndef GRID_IRL_H
 #define GRID_IRL_H
@@ -18,8 +17,9 @@ template<class Field>
    const RealD small = 1.0e-16;
 public:       
    int lock;
-    int converged;
+    int get;
    int Niter;
    int converged;
    int Nk;      // Number of converged sought
    int Np;      // Np -- Number of spare vecs in kryloc space
@@ -59,6 +59,7 @@ public:
    // Sanity checked this routine (step) against Saad.
    /////////////////////////
    void RitzMatrix(DenseVector<Field>& evec,int k){
      if(1) return;
      GridBase *grid = evec[0]._grid;
@@ -451,8 +452,9 @@ until convergence
 	std::cout << " -- Nconv       = "<< Nconv  << "\n";
      }
-
+    /////////////////////////////////////////////////
    // Adapted from Rudy's lanczos factor routine
    /////////////////////////////////////////////////
    int Lanczos_Factor(int start, int end,  int cont,
 		       DenseVector<Field> & bq, 
 		       Field &bf,
@@ -546,10 +548,16 @@ until convergence
 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
 #if 0
 	int re = 0;
 	// FIXME undefined params; how set in Rudy's code
 	int ref =0;
 	Real rho = 1.0e-8;
 	while( re == ref || (sqbt < rho * bck && re < 5) ){
 	  Field tmp2(grid);
 	  Field tmp1(grid);
 	  //bex = V^dag bf
 	  DenseVector<ComplexD> bex(j+1);
 	  for(int k=0;k<j+1;k++){
@@ -566,14 +574,14 @@ until convergence
 	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
 	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + bex[j];	      sqbt = sqrt(real(btc));	      
+	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
 	  // FIXME is alpha real in RUDY's code?
 	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
 	  bck = sqrt( nmbex );
 	  re++;
 	}
 	std::cout << "Iteratively refined orthogonality, changes alpha\n";
 	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
 #endif
 	H[j][j]=alpha;
      }
@@ -641,7 +649,7 @@ until convergence
      int M=Nm;
      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm,Nm);
+      Resize(evals,Nm);
      Resize(evecs,Nm);
      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
@@ -702,7 +710,6 @@ until convergence
 	RealD beta;
 	Householder_vector<RealD>(ck, 0, 2, v, beta);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
 	///Accumulate eigenvector
@@ -758,11 +765,11 @@ until convergence
      RealD resid_nrm=  norm2(bf);
      if(!lock) converged = 0;
-
+#if 0
      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){
 	RealD diff = 0;
-	diff = abs(tevecs[i][Nm - 1 - lock_num]) * resid_nrm;
+	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
@@ -785,53 +792,29 @@ until convergence
 	  break;
 	}
      }
 #endif
      std::cout << "Got " << converged << " so far " <<std::endl;	
    }
 #if 0
    ///Check
    void Check(void) {
-      DenseVector<RealD> goodval(get);
+    ///Check
    void Check(DenseVector<RealD> &evals,
 	       DenseVector<DenseVector<RealD> > &evecs) {
      DenseVector<RealD> goodval(this->get);
      EigenSort(evals,evecs);
      int NM = Nm;
      int Nget = this->get;
      S **V;
      V = new S* [NM];
-      RealD *QZ;
+      DenseVector< DenseVector<RealD> > V; Size(V,NM);
-      QZ = new RealD [NM*NM];
+      DenseVector<RealD> QZ(NM*NM);
      for(int i = 0; i < NM; i++){
 	for(int j = 0; j < NM; j++){
-
+	  // evecs[i][j];
 	  QZ[i*NM+j] = this->evecs[i][j];
          int f_size_cb = 24*dop.cbLs*dop.node_cbvol;
 	  for(int cb = this->prec; cb < 2; cb++){
 	    for(int i = 0; i < NM; i++){
 	      V[i] = (S*)(this->bq[i][cb]);
 	      const int m0 = 4 * 4; // this is new code
 	      assert(m0 % 16 == 0); // see the reason in VtimesQ.C
 	      const int row_per_thread = f_size_cb / (bfmarg::threads);
 	      {
 		{
 		  DenseVector<RealD> vrow_tmp0(m0*NM);
 		  DenseVector<RealD> vrow_tmp1(m0*NM);
 		  RealD *row_tmp0 = vrow_tmp0.data();
 		  RealD *row_tmp1 = vrow_tmp1.data();
 		  VtimesQ(QZ, NM, V, row_tmp0, row_tmp1, id * row_per_thread, m0, (id + 1) * row_per_thread);
 		}
 	      }
 	    }
 	  }
 	}
      }
    }
 #endif
 /**
@@ -1020,4 +1003,4 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx
 }
 #endif
-#endif
+
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -24,6 +24,17 @@ PARALLEL_FOR_LOOP
    return ret;
  }
  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=div(rhs._odata[ss],y);
    }
    return ret;
  }
  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -266,11 +266,8 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    if( this->HandOptDslash ) {
 #pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sU=ss;
 	  if (    LebesgueOrder::UseLebesgueOrder ) {
 	    sU=lo.Reorder(ss);
 	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
@@ -323,52 +320,42 @@ PARALLEL_FOR_LOOP
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
      /*
-#pragma omp parallel for 
+#pragma omp parallel for schedule(static)
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
-        int sswork, swork,soff, sU,sF;
+        int sswork, swork,soff,ssoff,  sU,sF;
-
+	
-	sswork = (nwork + cores-1)/cores;
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+core*sswork; // max locality within an L2 slice
+	  sU=ss+ ssoff;
-	  if ( LebesgueOrder::UseLebesgueOrder ) {
+	  for(int s=soff;s<soff+swork;s++){
-	    sU = lo.Reorder(sU);
+	    sF = s+Ls*sU;
 	    Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	  }
 	  if ( sU < nwork ) {
 	    for(int s=soff;s<soff+swork;s++){
 	      sF = s+Ls*sU;
 	      Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	    }
 	  }
 	}
      }
      /*
 #pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  int sU=ss;
 	  if (    LebesgueOrder::UseLebesgueOrder ) {
 	    sU=lo.Reorder(ss);
 	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
      */
 #pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  //	  int sU=lo.Reorder(ss);
 	  int sU=ss;
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -29,7 +29,7 @@ namespace Grid {
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 #if defined(AVX512) || defined(IMCI)
-     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 #else
--- a/lib/serialisation/BinaryIO.cc
+++ b/lib/serialisation/BinaryIO.cc
@@ -1,15 +1,14 @@
 #include <Grid.h>
 using namespace Grid;
 using namespace std;
 namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-BinaryWriter::BinaryWriter(const string &fileName)
+BinaryWriter::BinaryWriter(const std::string &fileName)
-: file_(fileName, ios::binary|ios::out)
+: file_(fileName, std::ios::binary|std::ios::out)
 {}
 template <>
-void BinaryWriter::writeDefault(const string &s, const string &output)
+void BinaryWriter::writeDefault(const std::string &s, const std::string &output)
 {
  uint64_t sz = output.size();
@@ -21,12 +20,12 @@ void BinaryWriter::writeDefault(const string &s, const string &output)
 }
 // Reader implementation ///////////////////////////////////////////////////////
-BinaryReader::BinaryReader(const string &fileName)
+BinaryReader::BinaryReader(const std::string &fileName)
-: file_(fileName, ios::binary|ios::in)
+: file_(fileName, std::ios::binary|std::ios::in)
 {}
 template <>
-void BinaryReader::readDefault(const string &s, string &output)
+void BinaryReader::readDefault(const std::string &s, std::string &output)
 {
  uint64_t sz;
@@ -34,3 +33,4 @@ void BinaryReader::readDefault(const string &s, string &output)
  output.reserve(sz);
  file_.read((char *)output.data(), sz);
 }
 }
--- a/lib/serialisation/TextIO.cc
+++ b/lib/serialisation/TextIO.cc
@@ -1,14 +1,12 @@
 #include <Grid.h>
-using namespace Grid;
+namespace Grid {
 using namespace std;
 // Writer implementation ///////////////////////////////////////////////////////
-TextWriter::TextWriter(const string &fileName)
+TextWriter::TextWriter(const std::string &fileName)
-: file_(fileName, ios::out)
+: file_(fileName, std::ios::out)
 {}
-void TextWriter::push(const string &s)
+void TextWriter::push(const std::string &s)
 {
  level_++;
 };
@@ -27,11 +25,11 @@ void TextWriter::indent(void)
 };
 // Reader implementation ///////////////////////////////////////////////////////
-TextReader::TextReader(const string &fileName)
+TextReader::TextReader(const std::string &fileName)
-: file_(fileName, ios::in)
+: file_(fileName, std::ios::in)
 {}
-void TextReader::push(const string &s)
+void TextReader::push(const std::string &s)
 {
  level_++;
 };
@@ -50,17 +48,18 @@ void TextReader::checkIndent(void)
    file_.get(c);
    if (c != '\t')
    {
-      cerr << "mismatch on tab " << c << " level " << level_;
+      std::cerr << "mismatch on tab " << c << " level " << level_;
-      cerr << " i "<< i <<endl;
+      std::cerr << " i "<< i <<std::endl;
-      abort();
+      std::abort();
    }
  }
 }
 template <>
-void TextReader::readDefault(const string &s, string &output)
+void TextReader::readDefault(const std::string &s, std::string &output)
 {
  checkIndent();
  output.clear();
  getline(file_, output);
 }
 }
--- a/lib/serialisation/XmlIO.cc
+++ b/lib/serialisation/XmlIO.cc
@@ -1,10 +1,8 @@
 #include <Grid.h>
-using namespace Grid;
+namespace Grid {
 using namespace std;
 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const string &fileName)
+XmlWriter::XmlWriter(const std::string &fileName)
 : fileName_(fileName)
 {
  node_ = doc_.append_child();
@@ -16,7 +14,7 @@ XmlWriter::~XmlWriter(void)
  doc_.save_file(fileName_.c_str(), "  ");
 }
-void XmlWriter::push(const string &s)
+void XmlWriter::push(const std::string &s)
 {
  node_ = node_.append_child(s.c_str());
 }
@@ -27,22 +25,22 @@ void XmlWriter::pop(void)
 }
 // Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const string &fileName)
+XmlReader::XmlReader(const std::string &fileName)
 : fileName_(fileName)
 {
  pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
  if ( !result )
  {
-    cerr << "XML error description: " << result.description() << "\n";
+    std::cerr << "XML error description: " << result.description() << "\n";
-    cerr << "XML error offset     : " << result.offset        << "\n";
+    std::cerr << "XML error offset     : " << result.offset        << "\n";
-    abort();
+    std::abort();
  }
  node_ = doc_.child("grid");
 }
-void XmlReader::push(const string &s)
+void XmlReader::push(const std::string &s)
 {
  node_ = node_.child(s.c_str());
 }
@@ -53,7 +51,8 @@ void XmlReader::pop(void)
 }
 template <>
-void XmlReader::readDefault(const string &s, string &output)
+void XmlReader::readDefault(const std::string &s, std::string &output)
 {
  output = node_.child(s.c_str()).first_child().value();
 }
 }
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -96,6 +96,7 @@ namespace Grid
      node_.child("elem").set_name("elem-done");
      i++;
    }
    //    assert( is.tellg()==-1);
    pop();
  }
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,6 +67,14 @@ namespace Grid {
    }
  };
  template<class scalar> struct DivIntFunctor {
    Integer y;
  DivIntFunctor(Integer _y) : y(_y) {};
    scalar operator()(const scalar &a)  const {
      return Integer(a)/y;
    }
  };
  template<class scalar> struct RealFunctor {
    scalar operator()(const scalar &a)  const {
      return real(a);
@@ -131,6 +139,10 @@ namespace Grid {
  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(ModIntFunctor<S>(y),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(DivIntFunctor<S>(y),r);
  }
  ////////////////////////////////////////////////////////////////////////////
  // Allows us to assign into **conformable** real vectors from complex
  ////////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -111,7 +111,7 @@ template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> type
  return ret;
 }
-
+BINARY_RSCALAR(div,Integer);
 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -59,7 +59,7 @@ clang-avx2)
 CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-avx-openmp)
-CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
+CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-xc30)
 CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS=""  LIBS="-lgmp -lmpfr" --enable-comms=none
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,6 @@
 bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_RectPlaq
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid
--- a/tests/Test_cheby.cc
+++ b/tests/Test_cheby.cc
@@ -57,5 +57,21 @@ int main (int argc, char ** argv)
    ChebyStep.csv(of);
  }
  lo=-8;
  hi=8;
  Chebyshev<LatticeFermion> ChebyIndefInv(lo,hi,40,InverseApproximation);
  {
    std::ofstream of("chebyindefinv");
    ChebyIndefInv.csv(of);
  }
  lo=0;
  hi=64;
  Chebyshev<LatticeFermion> ChebyNE(lo,hi,40,InverseApproximation);
  {
    std::ofstream of("chebyNE");
    ChebyNE.csv(of);
  }
  Grid_finalize();
 }
--- a/tests/Test_dwf_hdcr.cc
+++ b/tests/Test_dwf_hdcr.cc
@@ -6,6 +6,22 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 class myclass: Serializable {
 public:
  GRID_DECL_CLASS_MEMBERS(myclass,
 			  int, domaindecompose,
 			  int, domainsize,
 			  int, order,
 			  double, lo,
 			  double, hi,
 			  int, steps);
  myclass(){};
 };
 myclass params;
 RealD InverseApproximation(RealD x){
  return 1.0/x;
 }
@@ -26,15 +42,21 @@ public:
  Aggregates     & _Aggregates;
  CoarseOperator & _CoarseOperator;
-  Matrix         & _Matrix;
+  Matrix         & _FineMatrix;
  FineOperator   & _FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
  // Constructor
-  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, FineOperator &Fine,Matrix &FineMatrix) 
+  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
 			  FineOperator &Fine,Matrix &FineMatrix,
 			  FineOperator &Smooth,Matrix &SmootherMatrix) 
    : _Aggregates(Agg),
      _CoarseOperator(Coarse),
      _FineOperator(Fine),
-      _Matrix(FineMatrix)
+      _FineMatrix(FineMatrix),
      _SmootherOperator(Smooth),
      _SmootherMatrix(SmootherMatrix)
  {
  }
@@ -43,7 +65,7 @@ public:
    FineField p1(in._grid);
    FineField p2(in._grid);
-    MdagMLinearOperator<Matrix,FineField>   fMdagMOp(_Matrix);
+    MdagMLinearOperator<Matrix,FineField>   fMdagMOp(_FineMatrix);
    p1=in;
    RealD absp2;
@@ -58,74 +80,20 @@ public:
    }
  }
 #if 0
  void operator()(const FineField &in, FineField & out) {
-
+    if ( params.domaindecompose ) {
-    FineField Min(in._grid);
+      operatorSAP(in,out);
-    FineField tmp(in._grid);
+    } else { 
-
+      operatorCheby(in,out);
-    CoarseVector Csrc(_CoarseOperator.Grid());
+    }
    CoarseVector Ctmp(_CoarseOperator.Grid());
    CoarseVector Csol(_CoarseOperator.Grid());
    // Monitor completeness of low mode space
    _Aggregates.ProjectToSubspace  (Csrc,in);
    _Aggregates.PromoteFromSubspace(Csrc,out);
    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
    // Build some solvers
    ConjugateGradient<FineField>    fCG(1.0e-3,1000);
    ConjugateGradient<CoarseVector>  CG(1.0e-8,100000);
    ////////////////////////////////////////////////////////////////////////
    // ADEF2: [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    ////////////////////////////////////////////////////////////////////////
    // Smoothing step, followed by coarse grid correction
    MdagMLinearOperator<Matrix,FineField> MdagMOp(_Matrix);
    Min=in;
    std::cout<<GridLogMessage<< " Preconditioner in  " << norm2(in)<<std::endl; 
    _FineOperator.AdjOp(Min,tmp);
    std::cout<<GridLogMessage<< " Preconditioner tmp  " << norm2(in)<<std::endl; 
    fCG(MdagMOp,tmp,out);
    _FineOperator.Op(out,tmp);
    std::cout<<GridLogMessage<< " Preconditioner in  " << norm2(in)<<std::endl; 
    std::cout<<GridLogMessage<< " Preconditioner out " << norm2(out)<<std::endl; 
    std::cout<<GridLogMessage<< " Preconditioner Aout" << norm2(tmp)<<std::endl; 
    tmp = tmp - in;
    std::cout<<GridLogMessage<<"preconditioner thinks residual is "<<std::sqrt(norm2(tmp)/norm2(in))<<std::endl;
    /*
    //    _FineOperator.Op(Min,out);
    //    out = in -out; // out = in - A Min
    out = in;
        MdagMLinearOperator<CoarseOperator,CoarseVector> MdagMOp(_CoarseOperator);
    HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
    Csol=zero;
    _Aggregates.ProjectToSubspace  (Csrc,out);
    HermOp.AdjOp(Csrc,Ctmp);// Normal equations
    CG(MdagMOp  ,Ctmp,Csol);
    _Aggregates.PromoteFromSubspace(Csol,out);
    out = Min + out;;
    */
  }
 #endif
    ////////////////////////////////////////////////////////////////////////
    // ADEF2: [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    // ADEF1: [MP+Q ] in =M [1 - A Q] in + Q in  
    ////////////////////////////////////////////////////////////////////////
-#if 0
+#if 1
-  void operator()(const FineField &in, FineField & out) {
+  void operatorADEF2(const FineField &in, FineField & out) {
    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -136,7 +104,7 @@ public:
    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    MdagMLinearOperator<Matrix,FineField>               fMdagMOp(_Matrix);
+    MdagMLinearOperator<Matrix,FineField>               fMdagMOp(_FineMatrix);
    FineField tmp(in._grid);
    FineField res(in._grid);
@@ -189,8 +157,8 @@ public:
  }
 #endif
  // ADEF1: [MP+Q ] in =M [1 - A Q] in + Q in  
-#if 0
+#if 1
-  void operator()(const FineField &in, FineField & out) {
+  void operatorADEF1(const FineField &in, FineField & out) {
    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -201,7 +169,7 @@ public:
    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    ShiftedMdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix,0.1);
+    ShiftedMdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix,0.1);
    FineField tmp(in._grid);
    FineField res(in._grid);
@@ -234,14 +202,79 @@ public:
  }
 #endif
  void SAP (const FineField & src,FineField & psi){
    Lattice<iScalar<vInteger> > coor(src._grid);
    Lattice<iScalar<vInteger> > subset(src._grid);
    FineField r(src._grid);
    FineField zz(src._grid); zz=zero;
    FineField vec1(src._grid);
    FineField vec2(src._grid);
    const Integer block=params.domainsize;
    subset=zero;
    for(int mu=0;mu<Nd;mu++){
      LatticeCoordinate(coor,mu+1);
      coor = div(coor,block);
      subset = subset+coor;
    }
    subset = mod(subset,(Integer)2);
    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
    Chebyshev<FineField> Cheby  (params.lo,params.hi,params.order,InverseApproximation);
    RealD resid;
    for(int i=0;i<params.steps;i++){
      // Even domain residual
      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
      r= src - vec1 ;
      resid = norm2(r) /norm2(src); 
      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
 // Npoly*outer*2 1/2 vol matmuls.
 // 71 iters => 20*71 = 1400 matmuls.
 // 2*71 = 140 comms.
      // Even domain solve
      r= where(subset==(Integer)0,r,zz);
      _SmootherOperator.AdjOp(r,vec1);
      Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
      psi = psi + vec2;  
      // Odd domain residual
      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
      r= src - vec1 ;
      r= where(subset==(Integer)1,r,zz);
      resid = norm2(r) /norm2(src); 
      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
      // Odd domain solve
      _SmootherOperator.AdjOp(r,vec1);
      Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
      psi = psi + vec2;  
      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
      r= src - vec1 ;
      resid = norm2(r) /norm2(src); 
      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
    }
  };
  void SmootherTest (const FineField & in){
    FineField vec1(in._grid);
    FineField vec2(in._grid);
    RealD lo[3] = { 0.5, 1.0, 2.0};
-    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix);
+    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix);
-    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_Matrix,0.5);
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
    RealD Ni,r;
@@ -250,7 +283,7 @@ public:
    for(int ilo=0;ilo<3;ilo++){
      for(int ord=5;ord<50;ord*=2){
-	_FineOperator.AdjOp(in,vec1);
+	_SmootherOperator.AdjOp(in,vec1);
 	Chebyshev<FineField> Cheby  (lo[ilo],70.0,ord,InverseApproximation);
 	Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
@@ -264,7 +297,7 @@ public:
    }
  }
-  void operator()(const FineField &in, FineField & out) {
+  void operatorCheby(const FineField &in, FineField & out) {
    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -275,18 +308,18 @@ public:
    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix);
+    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix);
-    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_Matrix,0.0);
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
    FineField vec1(in._grid);
    FineField vec2(in._grid);
    //    Chebyshev<FineField> Cheby    (0.5,70.0,30,InverseApproximation);
    //    Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
-    Chebyshev<FineField> Cheby    (2.0,70.0,10,InverseApproximation);
+    Chebyshev<FineField> Cheby    (2.0,70.0,15,InverseApproximation);
-    Chebyshev<FineField> ChebyAccu(2.0,70.0,10,InverseApproximation);
+    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
-    Cheby.JacksonSmooth();
+    //    Cheby.JacksonSmooth();
-    ChebyAccu.JacksonSmooth();
+    //    ChebyAccu.JacksonSmooth();
    _Aggregates.ProjectToSubspace  (Csrc,in);
    _Aggregates.PromoteFromSubspace(Csrc,out);
@@ -305,7 +338,7 @@ public:
    RealD Ni = norm2(in);
-    _FineOperator.AdjOp(in,vec1);// this is the G5 herm bit
+    _SmootherOperator.AdjOp(in,vec1);// this is the G5 herm bit
    ChebyAccu(fMdagMOp,vec1,out);    // solves  MdagM = g5 M g5M
    std::cout<<GridLogMessage << "Smoother norm "<<norm2(out)<<std::endl;
@@ -334,23 +367,89 @@ public:
    std::cout<<GridLogMessage << "Coarse resid "<<std::sqrt(r/Ni)<<std::endl;
    // Reapply smoother
-    _FineOperator.Op(vec1,vec2);  // this is the G5 herm bit
+    _SmootherOperator.Op(vec1,vec2);  // this is the G5 herm bit
    ChebyAccu(fMdagMOp,vec2,vec1);    // solves  MdagM = g5 M g5M
    out =out+vec1;
    _FineOperator.Op(out,vec1);// this is the G5 herm bit
    vec1  = in - vec1;   // tmp  = in - A Min
    r=norm2(vec1);
    std::cout<<GridLogMessage << "Smoother resid "<<std::sqrt(r/Ni)<<std::endl;
  }
  void operatorSAP(const FineField &in, FineField & out) {
    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
    ConjugateGradient<CoarseVector>  CG(1.0e-3,100000);
    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
    FineField vec1(in._grid);
    FineField vec2(in._grid);
    _Aggregates.ProjectToSubspace  (Csrc,in);
    _Aggregates.PromoteFromSubspace(Csrc,out);
    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
    // To make a working smoother for indefinite operator
    // must multiply by "Mdag" (ouch loses all low mode content)
    // and apply to poly approx of (mdagm)^-1.
    // so that we end up with an odd polynomial.
    SAP(in,out);
    // Update with residual for out
    _FineOperator.Op(out,vec1);// this is the G5 herm bit
    vec1  = in - vec1;   // tmp  = in - A Min
    RealD r = norm2(vec1);
    RealD Ni = norm2(in);
    std::cout<<GridLogMessage << "SAP resid "<<std::sqrt(r/Ni)<< " " << r << " " << Ni <<std::endl;
    _Aggregates.ProjectToSubspace  (Csrc,vec1);
    HermOp.AdjOp(Csrc,Ctmp);// Normal equations
    CG(MdagMOp,Ctmp,Csol);
    _Aggregates.PromoteFromSubspace(Csol,vec1); // Ass^{-1} [in - A Min]_s
                                             // Q = Q[in - A Min]  
    out = out+vec1;
    // Three preconditioner smoothing -- hermitian if C3 = C1
    // Recompute error
    _FineOperator.Op(out,vec1);// this is the G5 herm bit
    vec1  = in - vec1;   // tmp  = in - A Min
    r=norm2(vec1);
    std::cout<<GridLogMessage << "Coarse resid "<<std::sqrt(r/Ni)<<std::endl;
    // Reapply smoother
    SAP(vec1,vec2);
    out =out+vec2;
    // Update with residual for out
    _FineOperator.Op(out,vec1);// this is the G5 herm bit
    vec1  = in - vec1;   // tmp  = in - A Min
    r = norm2(vec1);
    Ni = norm2(in);
    std::cout<<GridLogMessage << "SAP resid(post) "<<std::sqrt(r/Ni)<< " " << r << " " << Ni <<std::endl;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  XmlReader RD("params.xml");
  read(RD,"params",params);
  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -385,11 +484,27 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeGaugeField Umu(UGrid); 
  LatticeGaugeField UmuDD(UGrid); 
  LatticeColourMatrix U(UGrid);
  LatticeColourMatrix zz(UGrid);
  NerscField header;
  std::string file("./ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  if ( params.domaindecompose ) { 
    Lattice<iScalar<vInteger> > coor(UGrid);
    zz=zero;
    for(int mu=0;mu<Nd;mu++){
      LatticeCoordinate(coor,mu);
      U = PeekIndex<LorentzIndex>(Umu,mu);
      U = where(mod(coor,params.domainsize)==(Integer)0,zz,U);
      PokeIndex<LorentzIndex>(UmuDD,U,mu);
    }
  } else { 
    UmuDD = Umu;
  }
  //  SU3::ColdConfiguration(RNG4,Umu);
  //  SU3::TepidConfiguration(RNG4,Umu);
  //  SU3::HotConfiguration(RNG4,Umu);
@@ -402,6 +517,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  const int nbasis = 32;
  //  const int nbasis = 4;
@@ -438,6 +554,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Building coarse representation of Indef operator" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOp(Ddwf);
  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOpDD(DdwfDD);
  CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LDOp(*Coarse5d);
  LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);
@@ -467,7 +584,13 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Building deflation preconditioner "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> Precon(Aggregates, LDOp,HermIndefOp,Ddwf);
+  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> Precon  (Aggregates, LDOp,
 											   HermIndefOp,Ddwf,
 											   HermIndefOp,Ddwf);
  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> PreconDD(Aggregates, LDOp,
 											   HermIndefOp,Ddwf,
 											   HermIndefOpDD,DdwfDD);
  TrivialPrecon<LatticeFermion> simple;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -475,9 +598,20 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Precon.SmootherTest(src);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  PreconDD.SmootherTest(src);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  PreconDD.SAP(src,result);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  //  TrivialPrecon<LatticeFermion> simple;
  //  ConjugateGradient<LatticeFermion> fCG(1.0e-8,100000);
  //  fCG(HermDefOp,src,result);
@@ -496,12 +630,22 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Precon.PowerMethod(src);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
  result=zero;
  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
  PGCRDD(HermIndefOp,src,result);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
+  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
-  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  PGCR(HermIndefOp,src,result);
+  //  result=zero;
  //  PGCR(HermIndefOp,src,result);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
@@ -516,6 +660,7 @@ int main (int argc, char ** argv)
  pCG(HermOpEO,src_o,result_o);
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -52,6 +52,7 @@ int main (int argc, char ** argv)
  }
  */
  typedef CartesianStencil<vobj,vobj,SimpleCompressor<vobj> > Stencil;
    for(int dir=0;dir<4;dir++){
      for(int disp=0;disp<Fine._fdimensions[dir];disp++){
@@ -61,7 +62,7 @@ int main (int argc, char ** argv)
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);
-	CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
+	Stencil myStencil(&Fine,npoint,0,directions,displacements);
 	std::vector<int> ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
@@ -142,8 +143,8 @@ int main (int argc, char ** argv)
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);
-	CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements);
+	Stencil EStencil(&rbFine,npoint,Even,directions,displacements);
-	CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements);
+	Stencil OStencil(&rbFine,npoint,Odd,directions,displacements);
 	std::vector<int> ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
--- a/tests/Test_synthetic_lanczos.cc
+++ b/tests/Test_synthetic_lanczos.cc
@@ -8,6 +8,7 @@ using namespace Grid::QCD;
 static int
 FEenableexcept (unsigned int excepts)
 {
 #if 0
  static fenv_t fenv;
  unsigned int new_excepts = excepts & FE_ALL_EXCEPT,
    old_excepts;  // previous masks
@@ -20,6 +21,9 @@ FEenableexcept (unsigned int excepts)
  fenv.__mxcsr   &= ~(new_excepts << 7);
  return ( fesetenv (&fenv) ? -1 : old_excepts );
 #else
  return 0;
 #endif
 }
@@ -35,7 +39,7 @@ public:
    random(pRNG,scale);
-    scale = exp(-real(scale)*6.0);
+    scale = exp(-real(scale)*3.0);
    std::cout << " True matrix \n"<< scale <<std::endl;
  }
@@ -70,7 +74,7 @@ public:
 int main (int argc, char ** argv)
 {
-  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 
+  //  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 
  Grid_init(&argc,&argv);
@@ -88,8 +92,7 @@ int main (int argc, char ** argv)
  RealD mu    = 0.0;
  int order = 11;
  ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
-
+  std::ofstream file("cheby.dat");
  std::ofstream file("pooh.dat");
  Cheby.csv(file);
  HermOpOperatorFunction<LatticeComplex> X;
@@ -114,9 +117,9 @@ int main (int argc, char ** argv)
  }
  {
-    std::vector<RealD>          eval(Nm);
+    //    std::vector<RealD>          eval(Nm);
-    std::vector<LatticeComplex> evec(Nm,grid);
+    //    std::vector<LatticeComplex> evec(Nm,grid);
-    ChebyIRL.calc(eval,evec,src, Nconv);
+    //    ChebyIRL.calc(eval,evec,src, Nconv);
  }
  Grid_finalize();