diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index f94a3e38..f86fd072 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
   
   std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=100;
+  int ncall=10000;
   {
     double t0=usecond();
     for(int i=0;i<ncall;i++){
diff --git a/lib/Config.h.in b/lib/Config.h.in
index ecbacf3a..2cc9106a 100644
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -47,6 +47,9 @@
 /* Define to 1 if you have the <endian.h> header file. */
 #undef HAVE_ENDIAN_H
 
+/* Define to 1 if you have the <execinfo.h> header file. */
+#undef HAVE_EXECINFO_H
+
 /* Support FMA3 (Fused Multiply-Add) instructions */
 #undef HAVE_FMA
 
@@ -134,9 +137,6 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
 
-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
diff --git a/lib/Init.cc b/lib/Init.cc
index e427fe18..ab3b5571 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -17,7 +17,6 @@
 
 #define __X86_64
 
-
 #ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif
diff --git a/lib/PerfCount.h b/lib/PerfCount.h
index 264d571e..c379639d 100644
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -5,22 +5,27 @@
 #include <ctime>
 #include <chrono>
 #include <string.h>
+
 #include <sys/ioctl.h>
+
+#ifdef __linux__
 #include <syscall.h>
 #include <linux/perf_event.h>
-
+#endif
 namespace Grid {
 
 
+#ifdef __linux__ // this syscall wrapper is only compiled on Linux, matching the guarded <linux/perf_event.h> include
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 			    int cpu, int group_fd, unsigned long flags)
 {
-  int ret;
+  int ret=0; // overwritten by the syscall result below; NOTE(review): result is held in an int although the function returns long
 
   ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
 		group_fd, flags);
   return ret;
 }
+#endif // __linux__
 
 
 class PerformanceCounter {
@@ -63,7 +68,6 @@ public:
     
   int PCT;
 
-  struct perf_event_attr pe;
   long long count;
   int fd;
   uint64_t elapsed;
@@ -74,15 +78,19 @@ public:
   }
 
   PerformanceCounter(int _pct) {
+#ifdef __linux__ // on other platforms this constructor body is empty, so fd, count and PCT are not set here
     assert(_pct>=0);
     assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
     fd=-1;
     count=0;
     PCT =_pct;
     Open();
+#endif // __linux__
   }
   void Open(void) 
   {
+#ifdef __linux__
+    struct perf_event_attr pe;
     memset(&pe, 0, sizeof(struct perf_event_attr));
     pe.size = sizeof(struct perf_event_attr);
 
@@ -99,32 +107,48 @@ public:
       fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
       perror("Error is");
     }
+#endif
   }
 
   void Start(void)
   {
+#ifdef __linux__ // Linux: reset and enable the counter if one is open, then record the starting timestamp
     if ( fd!= -1) {
       ioctl(fd, PERF_EVENT_IOC_RESET, 0);
       ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
     }
     begin  =__rdtsc();
+#else // other platforms: no counter is used
+    begin = 0; // keeps begin assigned even though nothing is measured
+#endif // __linux__
   }
 
   void Stop(void) {
     count=0;
+#ifdef __linux__ // Linux: disable the counter if one is open, read its value into count, then compute elapsed
     if ( fd!= -1) {
       ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
       ::read(fd, &count, sizeof(long long));
     }
     elapsed = __rdtsc() - begin;
+#else // other platforms: nothing was measured
+    elapsed = 0; // reported as zero cycles by Report()
+#endif // __linux__
+
   }
   void Report(void) {
+#ifdef __linux__
     printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+#else
+    printf("%llu cycles \n", elapsed );
+#endif
   }
 
   ~PerformanceCounter()
   {
+#ifdef __linux__ // the descriptor is only opened on the Linux path; NOTE(review): close() is called even when fd is still -1
     close(fd);
+#endif // __linux__
   }
 
 };
diff --git a/lib/Threads.h b/lib/Threads.h
index 519a2a5b..64dcf893 100644
--- a/lib/Threads.h
+++ b/lib/Threads.h
@@ -44,7 +44,7 @@ class GridThread {
   };
   static void SetMaxThreads(void) { 
 #ifdef GRID_OMP
-    setenv("KMP_AFFINITY","balanced",1);
+    //    setenv("KMP_AFFINITY","balanced",1);
     _threads = omp_get_max_threads();
     omp_set_num_threads(_threads);
 #else 
diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
index 31ff43a9..0eac0e3e 100644
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -264,6 +264,9 @@ PARALLEL_FOR_LOOP
 
       for(int i=0;i<nbasis;i++){
 	phi=Subspace.subspace[i];
+	
+	std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
 	for(int p=0;p<geom.npoint;p++){ 
 
 	  int dir   = geom.directions[p];
diff --git a/lib/algorithms/approx/Chebyshev.h b/lib/algorithms/approx/Chebyshev.h
index f00170cf..1952799e 100644
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -166,7 +166,6 @@ namespace Grid {
       Field *Tn  = &T1;
       Field *Tnp = &T2;
 
-      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
       // Tn=T1 = (xscale M + mscale)in
       RealD xscale = 2.0/(hi-lo);
       RealD mscale = -(hi+lo)/(hi-lo);
diff --git a/lib/algorithms/iterative/DenseMatrix.h b/lib/algorithms/iterative/DenseMatrix.h
index 2423677d..e8d1f9ab 100644
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -25,6 +25,9 @@ template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
   assert(N==M);
 }
 
+template<class T> void Resize(DenseVector<T > & mat, int N) { // vector overload: despite the parameter name, mat is a DenseVector
+  mat.resize(N); // set the vector length to N
+}
 template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
   mat.resize(N);
   for(int i=0;i<N;i++){
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 97c10b30..a07f7f30 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -1,4 +1,3 @@
-#if 0
 #ifndef GRID_IRL_H
 #define GRID_IRL_H
 
@@ -18,8 +17,9 @@ template<class Field>
     const RealD small = 1.0e-16;
 public:       
     int lock;
-    int converged;
+    int get;
     int Niter;
+    int converged;
 
     int Nk;      // Number of converged sought
     int Np;      // Np -- Number of spare vecs in kryloc space
@@ -59,6 +59,7 @@ public:
     // Sanity checked this routine (step) against Saad.
     /////////////////////////
     void RitzMatrix(DenseVector<Field>& evec,int k){
+
       if(1) return;
 
       GridBase *grid = evec[0]._grid;
@@ -451,8 +452,9 @@ until convergence
 	std::cout << " -- Nconv       = "<< Nconv  << "\n";
       }
 
-
+    /////////////////////////////////////////////////
     // Adapted from Rudy's lanczos factor routine
+    /////////////////////////////////////////////////
     int Lanczos_Factor(int start, int end,  int cont,
 		       DenseVector<Field> & bq, 
 		       Field &bf,
@@ -546,10 +548,16 @@ until convergence
 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
 
 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
-#if 0
 	int re = 0;
+	// FIXME undefined params; how set in Rudy's code
+	int ref =0;
+	Real rho = 1.0e-8;
+
 	while( re == ref || (sqbt < rho * bck && re < 5) ){
 
+	  Field tmp2(grid);
+	  Field tmp1(grid);
+
 	  //bex = V^dag bf
 	  DenseVector<ComplexD> bex(j+1);
 	  for(int k=0;k<j+1;k++){
@@ -566,14 +574,14 @@ until convergence
 
 	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
 	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + bex[j];	      sqbt = sqrt(real(btc));	      
+	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
+	  // FIXME is alpha real in RUDY's code?
 	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
 	  bck = sqrt( nmbex );
 	  re++;
 	}
 	std::cout << "Iteratively refined orthogonality, changes alpha\n";
 	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
-#endif
 	H[j][j]=alpha;
       }
 
@@ -641,7 +649,7 @@ until convergence
       int M=Nm;
 
       DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm,Nm);
+      Resize(evals,Nm);
       Resize(evecs,Nm);
 
       int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
@@ -702,7 +710,6 @@ until convergence
 	RealD beta;
 
 	Householder_vector<RealD>(ck, 0, 2, v, beta);
-
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
 	///Accumulate eigenvector
@@ -758,11 +765,11 @@ until convergence
       RealD resid_nrm=  norm2(bf);
 
       if(!lock) converged = 0;
-
+#if 0
       for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){
 
 	RealD diff = 0;
-	diff = abs(tevecs[i][Nm - 1 - lock_num]) * resid_nrm;
+	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
 
 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
 
@@ -785,53 +792,29 @@ until convergence
 	  break;
 	}
       }
+#endif
       std::cout << "Got " << converged << " so far " <<std::endl;	
     }
-#if 0
-    ///Check
-    void Check(void) {
 
-      DenseVector<RealD> goodval(get);
+    ///Check
+    void Check(DenseVector<RealD> &evals,
+	       DenseVector<DenseVector<RealD> > &evecs) {
+
+      DenseVector<RealD> goodval(this->get);
+
       EigenSort(evals,evecs);
 
       int NM = Nm;
-      int Nget = this->get;
-      S **V;
-      V = new S* [NM];
 
-      RealD *QZ;
-      QZ = new RealD [NM*NM];
+      DenseVector< DenseVector<RealD> > V; Size(V,NM);
+      DenseVector<RealD> QZ(NM*NM);
+
       for(int i = 0; i < NM; i++){
 	for(int j = 0; j < NM; j++){
-
-	  QZ[i*NM+j] = this->evecs[i][j];
-      
-          int f_size_cb = 24*dop.cbLs*dop.node_cbvol;
-
-	  for(int cb = this->prec; cb < 2; cb++){
-	    for(int i = 0; i < NM; i++){
-	      V[i] = (S*)(this->bq[i][cb]);
-
-	      const int m0 = 4 * 4; // this is new code
-	      assert(m0 % 16 == 0); // see the reason in VtimesQ.C
-
-	      const int row_per_thread = f_size_cb / (bfmarg::threads);
-	      {
-
-		{
-		  DenseVector<RealD> vrow_tmp0(m0*NM);
-		  DenseVector<RealD> vrow_tmp1(m0*NM);
-		  RealD *row_tmp0 = vrow_tmp0.data();
-		  RealD *row_tmp1 = vrow_tmp1.data();
-		  VtimesQ(QZ, NM, V, row_tmp0, row_tmp1, id * row_per_thread, m0, (id + 1) * row_per_thread);
-		}
-	      }
-	    }
-	  }
+	  // evecs[i][j];
 	}
       }
     }
-#endif
 
 
 /**
@@ -1020,4 +1003,4 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx
 
 }
 #endif
-#endif
+
diff --git a/lib/lattice/Lattice_unary.h b/lib/lattice/Lattice_unary.h
index 6d72ef31..c5698751 100644
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -24,6 +24,17 @@ PARALLEL_FOR_LOOP
     return ret;
   }
 
+  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){ // returns a new lattice holding div(site,y) for every outer site of rhs
+    Lattice<obj> ret(rhs._grid); // result lives on the same grid as the input
+    ret.checkerboard = rhs.checkerboard; // result carries the input's checkerboard tag
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=div(rhs._odata[ss],y); // delegates to the site-object overload of div
+    }
+    return ret;
+  }
+
   template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
     Lattice<obj> ret(rhs._grid);
     ret.checkerboard = rhs.checkerboard;
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 9b336374..410aa629 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -266,11 +266,8 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
     if( this->HandOptDslash ) {
 #pragma omp parallel for schedule(static)
       for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
 	for(int s=0;s<Ls;s++){
-	  int sU=ss;
-	  if (    LebesgueOrder::UseLebesgueOrder ) {
-	    sU=lo.Reorder(ss);
-	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
@@ -323,52 +320,42 @@ PARALLEL_FOR_LOOP
       //      Counter.Report();
       //      }
     } else if( this->HandOptDslash ) {
+      /*
 
-#pragma omp parallel for 
+#pragma omp parallel for schedule(static)
       for(int t=0;t<threads;t++){
 
 	int hyperthread = t%HT;
 	int core        = t/HT;
 
-        int sswork, swork,soff, sU,sF;
-
-	sswork = (nwork + cores-1)/cores;
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 
 	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+core*sswork; // max locality within an L2 slice
-	  if ( LebesgueOrder::UseLebesgueOrder ) {
-	    sU = lo.Reorder(sU);
+	  sU=ss+ ssoff;
+	  for(int s=soff;s<soff+swork;s++){
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	  }
-	  if ( sU < nwork ) {
-	    for(int s=soff;s<soff+swork;s++){
-	      sF = s+Ls*sU;
-	      Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
-	    }
-	  }
-	}
-      }
-
-      /*
-#pragma omp parallel for schedule(static)
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  int sU=ss;
-	  if (    LebesgueOrder::UseLebesgueOrder ) {
-	    sU=lo.Reorder(ss);
-	  }
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
       }
       */
 
+#pragma omp parallel for schedule(static)
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
+	}
+      }
     } else { 
 PARALLEL_FOR_LOOP
       for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
 	for(int s=0;s<Ls;s++){
-	  //	  int sU=lo.Reorder(ss);
-	  int sU=ss;
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index a4b63085..988bc6f8 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -29,11 +29,11 @@ namespace Grid {
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 #if defined(AVX512) || defined(IMCI)
-     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 #else
-     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
        DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
diff --git a/lib/serialisation/BinaryIO.cc b/lib/serialisation/BinaryIO.cc
index eb3f9833..f546ddb4 100644
--- a/lib/serialisation/BinaryIO.cc
+++ b/lib/serialisation/BinaryIO.cc
@@ -1,15 +1,14 @@
 #include <Grid.h>
 
-using namespace Grid;
-using namespace std;
 
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-BinaryWriter::BinaryWriter(const string &fileName)
-: file_(fileName, ios::binary|ios::out)
+BinaryWriter::BinaryWriter(const std::string &fileName)
+: file_(fileName, std::ios::binary|std::ios::out)
 {}
 
 template <>
-void BinaryWriter::writeDefault(const string &s, const string &output)
+void BinaryWriter::writeDefault(const std::string &s, const std::string &output)
 {
   uint64_t sz = output.size();
   
@@ -21,12 +20,12 @@ void BinaryWriter::writeDefault(const string &s, const string &output)
 }
 
 // Reader implementation ///////////////////////////////////////////////////////
-BinaryReader::BinaryReader(const string &fileName)
-: file_(fileName, ios::binary|ios::in)
+BinaryReader::BinaryReader(const std::string &fileName)
+: file_(fileName, std::ios::binary|std::ios::in)
 {}
 
 template <>
-void BinaryReader::readDefault(const string &s, string &output)
+void BinaryReader::readDefault(const std::string &s, std::string &output) // reads sz bytes from file_ into output; presumably sz is read in the line elided between hunks
 {
   uint64_t sz;
   
@@ -34,3 +33,4 @@ void BinaryReader::readDefault(const string &s, string &output)
-  output.reserve(sz);
+  output.resize(sz); // resize, not reserve: reserve leaves size() unchanged, so the bytes read below would not be part of the string
   file_.read((char *)output.data(), sz);
 }
+}
diff --git a/lib/serialisation/TextIO.cc b/lib/serialisation/TextIO.cc
index 9d88b26a..8b9c4f01 100644
--- a/lib/serialisation/TextIO.cc
+++ b/lib/serialisation/TextIO.cc
@@ -1,14 +1,12 @@
 #include <Grid.h>
 
-using namespace Grid;
-using namespace std;
-
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-TextWriter::TextWriter(const string &fileName)
-: file_(fileName, ios::out)
+TextWriter::TextWriter(const std::string &fileName)
+: file_(fileName, std::ios::out)
 {}
 
-void TextWriter::push(const string &s)
+void TextWriter::push(const std::string &s)
 {
   level_++;
 };
@@ -27,11 +25,11 @@ void TextWriter::indent(void)
 };
 
 // Reader implementation ///////////////////////////////////////////////////////
-TextReader::TextReader(const string &fileName)
-: file_(fileName, ios::in)
+TextReader::TextReader(const std::string &fileName)
+: file_(fileName, std::ios::in)
 {}
 
-void TextReader::push(const string &s)
+void TextReader::push(const std::string &s)
 {
   level_++;
 };
@@ -50,17 +48,18 @@ void TextReader::checkIndent(void)
     file_.get(c);
     if (c != '\t')
     {
-      cerr << "mismatch on tab " << c << " level " << level_;
-      cerr << " i "<< i <<endl;
-      abort();
+      std::cerr << "mismatch on tab " << c << " level " << level_;
+      std::cerr << " i "<< i <<std::endl;
+      std::abort();
     }
   }
 }
 
 template <>
-void TextReader::readDefault(const string &s, string &output)
+void TextReader::readDefault(const std::string &s, std::string &output)
 {
   checkIndent();
   output.clear();
   getline(file_, output);
 }
+}
diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc
index 8dd93de7..b5c6e5bd 100644
--- a/lib/serialisation/XmlIO.cc
+++ b/lib/serialisation/XmlIO.cc
@@ -1,10 +1,8 @@
 #include <Grid.h>
 
-using namespace Grid;
-using namespace std;
-
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const string &fileName)
+XmlWriter::XmlWriter(const std::string &fileName)
 : fileName_(fileName)
 {
   node_ = doc_.append_child();
@@ -16,7 +14,7 @@ XmlWriter::~XmlWriter(void)
   doc_.save_file(fileName_.c_str(), "  ");
 }
 
-void XmlWriter::push(const string &s)
+void XmlWriter::push(const std::string &s)
 {
   node_ = node_.append_child(s.c_str());
 }
@@ -27,22 +25,22 @@ void XmlWriter::pop(void)
 }
 
 // Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const string &fileName)
+XmlReader::XmlReader(const std::string &fileName) // loads fileName into doc_ and positions node_ on the "grid" child
 : fileName_(fileName)
 {
   pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
   
   if ( !result )
   {
-    cerr << "XML error description: " << result.description() << "\n";
-    cerr << "XML error offset     : " << result.offset        << "\n";
-    abort();
+    std::cerr << "XML error description: " << result.description() << "\n"; // report why parsing failed
+    std::cerr << "XML error offset     : " << result.offset        << "\n"; // and the offset reported by the parser
+    std::abort(); // a file that fails to parse is treated as fatal
   }
   
   node_ = doc_.child("grid");
 }
 
-void XmlReader::push(const string &s)
+void XmlReader::push(const std::string &s)
 {
   node_ = node_.child(s.c_str());
 }
@@ -53,7 +51,8 @@ void XmlReader::pop(void)
 }
 
 template <>
-void XmlReader::readDefault(const string &s, string &output)
+void XmlReader::readDefault(const std::string &s, std::string &output)
 {
   output = node_.child(s.c_str()).first_child().value();
 }
+}
diff --git a/lib/serialisation/XmlIO.h b/lib/serialisation/XmlIO.h
index ba8af167..45fc6fac 100644
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -96,6 +96,7 @@ namespace Grid
       node_.child("elem").set_name("elem-done");
       i++;
     }
+    //    assert( is.tellg()==-1);
     pop();
   }
   
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index e4942937..540a54f5 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,6 +67,14 @@ namespace Grid {
     }
   };
 
+  template<class scalar> struct DivIntFunctor { // unary functor: converts its argument to Integer and divides by a fixed Integer y
+    Integer y; // divisor captured at construction
+  DivIntFunctor(Integer _y) : y(_y) {};
+    scalar operator()(const scalar &a)  const {
+      return Integer(a)/y; // NOTE(review): no guard against y == 0
+    }
+  };
+
   template<class scalar> struct RealFunctor {
     scalar operator()(const scalar &a)  const {
       return real(a);
@@ -131,6 +139,10 @@ namespace Grid {
   inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
     return SimdApply(ModIntFunctor<S>(y),r);
   }
+  template < class S, class V > // SIMD overload of div by an Integer
+  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) {
+    return SimdApply(DivIntFunctor<S>(y),r); // applies DivIntFunctor to r via SimdApply, as mod() does with ModIntFunctor
+  }
   ////////////////////////////////////////////////////////////////////////////
   // Allows us to assign into **conformable** real vectors from complex
   ////////////////////////////////////////////////////////////////////////////
diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h
index 045097a3..d2c3fae4 100644
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -111,7 +111,7 @@ template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> type
   return ret;
 }
 
-
+BINARY_RSCALAR(div,Integer);
 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);
 
diff --git a/scripts/configure-commands b/scripts/configure-commands
index 07506d27..a3599d1f 100755
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -59,7 +59,7 @@ clang-avx2)
 CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;
 clang-avx-openmp)
-CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
+CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;
 clang-xc30)
 CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS=""  LIBS="-lgmp -lmpfr" --enable-comms=none
diff --git a/tests/Make.inc b/tests/Make.inc
index ae1faca3..e0a17529 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
+bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_synthetic_lanczos
 
 
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
diff --git a/tests/Test_cheby.cc b/tests/Test_cheby.cc
index 08897c90..81b08816 100644
--- a/tests/Test_cheby.cc
+++ b/tests/Test_cheby.cc
@@ -57,5 +57,21 @@ int main (int argc, char ** argv)
     ChebyStep.csv(of);
   }
 
+  lo=-8;
+  hi=8;
+  Chebyshev<LatticeFermion> ChebyIndefInv(lo,hi,40,InverseApproximation);
+  {
+    std::ofstream of("chebyindefinv");
+    ChebyIndefInv.csv(of);
+  }
+
+  lo=0;
+  hi=64;
+  Chebyshev<LatticeFermion> ChebyNE(lo,hi,40,InverseApproximation);
+  {
+    std::ofstream of("chebyNE");
+    ChebyNE.csv(of);
+  }
+
   Grid_finalize();
 }
diff --git a/tests/Test_dwf_hdcr.cc b/tests/Test_dwf_hdcr.cc
index 94b8481a..006d0e5e 100644
--- a/tests/Test_dwf_hdcr.cc
+++ b/tests/Test_dwf_hdcr.cc
@@ -6,6 +6,22 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
+class myclass: Serializable { // run-time parameters for this test, declared through GRID_DECL_CLASS_MEMBERS
+public:
+
+  GRID_DECL_CLASS_MEMBERS(myclass,
+			  int, domaindecompose, // non-zero selects operatorSAP, zero selects operatorCheby
+			  int, domainsize,      // divisor applied to site coordinates in SAP()
+			  int, order,           // order passed to the Chebyshev object in SAP()
+			  double, lo,           // lower bound passed to the Chebyshev object in SAP()
+			  double, hi,           // upper bound passed to the Chebyshev object in SAP()
+			  int, steps);          // number of outer iterations of the SAP() loop
+
+  myclass(){};
+
+};
+myclass params; // file-scope instance read by the preconditioner below
+
 RealD InverseApproximation(RealD x){
   return 1.0/x;
 }
@@ -26,15 +42,21 @@ public:
 
   Aggregates     & _Aggregates;
   CoarseOperator & _CoarseOperator;
-  Matrix         & _Matrix;
+  Matrix         & _FineMatrix;
   FineOperator   & _FineOperator;
+  Matrix         & _SmootherMatrix;
+  FineOperator   & _SmootherOperator;
 
   // Constructor
-  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, FineOperator &Fine,Matrix &FineMatrix) 
+  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
+			  FineOperator &Fine,Matrix &FineMatrix,          // fine-grid operator and the matrix it wraps
+			  FineOperator &Smooth,Matrix &SmootherMatrix)    // operator and matrix used by the smoother paths
     : _Aggregates(Agg),
       _CoarseOperator(Coarse),
-      _FineOperator(Fine),
-      _Matrix(FineMatrix)
+      _FineMatrix(FineMatrix),           // initialisers follow the member declaration order (avoids -Wreorder)
+      _FineOperator(Fine),
+      _SmootherMatrix(SmootherMatrix),
+      _SmootherOperator(Smooth)
   {
   }
 
@@ -43,7 +65,7 @@ public:
     FineField p1(in._grid);
     FineField p2(in._grid);
 
-    MdagMLinearOperator<Matrix,FineField>   fMdagMOp(_Matrix);
+    MdagMLinearOperator<Matrix,FineField>   fMdagMOp(_FineMatrix);
 
     p1=in;
     RealD absp2;
@@ -58,74 +80,20 @@ public:
     }
   }
 
-#if 0
   void operator()(const FineField &in, FineField & out) {
-
-    FineField Min(in._grid);
-    FineField tmp(in._grid);
-
-    CoarseVector Csrc(_CoarseOperator.Grid());
-    CoarseVector Ctmp(_CoarseOperator.Grid());
-    CoarseVector Csol(_CoarseOperator.Grid());
-
-    // Monitor completeness of low mode space
-    _Aggregates.ProjectToSubspace  (Csrc,in);
-    _Aggregates.PromoteFromSubspace(Csrc,out);
-    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
-
-    // Build some solvers
-    ConjugateGradient<FineField>    fCG(1.0e-3,1000);
-    ConjugateGradient<CoarseVector>  CG(1.0e-8,100000);
-
-    ////////////////////////////////////////////////////////////////////////
-    // ADEF2: [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
-    ////////////////////////////////////////////////////////////////////////
-
-    // Smoothing step, followed by coarse grid correction
-    MdagMLinearOperator<Matrix,FineField> MdagMOp(_Matrix);
-
-    Min=in;
-    std::cout<<GridLogMessage<< " Preconditioner in  " << norm2(in)<<std::endl; 
-    _FineOperator.AdjOp(Min,tmp);
-    std::cout<<GridLogMessage<< " Preconditioner tmp  " << norm2(in)<<std::endl; 
-
-    fCG(MdagMOp,tmp,out);
-
-    _FineOperator.Op(out,tmp);
-
-    std::cout<<GridLogMessage<< " Preconditioner in  " << norm2(in)<<std::endl; 
-    std::cout<<GridLogMessage<< " Preconditioner out " << norm2(out)<<std::endl; 
-    std::cout<<GridLogMessage<< " Preconditioner Aout" << norm2(tmp)<<std::endl; 
-
-    tmp = tmp - in;
-    
-    std::cout<<GridLogMessage<<"preconditioner thinks residual is "<<std::sqrt(norm2(tmp)/norm2(in))<<std::endl;
-
-    /*
-    //    _FineOperator.Op(Min,out);
-    //    out = in -out; // out = in - A Min
-    out = in;
-
-        MdagMLinearOperator<CoarseOperator,CoarseVector> MdagMOp(_CoarseOperator);
-    HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
-    Csol=zero;
-    _Aggregates.ProjectToSubspace  (Csrc,out);
-    HermOp.AdjOp(Csrc,Ctmp);// Normal equations
-    CG(MdagMOp  ,Ctmp,Csol);
-    _Aggregates.PromoteFromSubspace(Csol,out);
-
-    out = Min + out;;
-    */
-
+    if ( params.domaindecompose ) {
+      operatorSAP(in,out);
+    } else { 
+      operatorCheby(in,out);
+    }
   }
-#endif
 
     ////////////////////////////////////////////////////////////////////////
     // ADEF2: [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
     // ADEF1: [MP+Q ] in =M [1 - A Q] in + Q in  
     ////////////////////////////////////////////////////////////////////////
-#if 0
-  void operator()(const FineField &in, FineField & out) {
+#if 1
+  void operatorADEF2(const FineField &in, FineField & out) {
 
     CoarseVector Csrc(_CoarseOperator.Grid());
     CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -136,7 +104,7 @@ public:
 
     HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
     MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    MdagMLinearOperator<Matrix,FineField>               fMdagMOp(_Matrix);
+    MdagMLinearOperator<Matrix,FineField>               fMdagMOp(_FineMatrix);
 
     FineField tmp(in._grid);
     FineField res(in._grid);
@@ -189,8 +157,8 @@ public:
   }
 #endif
   // ADEF1: [MP+Q ] in =M [1 - A Q] in + Q in  
-#if 0
-  void operator()(const FineField &in, FineField & out) {
+#if 1
+  void operatorADEF1(const FineField &in, FineField & out) {
 
     CoarseVector Csrc(_CoarseOperator.Grid());
     CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -201,7 +169,7 @@ public:
 
     HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
     MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    ShiftedMdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix,0.1);
+    ShiftedMdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix,0.1);
 
     FineField tmp(in._grid);
     FineField res(in._grid);
@@ -234,14 +202,79 @@ public:
   }
 #endif
 
+  void SAP (const FineField & src,FineField & psi){ // alternates Chebyshev solves on two site subsets, accumulating into psi (psi is not reset here)
+
+    Lattice<iScalar<vInteger> > coor(src._grid);
+    Lattice<iScalar<vInteger> > subset(src._grid);
+    
+    FineField r(src._grid);
+    FineField zz(src._grid); zz=zero;
+    FineField vec1(src._grid);
+    FineField vec2(src._grid);
+
+    const Integer block=params.domainsize; // coordinates are divided by this to obtain a block index
+
+    subset=zero;
+    for(int mu=0;mu<Nd;mu++){
+      LatticeCoordinate(coor,mu+1); // NOTE(review): direction index is mu+1, presumably skipping a leading fifth dimension; confirm
+      coor = div(coor,block);
+      subset = subset+coor; // sum of the block indices over the Nd directions
+    }
+    subset = mod(subset,(Integer)2); // reduce mod 2: the value 0 or 1 labels the two subsets used below
+    
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
+    Chebyshev<FineField> Cheby  (params.lo,params.hi,params.order,InverseApproximation);
+
+    RealD resid;
+    for(int i=0;i<params.steps;i++){ // params.steps outer sweeps, each doing a subset==0 solve then a subset==1 solve
+      
+      // Even domain residual
+      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
+      r= src - vec1 ;
+      resid = norm2(r) /norm2(src); // ratio of squared norms, taken over the whole (unmasked) residual
+      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
+
+
+// Npoly*outer*2 1/2 vol matmuls.
+// 71 iters => 20*71 = 1400 matmuls.
+// 2*71 = 140 comms.
+
+      // Even domain solve
+      r= where(subset==(Integer)0,r,zz); // keep the residual where subset==0, use zz (zero) elsewhere
+      _SmootherOperator.AdjOp(r,vec1);
+      Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
+      psi = psi + vec2;  
+
+      // Odd domain residual
+      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
+      r= src - vec1 ;
+      r= where(subset==(Integer)1,r,zz); // keep the residual where subset==1, use zz (zero) elsewhere
+
+      resid = norm2(r) /norm2(src); // unlike the even step, this is measured after masking to subset==1
+      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
+      
+      // Odd domain solve
+      _SmootherOperator.AdjOp(r,vec1);
+      Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
+      psi = psi + vec2;  
+
+      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
+      r= src - vec1 ;
+      resid = norm2(r) /norm2(src); // full residual after both solves of this sweep
+      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
+
+    }
+
+  };
+
   void SmootherTest (const FineField & in){
     
     FineField vec1(in._grid);
     FineField vec2(in._grid);
     RealD lo[3] = { 0.5, 1.0, 2.0};
 
-    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix);
-    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_Matrix,0.5);
+    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix);
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
 
     RealD Ni,r;
 
@@ -250,7 +283,7 @@ public:
     for(int ilo=0;ilo<3;ilo++){
       for(int ord=5;ord<50;ord*=2){
 
-	_FineOperator.AdjOp(in,vec1);
+	_SmootherOperator.AdjOp(in,vec1);
 
 	Chebyshev<FineField> Cheby  (lo[ilo],70.0,ord,InverseApproximation);
 	Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
@@ -264,7 +297,7 @@ public:
     }
   }
 
-  void operator()(const FineField &in, FineField & out) {
+  void operatorCheby(const FineField &in, FineField & out) {
 
     CoarseVector Csrc(_CoarseOperator.Grid());
     CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -275,18 +308,18 @@ public:
 
     HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
     MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix);
-    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_Matrix,0.0);
+    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix);
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
 
     FineField vec1(in._grid);
     FineField vec2(in._grid);
 
     //    Chebyshev<FineField> Cheby    (0.5,70.0,30,InverseApproximation);
     //    Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
-    Chebyshev<FineField> Cheby    (2.0,70.0,10,InverseApproximation);
-    Chebyshev<FineField> ChebyAccu(2.0,70.0,10,InverseApproximation);
-    Cheby.JacksonSmooth();
-    ChebyAccu.JacksonSmooth();
+    Chebyshev<FineField> Cheby    (2.0,70.0,15,InverseApproximation);
+    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
+    //    Cheby.JacksonSmooth();
+    //    ChebyAccu.JacksonSmooth();
 
     _Aggregates.ProjectToSubspace  (Csrc,in);
     _Aggregates.PromoteFromSubspace(Csrc,out);
@@ -305,7 +338,7 @@ public:
 
     RealD Ni = norm2(in);
 
-    _FineOperator.AdjOp(in,vec1);// this is the G5 herm bit
+    _SmootherOperator.AdjOp(in,vec1);// this is the G5 herm bit
     ChebyAccu(fMdagMOp,vec1,out);    // solves  MdagM = g5 M g5M
 
     std::cout<<GridLogMessage << "Smoother norm "<<norm2(out)<<std::endl;
@@ -334,23 +367,89 @@ public:
     std::cout<<GridLogMessage << "Coarse resid "<<std::sqrt(r/Ni)<<std::endl;
 
     // Reapply smoother
-    _FineOperator.Op(vec1,vec2);  // this is the G5 herm bit
+    _SmootherOperator.Op(vec1,vec2);  // this is the G5 herm bit
     ChebyAccu(fMdagMOp,vec2,vec1);    // solves  MdagM = g5 M g5M
 
     out =out+vec1;
-    _FineOperator.Op(out,vec1);// this is the G5 herm bit
     vec1  = in - vec1;   // tmp  = in - A Min
     r=norm2(vec1);
     std::cout<<GridLogMessage << "Smoother resid "<<std::sqrt(r/Ni)<<std::endl;
 
   }
 
+  void operatorSAP(const FineField &in, FineField & out) {
+    // Preconditioner step: SAP smoothing, coarse-space CG correction, SAP smoothing again; result is left in out.
+    CoarseVector Csrc(_CoarseOperator.Grid());
+    CoarseVector Ctmp(_CoarseOperator.Grid());
+    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
+
+    ConjugateGradient<CoarseVector>  CG(1.0e-3,100000); // presumably (tolerance, max iterations) -- TODO confirm
+
+    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
+    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
+
+    FineField vec1(in._grid);
+    FineField vec2(in._grid);
+    // Diagnostic: norm of in after projecting onto and promoting from the aggregate subspace, relative to norm of in.
+    _Aggregates.ProjectToSubspace  (Csrc,in);
+    _Aggregates.PromoteFromSubspace(Csrc,out);
+    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
+    // Pre-smoothing with SAP: source in, in/out vector out.
+    // out still holds the subspace projection of in computed just above; SAP's
+    // loop applies _FineOperator to that argument before updating it, so it is
+    // presumably used as the starting guess -- confirm SAP does not reset it.
+    // NOTE(review): the previous comment here described the Chebyshev smoother
+    // (Mdag times a polynomial approximation of (MdagM)^-1), not SAP -- confirm.
+    SAP(in,out);
+
+    // Residual after pre-smoothing: vec1 = in - A out
+    _FineOperator.Op(out,vec1);// this is the G5 herm bit
+    vec1  = in - vec1;   // tmp  = in - A Min
+
+    RealD r = norm2(vec1);
+    RealD Ni = norm2(in);
+    std::cout<<GridLogMessage << "SAP resid "<<std::sqrt(r/Ni)<< " " << r << " " << Ni <<std::endl;
+    // Coarse correction: project the residual, solve the normal equations with CG, promote and add to out.
+    _Aggregates.ProjectToSubspace  (Csrc,vec1);
+    HermOp.AdjOp(Csrc,Ctmp);// Normal equations
+    CG(MdagMOp,Ctmp,Csol);
+    _Aggregates.PromoteFromSubspace(Csol,vec1); // Ass^{-1} [in - A Min]_s
+                                             // Q = Q[in - A Min]
+    out = out+vec1;
+
+    // Three preconditioner smoothing -- hermitian if C3 = C1
+    // Recompute error
+    _FineOperator.Op(out,vec1);// this is the G5 herm bit
+    vec1  = in - vec1;   // tmp  = in - A Min
+    r=norm2(vec1);
+
+    std::cout<<GridLogMessage << "Coarse resid "<<std::sqrt(r/Ni)<<std::endl;
+
+    // Reapply smoother: SAP gets the current residual (vec1) as source and vec2 as its in/out vector.
+    // NOTE(review): vec2 has not been assigned before this call, while SAP's loop applies _FineOperator
+    // to its second argument before writing it -- confirm SAP resets that argument, otherwise zero vec2 first.
+    SAP(vec1,vec2);
+    out =out+vec2;
+    // Final residual of out, printed for diagnostics only
+    _FineOperator.Op(out,vec1);// this is the G5 herm bit
+    vec1  = in - vec1;   // tmp  = in - A Min
+
+    r = norm2(vec1);
+    Ni = norm2(in); // in is a const reference, so this repeats the Ni computed above
+    std::cout<<GridLogMessage << "SAP resid(post) "<<std::sqrt(r/Ni)<< " " << r << " " << Ni <<std::endl;
+
+  }
+
 };
 
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
+  XmlReader RD("params.xml");
+  read(RD,"params",params);
+  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
+
   const int Ls=8;
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -385,11 +484,27 @@ int main (int argc, char ** argv)
   LatticeFermion    tmp(FGrid);
   LatticeFermion    err(FGrid);
   LatticeGaugeField Umu(UGrid); 
+  LatticeGaugeField UmuDD(UGrid); 
+  LatticeColourMatrix U(UGrid);
+  LatticeColourMatrix zz(UGrid);
 
   NerscField header;
   std::string file("./ckpoint_lat.4000");
   NerscIO::readConfiguration(Umu,header,file);
 
+  // Fill UmuDD, the gauge field DdwfDD is built from: a masked copy of Umu when params.domaindecompose is set, a plain copy otherwise.
+  if ( params.domaindecompose ) { 
+    Lattice<iScalar<vInteger> > coor(UGrid);
+    zz=zero;
+    for(int mu=0;mu<Nd;mu++){
+      LatticeCoordinate(coor,mu); // presumably fills coor with each site's coordinate along mu -- confirm
+      U = PeekIndex<LorentzIndex>(Umu,mu);
+      U = where(mod(coor,params.domainsize)==(Integer)0,zz,U); // presumably: zz where coor is a multiple of domainsize, U elsewhere -- confirm
+      PokeIndex<LorentzIndex>(UmuDD,U,mu);
+    }
+  } else { 
+    UmuDD = Umu;
+  }
   //  SU3::ColdConfiguration(RNG4,Umu);
   //  SU3::TepidConfiguration(RNG4,Umu);
   //  SU3::HotConfiguration(RNG4,Umu);
@@ -402,6 +517,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
   const int nbasis = 32;
   //  const int nbasis = 4;
@@ -438,6 +554,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Building coarse representation of Indef operator" <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOp(Ddwf);
+  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOpDD(DdwfDD);
   CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LDOp(*Coarse5d);
   LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);
 
@@ -467,7 +584,13 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Building deflation preconditioner "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 
-  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> Precon(Aggregates, LDOp,HermIndefOp,Ddwf);
+  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> Precon  (Aggregates, LDOp,
+											   HermIndefOp,Ddwf,
+											   HermIndefOp,Ddwf);
+
+  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> PreconDD(Aggregates, LDOp,
+											   HermIndefOp,Ddwf,
+											   HermIndefOpDD,DdwfDD);
   TrivialPrecon<LatticeFermion> simple;
 
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -475,9 +598,20 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   Precon.SmootherTest(src);
 
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  PreconDD.SmootherTest(src);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  PreconDD.SAP(src,result);
+
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+
   //  TrivialPrecon<LatticeFermion> simple;
   //  ConjugateGradient<LatticeFermion> fCG(1.0e-8,100000);
   //  fCG(HermDefOp,src,result);
@@ -496,12 +630,22 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   Precon.PowerMethod(src);
 
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
+  result=zero;
+  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  PGCRDD(HermIndefOp,src,result);
+
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
-  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  PGCR(HermIndefOp,src,result);
+  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
+  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  //  result=zero;
+  //  PGCR(HermIndefOp,src,result);
 
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
@@ -516,6 +660,7 @@ int main (int argc, char ** argv)
 
   pCG(HermOpEO,src_o,result_o);
 
+
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Done "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc
index 5949fa63..26e52a41 100644
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -52,6 +52,7 @@ int main (int argc, char ** argv)
   }
   */
 
+  typedef CartesianStencil<vobj,vobj,SimpleCompressor<vobj> > Stencil;
     for(int dir=0;dir<4;dir++){
       for(int disp=0;disp<Fine._fdimensions[dir];disp++){
 
@@ -61,7 +62,7 @@ int main (int argc, char ** argv)
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);
 
-	CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
+	Stencil myStencil(&Fine,npoint,0,directions,displacements);
 
 	std::vector<int> ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
@@ -142,8 +143,8 @@ int main (int argc, char ** argv)
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);
 
-	CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements);
-	CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements);
+	Stencil EStencil(&rbFine,npoint,Even,directions,displacements);
+	Stencil OStencil(&rbFine,npoint,Odd,directions,displacements);
 
 	std::vector<int> ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
diff --git a/tests/Test_synthetic_lanczos.cc b/tests/Test_synthetic_lanczos.cc
index c15ca3a7..d8bbc9a4 100644
--- a/tests/Test_synthetic_lanczos.cc
+++ b/tests/Test_synthetic_lanczos.cc
@@ -8,6 +8,7 @@ using namespace Grid::QCD;
 static int
 FEenableexcept (unsigned int excepts)
 {
+#if 0
   static fenv_t fenv;
   unsigned int new_excepts = excepts & FE_ALL_EXCEPT,
     old_excepts;  // previous masks
@@ -20,6 +21,9 @@ FEenableexcept (unsigned int excepts)
   fenv.__mxcsr   &= ~(new_excepts << 7);
 
   return ( fesetenv (&fenv) ? -1 : old_excepts );
+#else
+  return 0;
+#endif
 }
 
 
@@ -35,7 +39,7 @@ public:
 
     random(pRNG,scale);
 
-    scale = exp(-real(scale)*6.0);
+    scale = exp(-real(scale)*3.0);
     std::cout << " True matrix \n"<< scale <<std::endl;
   }
 
@@ -70,7 +74,7 @@ public:
 int main (int argc, char ** argv)
 {
 
-  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 
+  //  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 
 
   Grid_init(&argc,&argv);
 
@@ -88,8 +92,7 @@ int main (int argc, char ** argv)
   RealD mu    = 0.0;
   int order = 11;
   ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
-
-  std::ofstream file("pooh.dat");
+  std::ofstream file("cheby.dat");
   Cheby.csv(file);
 
   HermOpOperatorFunction<LatticeComplex> X;
@@ -114,9 +117,9 @@ int main (int argc, char ** argv)
   }
   
   {
-    std::vector<RealD>          eval(Nm);
-    std::vector<LatticeComplex> evec(Nm,grid);
-    ChebyIRL.calc(eval,evec,src, Nconv);
+    //    std::vector<RealD>          eval(Nm);
+    //    std::vector<LatticeComplex> evec(Nm,grid);
+    //    ChebyIRL.calc(eval,evec,src, Nconv);
   }
 
   Grid_finalize();