Merge branch 'master' of https://github.com/paboyle/Grid

2025-08-02 04:37:06 +01:00 · 2015-12-09 12:48:44 +00:00
parent 967be91692 26161addd0
commit a32a59fc43
24 changed files with 406 additions and 221 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=100;
+  int ncall=10000;
  {
    double t0=usecond();
    for(int i=0;i<ncall;i++){
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -137,9 +137,6 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME

-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION

--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -17,7 +17,6 @@

 #define __X86_64

-
 #ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -5,22 +5,29 @@
 #include <ctime>
 #include <chrono>
 #include <string.h>
-#include <sys/ioctl.h>
-#include <sys/syscall.h>
-#include <linux/perf_event.h>

+#include <sys/ioctl.h>
+
+#ifdef __linux__
+#include <syscall.h>
+#include <linux/perf_event.h>
+#else
+#include <sys/syscall.h>
+#endif
 namespace Grid {


+#ifdef __linux__
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 			    int cpu, int group_fd, unsigned long flags)
 {
-  int ret;
+  int ret=0;

  ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
 		group_fd, flags);
  return ret;
 }
+#endif


 class PerformanceCounter {
@@ -63,7 +70,6 @@ public:
    
  int PCT;

-  struct perf_event_attr pe;
  long long count;
  int fd;
  uint64_t elapsed;
@@ -74,15 +80,19 @@ public:
  }

  PerformanceCounter(int _pct) {
+#ifdef __linux__
    assert(_pct>=0);
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
    fd=-1;
    count=0;
    PCT =_pct;
    Open();
+#endif
  }
  void Open(void) 
  {
+#ifdef __linux__
+    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(struct perf_event_attr));
    pe.size = sizeof(struct perf_event_attr);

@@ -99,32 +109,48 @@ public:
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
+#endif
  }

  void Start(void)
  {
+#ifdef __linux__
    if ( fd!= -1) {
      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    }
    begin  =__rdtsc();
+#else
+    begin = 0;
+#endif
  }

  void Stop(void) {
    count=0;
+#ifdef __linux__
    if ( fd!= -1) {
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::read(fd, &count, sizeof(long long));
    }
    elapsed = __rdtsc() - begin;
+#else
+    elapsed = 0;
+#endif
+
  }
  void Report(void) {
+#ifdef __linux__
    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+#else
+    printf("%llu cycles \n", elapsed );
+#endif
  }

  ~PerformanceCounter()
  {
+#ifdef __linux__
    close(fd);
+#endif
  }

 };
--- a/lib/Threads.h
+++ b/lib/Threads.h
@@ -44,7 +44,7 @@ class GridThread {
  };
  static void SetMaxThreads(void) { 
 #ifdef GRID_OMP
-    setenv("KMP_AFFINITY","balanced",1);
+    //    setenv("KMP_AFFINITY","balanced",1);
    _threads = omp_get_max_threads();
    omp_set_num_threads(_threads);
 #else 
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -264,6 +264,9 @@ PARALLEL_FOR_LOOP

      for(int i=0;i<nbasis;i++){
 	phi=Subspace.subspace[i];
+	
+	std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
 	for(int p=0;p<geom.npoint;p++){ 

 	  int dir   = geom.directions[p];
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -166,7 +166,6 @@ namespace Grid {
      Field *Tn  = &T1;
      Field *Tnp = &T2;

-      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
      // Tn=T1 = (xscale M + mscale)in
      RealD xscale = 2.0/(hi-lo);
      RealD mscale = -(hi+lo)/(hi-lo);
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -25,6 +25,9 @@ template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
  assert(N==M);
 }

+template<class T> void Resize(DenseVector<T > & mat, int N) { // one-index overload: sets the vector's length to N through its own resize()
+  mat.resize(N);
+}
 template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
  mat.resize(N);
  for(int i=0;i<N;i++){
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -1,4 +1,3 @@
-#if 0
 #ifndef GRID_IRL_H
 #define GRID_IRL_H

@@ -18,8 +17,9 @@ template<class Field>
    const RealD small = 1.0e-16;
 public:       
    int lock;
-    int converged;
+    int get;
    int Niter;
+    int converged;

    int Nk;      // Number of converged sought
    int Np;      // Np -- Number of spare vecs in kryloc space
@@ -59,6 +59,7 @@ public:
    // Sanity checked this routine (step) against Saad.
    /////////////////////////
    void RitzMatrix(DenseVector<Field>& evec,int k){
+
      if(1) return;

      GridBase *grid = evec[0]._grid;
@@ -451,8 +452,9 @@ until convergence
 	std::cout << " -- Nconv       = "<< Nconv  << "\n";
      }

-
+    /////////////////////////////////////////////////
    // Adapted from Rudy's lanczos factor routine
+    /////////////////////////////////////////////////
    int Lanczos_Factor(int start, int end,  int cont,
 		       DenseVector<Field> & bq, 
 		       Field &bf,
@@ -546,10 +548,16 @@ until convergence
 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';

 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
-#if 0
 	int re = 0;
+	// FIXME undefined params; how set in Rudy's code
+	int ref =0;
+	Real rho = 1.0e-8;
+
 	while( re == ref || (sqbt < rho * bck && re < 5) ){

+	  Field tmp2(grid);
+	  Field tmp1(grid);
+
 	  //bex = V^dag bf
 	  DenseVector<ComplexD> bex(j+1);
 	  for(int k=0;k<j+1;k++){
@@ -566,14 +574,14 @@ until convergence

 	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
 	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + bex[j];	      sqbt = sqrt(real(btc));	      
+	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
+	  // FIXME is alpha real in RUDY's code?
 	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
 	  bck = sqrt( nmbex );
 	  re++;
 	}
 	std::cout << "Iteratively refined orthogonality, changes alpha\n";
 	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
-#endif
 	H[j][j]=alpha;
      }

@@ -641,7 +649,7 @@ until convergence
      int M=Nm;

      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm,Nm);
+      Resize(evals,Nm);
      Resize(evecs,Nm);

      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
@@ -702,7 +710,6 @@ until convergence
 	RealD beta;

 	Householder_vector<RealD>(ck, 0, 2, v, beta);
-
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
 	///Accumulate eigenvector
@@ -758,11 +765,11 @@ until convergence
      RealD resid_nrm=  norm2(bf);

      if(!lock) converged = 0;
-
+#if 0
      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){

 	RealD diff = 0;
-	diff = abs(tevecs[i][Nm - 1 - lock_num]) * resid_nrm;
+	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;

 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;

@@ -785,53 +792,29 @@ until convergence
 	  break;
 	}
      }
+#endif
      std::cout << "Got " << converged << " so far " <<std::endl;	
    }
-#if 0
-    ///Check
-    void Check(void) {

-      DenseVector<RealD> goodval(get);
+    ///Check
+    void Check(DenseVector<RealD> &evals,
+	       DenseVector<DenseVector<RealD> > &evecs) {
+
+      DenseVector<RealD> goodval(this->get);
+
      EigenSort(evals,evecs);

      int NM = Nm;
-      int Nget = this->get;
-      S **V;
-      V = new S* [NM];

-      RealD *QZ;
-      QZ = new RealD [NM*NM];
+      DenseVector< DenseVector<RealD> > V; Size(V,NM);
+      DenseVector<RealD> QZ(NM*NM);
+
      for(int i = 0; i < NM; i++){
 	for(int j = 0; j < NM; j++){
-
-	  QZ[i*NM+j] = this->evecs[i][j];
-      
-          int f_size_cb = 24*dop.cbLs*dop.node_cbvol;
-
-	  for(int cb = this->prec; cb < 2; cb++){
-	    for(int i = 0; i < NM; i++){
-	      V[i] = (S*)(this->bq[i][cb]);
-
-	      const int m0 = 4 * 4; // this is new code
-	      assert(m0 % 16 == 0); // see the reason in VtimesQ.C
-
-	      const int row_per_thread = f_size_cb / (bfmarg::threads);
-	      {
-
-		{
-		  DenseVector<RealD> vrow_tmp0(m0*NM);
-		  DenseVector<RealD> vrow_tmp1(m0*NM);
-		  RealD *row_tmp0 = vrow_tmp0.data();
-		  RealD *row_tmp1 = vrow_tmp1.data();
-		  VtimesQ(QZ, NM, V, row_tmp0, row_tmp1, id * row_per_thread, m0, (id + 1) * row_per_thread);
-		}
-	      }
-	    }
-	  }
+	  // evecs[i][j];
 	}
      }
    }
-#endif


 /**
@@ -1020,4 +1003,4 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx

 }
 #endif
-#endif
+
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -24,6 +24,17 @@ PARALLEL_FOR_LOOP
    return ret;
  }

+  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){ // site-by-site div(rhs,y); returns a new lattice built on rhs's grid
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard; // result carries the same checkerboard tag as the input
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){ // one iteration per outer site reported by the grid
+      ret._odata[ss]=div(rhs._odata[ss],y); // forwards to the div() overload for the site object type
+    }
+    return ret;
+  }
+
  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -266,11 +266,8 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    if( this->HandOptDslash ) {
 #pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
 	for(int s=0;s<Ls;s++){
-	  int sU=ss;
-	  if (    LebesgueOrder::UseLebesgueOrder ) {
-	    sU=lo.Reorder(ss);
-	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
@@ -323,52 +320,42 @@ PARALLEL_FOR_LOOP
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
+      /*

-#pragma omp parallel for 
+#pragma omp parallel for schedule(static)
      for(int t=0;t<threads;t++){

 	int hyperthread = t%HT;
 	int core        = t/HT;

-        int sswork, swork,soff, sU,sF;
-
-	sswork = (nwork + cores-1)/cores;
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);

 	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+core*sswork; // max locality within an L2 slice
-	  if ( LebesgueOrder::UseLebesgueOrder ) {
-	    sU = lo.Reorder(sU);
+	  sU=ss+ ssoff;
+	  for(int s=soff;s<soff+swork;s++){
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	  }
-	  if ( sU < nwork ) {
-	    for(int s=soff;s<soff+swork;s++){
-	      sF = s+Ls*sU;
-	      Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
-	    }
-	  }
-	}
-      }
-
-      /*
-#pragma omp parallel for schedule(static)
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  int sU=ss;
-	  if (    LebesgueOrder::UseLebesgueOrder ) {
-	    sU=lo.Reorder(ss);
-	  }
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
      */

+#pragma omp parallel for schedule(static)
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
+	}
+      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
 	for(int s=0;s<Ls;s++){
-	  //	  int sU=lo.Reorder(ss);
-	  int sU=ss;
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -29,7 +29,7 @@ namespace Grid {
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 #if defined(AVX512) || defined(IMCI)
-     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 #else
--- a/lib/serialisation/BinaryIO.cc
+++ b/lib/serialisation/BinaryIO.cc
@@ -1,15 +1,14 @@
 #include <Grid.h>

-using namespace Grid;
-using namespace std;

+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-BinaryWriter::BinaryWriter(const string &fileName)
-: file_(fileName, ios::binary|ios::out)
+BinaryWriter::BinaryWriter(const std::string &fileName)
+: file_(fileName, std::ios::binary|std::ios::out)
 {}

 template <>
-void BinaryWriter::writeDefault(const string &s, const string &output)
+void BinaryWriter::writeDefault(const std::string &s, const std::string &output)
 {
  uint64_t sz = output.size();
  
@@ -21,12 +20,12 @@ void BinaryWriter::writeDefault(const string &s, const string &output)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-BinaryReader::BinaryReader(const string &fileName)
-: file_(fileName, ios::binary|ios::in)
+BinaryReader::BinaryReader(const std::string &fileName)
+: file_(fileName, std::ios::binary|std::ios::in)
 {}

 template <>
-void BinaryReader::readDefault(const string &s, string &output)
+void BinaryReader::readDefault(const std::string &s, std::string &output)
 {
  uint64_t sz;
  
@@ -34,3 +33,4 @@ void BinaryReader::readDefault(const string &s, string &output)
  output.reserve(sz);
  file_.read((char *)output.data(), sz);
 }
+}
--- a/lib/serialisation/TextIO.cc
+++ b/lib/serialisation/TextIO.cc
@@ -1,14 +1,12 @@
 #include <Grid.h>

-using namespace Grid;
-using namespace std;
-
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-TextWriter::TextWriter(const string &fileName)
-: file_(fileName, ios::out)
+TextWriter::TextWriter(const std::string &fileName)
+: file_(fileName, std::ios::out)
 {}

-void TextWriter::push(const string &s)
+void TextWriter::push(const std::string &s)
 {
  level_++;
 };
@@ -27,11 +25,11 @@ void TextWriter::indent(void)
 };

 // Reader implementation ///////////////////////////////////////////////////////
-TextReader::TextReader(const string &fileName)
-: file_(fileName, ios::in)
+TextReader::TextReader(const std::string &fileName)
+: file_(fileName, std::ios::in)
 {}

-void TextReader::push(const string &s)
+void TextReader::push(const std::string &s)
 {
  level_++;
 };
@@ -50,17 +48,18 @@ void TextReader::checkIndent(void)
    file_.get(c);
    if (c != '\t')
    {
-      cerr << "mismatch on tab " << c << " level " << level_;
-      cerr << " i "<< i <<endl;
-      abort();
+      std::cerr << "mismatch on tab " << c << " level " << level_;
+      std::cerr << " i "<< i <<std::endl;
+      std::abort();
    }
  }
 }

 template <>
-void TextReader::readDefault(const string &s, string &output)
+void TextReader::readDefault(const std::string &s, std::string &output)
 {
  checkIndent();
  output.clear();
  getline(file_, output);
 }
+}
--- a/lib/serialisation/XmlIO.cc
+++ b/lib/serialisation/XmlIO.cc
@@ -1,10 +1,8 @@
 #include <Grid.h>

-using namespace Grid;
-using namespace std;
-
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const string &fileName)
+XmlWriter::XmlWriter(const std::string &fileName)
 : fileName_(fileName)
 {
  node_ = doc_.append_child();
@@ -16,7 +14,7 @@ XmlWriter::~XmlWriter(void)
  doc_.save_file(fileName_.c_str(), "  ");
 }

-void XmlWriter::push(const string &s)
+void XmlWriter::push(const std::string &s)
 {
  node_ = node_.append_child(s.c_str());
 }
@@ -27,22 +25,22 @@ void XmlWriter::pop(void)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const string &fileName)
+XmlReader::XmlReader(const std::string &fileName)
 : fileName_(fileName)
 {
  pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
  
  if ( !result )
  {
-    cerr << "XML error description: " << result.description() << "\n";
-    cerr << "XML error offset     : " << result.offset        << "\n";
-    abort();
+    std::cerr << "XML error description: " << result.description() << "\n";
+    std::cerr << "XML error offset     : " << result.offset        << "\n";
+    std::abort();
  }
  
  node_ = doc_.child("grid");
 }

-void XmlReader::push(const string &s)
+void XmlReader::push(const std::string &s)
 {
  node_ = node_.child(s.c_str());
 }
@@ -53,7 +51,8 @@ void XmlReader::pop(void)
 }

 template <>
-void XmlReader::readDefault(const string &s, string &output)
+void XmlReader::readDefault(const std::string &s, std::string &output)
 {
  output = node_.child(s.c_str()).first_child().value();
 }
+}
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -96,6 +96,7 @@ namespace Grid
      node_.child("elem").set_name("elem-done");
      i++;
    }
+    //    assert( is.tellg()==-1);
    pop();
  }
  
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,6 +67,14 @@ namespace Grid {
    }
  };

+  template<class scalar> struct DivIntFunctor { // unary functor that divides its argument by a fixed Integer y
+    Integer y; // divisor stored at construction; NOTE(review): nothing here guards against y==0
+  DivIntFunctor(Integer _y) : y(_y) {};
+    scalar operator()(const scalar &a)  const {
+      return Integer(a)/y; // a is converted to Integer first, and the quotient is converted back to scalar on return
+    }
+  };
+
  template<class scalar> struct RealFunctor {
    scalar operator()(const scalar &a)  const {
      return real(a);
@@ -131,6 +139,10 @@ namespace Grid {
  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(ModIntFunctor<S>(y),r);
  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) { // Grid_simd overload of div(r,y), parallel to mod() just above
+    return SimdApply(DivIntFunctor<S>(y),r); // hands DivIntFunctor<S>(y) and r to SimdApply
+  }
  ////////////////////////////////////////////////////////////////////////////
  // Allows us to assign into **conformable** real vectors from complex
  ////////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -111,7 +111,7 @@ template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> type
  return ret;
 }

-
+BINARY_RSCALAR(div,Integer);
 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);

--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -59,7 +59,7 @@ clang-avx2)
 CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-avx-openmp)
-CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
+CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-xc30)
 CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS=""  LIBS="-lgmp -lmpfr" --enable-comms=none
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,6 @@
 bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_RectPlaq

+
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid

--- a/tests/Test_cheby.cc
+++ b/tests/Test_cheby.cc
@@ -57,5 +57,21 @@ int main (int argc, char ** argv)
    ChebyStep.csv(of);
  }

+  lo=-8;
+  hi=8;
+  Chebyshev<LatticeFermion> ChebyIndefInv(lo,hi,40,InverseApproximation);
+  {
+    std::ofstream of("chebyindefinv");
+    ChebyIndefInv.csv(of);
+  }
+
+  lo=0;
+  hi=64;
+  Chebyshev<LatticeFermion> ChebyNE(lo,hi,40,InverseApproximation);
+  {
+    std::ofstream of("chebyNE");
+    ChebyNE.csv(of);
+  }
+
  Grid_finalize();
 }
--- a/tests/Test_dwf_hdcr.cc
+++ b/tests/Test_dwf_hdcr.cc
@@ -6,6 +6,22 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

+class myclass: Serializable { // bundle of run parameters for this test, declared through GRID_DECL_CLASS_MEMBERS
+public:
+
+  GRID_DECL_CLASS_MEMBERS(myclass,
+			  int, domaindecompose,
+			  int, domainsize,
+			  int, order,
+			  double, lo,
+			  double, hi,
+			  int, steps);
+
+  myclass(){}; // empty body; NOTE(review): where the members receive their values is not visible in this hunk
+
+};
+myclass params; // file-scope instance; the preconditioner below reads domaindecompose, domainsize, lo, hi, order and steps from it
+
 RealD InverseApproximation(RealD x){
  return 1.0/x;
 }
@@ -26,15 +42,21 @@ public:

  Aggregates     & _Aggregates;
  CoarseOperator & _CoarseOperator;
-  Matrix         & _Matrix;
+  Matrix         & _FineMatrix;
  FineOperator   & _FineOperator;
+  Matrix         & _SmootherMatrix;
+  FineOperator   & _SmootherOperator;

  // Constructor
-  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, FineOperator &Fine,Matrix &FineMatrix) 
+  MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, 
+			  FineOperator &Fine,Matrix &FineMatrix,
+			  FineOperator &Smooth,Matrix &SmootherMatrix) 
    : _Aggregates(Agg),
      _CoarseOperator(Coarse),
      _FineOperator(Fine),
-      _Matrix(FineMatrix)
+      _FineMatrix(FineMatrix),
+      _SmootherOperator(Smooth),
+      _SmootherMatrix(SmootherMatrix)
  {
  }

@@ -43,7 +65,7 @@ public:
    FineField p1(in._grid);
    FineField p2(in._grid);

-    MdagMLinearOperator<Matrix,FineField>   fMdagMOp(_Matrix);
+    MdagMLinearOperator<Matrix,FineField>   fMdagMOp(_FineMatrix);

    p1=in;
    RealD absp2;
@@ -58,74 +80,20 @@ public:
    }
  }

-#if 0
  void operator()(const FineField &in, FineField & out) {
-
-    FineField Min(in._grid);
-    FineField tmp(in._grid);
-
-    CoarseVector Csrc(_CoarseOperator.Grid());
-    CoarseVector Ctmp(_CoarseOperator.Grid());
-    CoarseVector Csol(_CoarseOperator.Grid());
-
-    // Monitor completeness of low mode space
-    _Aggregates.ProjectToSubspace  (Csrc,in);
-    _Aggregates.PromoteFromSubspace(Csrc,out);
-    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
-
-    // Build some solvers
-    ConjugateGradient<FineField>    fCG(1.0e-3,1000);
-    ConjugateGradient<CoarseVector>  CG(1.0e-8,100000);
-
-    ////////////////////////////////////////////////////////////////////////
-    // ADEF2: [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
-    ////////////////////////////////////////////////////////////////////////
-
-    // Smoothing step, followed by coarse grid correction
-    MdagMLinearOperator<Matrix,FineField> MdagMOp(_Matrix);
-
-    Min=in;
-    std::cout<<GridLogMessage<< " Preconditioner in  " << norm2(in)<<std::endl; 
-    _FineOperator.AdjOp(Min,tmp);
-    std::cout<<GridLogMessage<< " Preconditioner tmp  " << norm2(in)<<std::endl; 
-
-    fCG(MdagMOp,tmp,out);
-
-    _FineOperator.Op(out,tmp);
-
-    std::cout<<GridLogMessage<< " Preconditioner in  " << norm2(in)<<std::endl; 
-    std::cout<<GridLogMessage<< " Preconditioner out " << norm2(out)<<std::endl; 
-    std::cout<<GridLogMessage<< " Preconditioner Aout" << norm2(tmp)<<std::endl; 
-
-    tmp = tmp - in;
-    
-    std::cout<<GridLogMessage<<"preconditioner thinks residual is "<<std::sqrt(norm2(tmp)/norm2(in))<<std::endl;
-
-    /*
-    //    _FineOperator.Op(Min,out);
-    //    out = in -out; // out = in - A Min
-    out = in;
-
-        MdagMLinearOperator<CoarseOperator,CoarseVector> MdagMOp(_CoarseOperator);
-    HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
-    Csol=zero;
-    _Aggregates.ProjectToSubspace  (Csrc,out);
-    HermOp.AdjOp(Csrc,Ctmp);// Normal equations
-    CG(MdagMOp  ,Ctmp,Csol);
-    _Aggregates.PromoteFromSubspace(Csol,out);
-
-    out = Min + out;;
-    */
-
+    if ( params.domaindecompose ) {
+      operatorSAP(in,out);
+    } else { 
+      operatorCheby(in,out);
+    }
  }
-#endif

    ////////////////////////////////////////////////////////////////////////
    // ADEF2: [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    // ADEF1: [MP+Q ] in =M [1 - A Q] in + Q in  
    ////////////////////////////////////////////////////////////////////////
-#if 0
-  void operator()(const FineField &in, FineField & out) {
+#if 1
+  void operatorADEF2(const FineField &in, FineField & out) {

    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -136,7 +104,7 @@ public:

    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    MdagMLinearOperator<Matrix,FineField>               fMdagMOp(_Matrix);
+    MdagMLinearOperator<Matrix,FineField>               fMdagMOp(_FineMatrix);

    FineField tmp(in._grid);
    FineField res(in._grid);
@@ -189,8 +157,8 @@ public:
  }
 #endif
  // ADEF1: [MP+Q ] in =M [1 - A Q] in + Q in  
-#if 0
-  void operator()(const FineField &in, FineField & out) {
+#if 1
+  void operatorADEF1(const FineField &in, FineField & out) {

    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -201,7 +169,7 @@ public:

    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    ShiftedMdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix,0.1);
+    ShiftedMdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix,0.1);

    FineField tmp(in._grid);
    FineField res(in._grid);
@@ -234,14 +202,79 @@ public:
  }
 #endif

+  void SAP (const FineField & src,FineField & psi){ // repeatedly corrects psi in place against src, alternating between the subset==0 and subset==1 sites
+
+    Lattice<iScalar<vInteger> > coor(src._grid); // scratch integer field, refilled for each direction in the loop below
+    Lattice<iScalar<vInteger> > subset(src._grid); // per-site colour (0 or 1) once the mod() below has run
+    
+    FineField r(src._grid);
+    FineField zz(src._grid); zz=zero; // all-zero field, used as the "else" operand of where()
+    FineField vec1(src._grid);
+    FineField vec2(src._grid);
+
+    const Integer block=params.domainsize; // divisor applied to the coordinates; NOTE(review): a zero domainsize would divide by zero inside div()
+
+    subset=zero;
+    for(int mu=0;mu<Nd;mu++){
+      LatticeCoordinate(coor,mu+1); // direction index is mu+1, so index 0 is skipped; presumably that is the fifth (s) dimension, confirm
+      coor = div(coor,block); // coordinate divided by block size: the block index along this direction
+      subset = subset+coor;
+    }
+    subset = mod(subset,(Integer)2); // parity of the summed block indices
+    
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);
+    Chebyshev<FineField> Cheby  (params.lo,params.hi,params.order,InverseApproximation);
+
+    RealD resid;
+    for(int i=0;i<params.steps;i++){ // params.steps sweeps; each sweep does a subset==0 update followed by a subset==1 update
+      
+      // Even domain residual
+      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
+      r= src - vec1 ;
+      resid = norm2(r) /norm2(src); // ratio of norm2 values; no sqrt is applied here
+      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
+
+
+// Npoly*outer*2 1/2 vol matmuls.
+// 71 iters => 20*71 = 1400 matmuls.
+// 2*71 = 140 comms.
+
+      // Even domain solve
+      r= where(subset==(Integer)0,r,zz); // keep r where subset==0, take zz (zero) everywhere else
+      _SmootherOperator.AdjOp(r,vec1);
+      Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
+      psi = psi + vec2;  
+
+      // Odd domain residual
+      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
+      r= src - vec1 ;
+      r= where(subset==(Integer)1,r,zz); // keep r where subset==1, take zz (zero) everywhere else
+
+      resid = norm2(r) /norm2(src); // NOTE(review): r has already been masked to subset==1 here, unlike the other two residual prints
+      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
+      
+      // Odd domain solve
+      _SmootherOperator.AdjOp(r,vec1);
+      Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
+      psi = psi + vec2;  
+
+      _FineOperator.Op(psi,vec1);// this is the G5 herm bit
+      r= src - vec1 ;
+      resid = norm2(r) /norm2(src); // full (unmasked) residual after both updates of this sweep
+      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
+
+    }
+
+  };
+
  void SmootherTest (const FineField & in){
    
    FineField vec1(in._grid);
    FineField vec2(in._grid);
    RealD lo[3] = { 0.5, 1.0, 2.0};

-    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix);
-    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_Matrix,0.5);
+    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix);
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);

    RealD Ni,r;

@@ -250,7 +283,7 @@ public:
    for(int ilo=0;ilo<3;ilo++){
      for(int ord=5;ord<50;ord*=2){

-	_FineOperator.AdjOp(in,vec1);
+	_SmootherOperator.AdjOp(in,vec1);

 	Chebyshev<FineField> Cheby  (lo[ilo],70.0,ord,InverseApproximation);
 	Cheby(fMdagMOp,vec1,vec2);    // solves  MdagM = g5 M g5M
@@ -264,7 +297,7 @@ public:
    }
  }

-  void operator()(const FineField &in, FineField & out) {
+  void operatorCheby(const FineField &in, FineField & out) {

    CoarseVector Csrc(_CoarseOperator.Grid());
    CoarseVector Ctmp(_CoarseOperator.Grid());
@@ -275,18 +308,18 @@ public:

    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
-    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_Matrix);
-    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_Matrix,0.0);
+    //    MdagMLinearOperator<Matrix,FineField>        fMdagMOp(_FineMatrix);
+    ShiftedMdagMLinearOperator<Matrix,FineField> fMdagMOp(_SmootherMatrix,0.0);

    FineField vec1(in._grid);
    FineField vec2(in._grid);

    //    Chebyshev<FineField> Cheby    (0.5,70.0,30,InverseApproximation);
    //    Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
-    Chebyshev<FineField> Cheby    (2.0,70.0,10,InverseApproximation);
-    Chebyshev<FineField> ChebyAccu(2.0,70.0,10,InverseApproximation);
-    Cheby.JacksonSmooth();
-    ChebyAccu.JacksonSmooth();
+    Chebyshev<FineField> Cheby    (2.0,70.0,15,InverseApproximation);
+    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
+    //    Cheby.JacksonSmooth();
+    //    ChebyAccu.JacksonSmooth();

    _Aggregates.ProjectToSubspace  (Csrc,in);
    _Aggregates.PromoteFromSubspace(Csrc,out);
@@ -305,7 +338,7 @@ public:

    RealD Ni = norm2(in);

-    _FineOperator.AdjOp(in,vec1);// this is the G5 herm bit
+    _SmootherOperator.AdjOp(in,vec1);// this is the G5 herm bit
    ChebyAccu(fMdagMOp,vec1,out);    // solves  MdagM = g5 M g5M

    std::cout<<GridLogMessage << "Smoother norm "<<norm2(out)<<std::endl;
@@ -334,23 +367,89 @@ public:
    std::cout<<GridLogMessage << "Coarse resid "<<std::sqrt(r/Ni)<<std::endl;

    // Reapply smoother
-    _FineOperator.Op(vec1,vec2);  // this is the G5 herm bit
+    _SmootherOperator.Op(vec1,vec2);  // this is the G5 herm bit
    ChebyAccu(fMdagMOp,vec2,vec1);    // solves  MdagM = g5 M g5M

    out =out+vec1;
-    _FineOperator.Op(out,vec1);// this is the G5 herm bit
    vec1  = in - vec1;   // tmp  = in - A Min
    r=norm2(vec1);
    std::cout<<GridLogMessage << "Smoother resid "<<std::sqrt(r/Ni)<<std::endl;

  }

+  void operatorSAP(const FineField &in, FineField & out) {
+    // Two-level preconditioner step: SAP smooth, coarse-grid correction, SAP smooth again; logs residuals.
+    CoarseVector Csrc(_CoarseOperator.Grid());
+    CoarseVector Ctmp(_CoarseOperator.Grid());
+    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
+    // Coarse-grid solver; arguments are presumably (tolerance, max iterations) -- confirm.
+    ConjugateGradient<CoarseVector>  CG(1.0e-3,100000);
+
+    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
+    MdagMLinearOperator<CoarseOperator,CoarseVector>     MdagMOp(_CoarseOperator);
+
+    FineField vec1(in._grid);
+    FineField vec2(in._grid);
+    // Diagnostic: project in onto the aggregates, promote back into out, and log the norm ratio.
+    _Aggregates.ProjectToSubspace  (Csrc,in);
+    _Aggregates.PromoteFromSubspace(Csrc,out);
+    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
+    
+
+    // Pre-smooth with SAP (defined outside this hunk).
+    // NOTE(review): the previous text here described multiplying by Mdag and a
+    // polynomial approximation of (MdagM)^-1; this method calls SAP instead, so
+    // that text looked carried over from the Chebyshev variant -- confirm.
+    SAP(in,out);
+
+    // Residual of the smoothed solution: vec1 = in - A out
+    _FineOperator.Op(out,vec1);// this is the G5 herm bit
+    vec1  = in - vec1;   // tmp  = in - A Min
+
+    RealD r = norm2(vec1);
+    RealD Ni = norm2(in);
+    std::cout<<GridLogMessage << "SAP resid "<<std::sqrt(r/Ni)<< " " << r << " " << Ni <<std::endl;
+    
+    _Aggregates.ProjectToSubspace  (Csrc,vec1);
+    HermOp.AdjOp(Csrc,Ctmp);// Normal equations
+    CG(MdagMOp,Ctmp,Csol);
+    _Aggregates.PromoteFromSubspace(Csol,vec1); // Ass^{-1} [in - A Min]_s
+                                             // Q = Q[in - A Min]  
+    out = out+vec1;
+
+    // Three preconditioner smoothing -- hermitian if C3 = C1
+    // Recompute the residual after the coarse-grid correction
+    _FineOperator.Op(out,vec1);// this is the G5 herm bit
+    vec1  = in - vec1;   // tmp  = in - A Min
+    r=norm2(vec1);
+
+    std::cout<<GridLogMessage << "Coarse resid "<<std::sqrt(r/Ni)<<std::endl;
+
+    // Reapply smoother to the current residual and accumulate the result into out
+    SAP(vec1,vec2);
+    out =out+vec2;
+
+
+    // Final residual of out, computed for the log line below only
+    _FineOperator.Op(out,vec1);// this is the G5 herm bit
+    vec1  = in - vec1;   // tmp  = in - A Min
+
+    r = norm2(vec1);
+    Ni = norm2(in);
+    std::cout<<GridLogMessage << "SAP resid(post) "<<std::sqrt(r/Ni)<< " " << r << " " << Ni <<std::endl;
+
+  }
+
 };

 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

+  XmlReader RD("params.xml");
+  read(RD,"params",params);
+  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
+
  const int Ls=8;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -385,11 +484,27 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeGaugeField Umu(UGrid); 
+  LatticeGaugeField UmuDD(UGrid); 
+  LatticeColourMatrix U(UGrid);
+  LatticeColourMatrix zz(UGrid);

  NerscField header;
  std::string file("./ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);

+
+  if ( params.domaindecompose ) { 
+    Lattice<iScalar<vInteger> > coor(UGrid);
+    zz=zero;
+    for(int mu=0;mu<Nd;mu++){
+      LatticeCoordinate(coor,mu);
+      U = PeekIndex<LorentzIndex>(Umu,mu);
+      U = where(mod(coor,params.domainsize)==(Integer)0,zz,U);
+      PokeIndex<LorentzIndex>(UmuDD,U,mu);
+    }
+  } else { 
+    UmuDD = Umu;
+  }
  //  SU3::ColdConfiguration(RNG4,Umu);
  //  SU3::TepidConfiguration(RNG4,Umu);
  //  SU3::HotConfiguration(RNG4,Umu);
@@ -402,6 +517,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  const int nbasis = 32;
  //  const int nbasis = 4;
@@ -438,6 +554,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Building coarse representation of Indef operator" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOp(Ddwf);
+  Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> HermIndefOpDD(DdwfDD);
  CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LDOp(*Coarse5d);
  LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);

@@ -467,7 +584,13 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Building deflation preconditioner "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;

-  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> Precon(Aggregates, LDOp,HermIndefOp,Ddwf);
+  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> Precon  (Aggregates, LDOp,
+											   HermIndefOp,Ddwf,
+											   HermIndefOp,Ddwf);
+
+  MultiGridPreconditioner <vSpinColourVector,vTComplex,nbasis,DomainWallFermionR> PreconDD(Aggregates, LDOp,
+											   HermIndefOp,Ddwf,
+											   HermIndefOpDD,DdwfDD);
  TrivialPrecon<LatticeFermion> simple;

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -475,9 +598,20 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Precon.SmootherTest(src);

+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  PreconDD.SmootherTest(src);
+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  PreconDD.SAP(src,result);
+
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+
  //  TrivialPrecon<LatticeFermion> simple;
  //  ConjugateGradient<LatticeFermion> fCG(1.0e-8,100000);
  //  fCG(HermDefOp,src,result);
@@ -496,12 +630,22 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Precon.PowerMethod(src);

+
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
+  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
+  result=zero;
+  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  PGCRDD(HermIndefOp,src,result);
+
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
-  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  PGCR(HermIndefOp,src,result);
+  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
+  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  //  result=zero;
+  //  PGCR(HermIndefOp,src,result);

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
@@ -516,6 +660,7 @@ int main (int argc, char ** argv)

  pCG(HermOpEO,src_o,result_o);

+
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -52,6 +52,7 @@ int main (int argc, char ** argv)
  }
  */

+  typedef CartesianStencil<vobj,vobj,SimpleCompressor<vobj> > Stencil;
    for(int dir=0;dir<4;dir++){
      for(int disp=0;disp<Fine._fdimensions[dir];disp++){

@@ -61,7 +62,7 @@ int main (int argc, char ** argv)
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);

-	CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
+	Stencil myStencil(&Fine,npoint,0,directions,displacements);

 	std::vector<int> ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
@@ -142,8 +143,8 @@ int main (int argc, char ** argv)
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);

-	CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements);
-	CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements);
+	Stencil EStencil(&rbFine,npoint,Even,directions,displacements);
+	Stencil OStencil(&rbFine,npoint,Odd,directions,displacements);

 	std::vector<int> ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
--- a/tests/Test_synthetic_lanczos.cc
+++ b/tests/Test_synthetic_lanczos.cc
@@ -8,6 +8,7 @@ using namespace Grid::QCD;
 static int
 FEenableexcept (unsigned int excepts)
 {
+#if 0
  static fenv_t fenv;
  unsigned int new_excepts = excepts & FE_ALL_EXCEPT,
    old_excepts;  // previous masks
@@ -20,6 +21,9 @@ FEenableexcept (unsigned int excepts)
  fenv.__mxcsr   &= ~(new_excepts << 7);

  return ( fesetenv (&fenv) ? -1 : old_excepts );
+#else
+  return 0;
+#endif
 }


@@ -35,7 +39,7 @@ public:

    random(pRNG,scale);

-    scale = exp(-real(scale)*6.0);
+    scale = exp(-real(scale)*3.0);
    std::cout << " True matrix \n"<< scale <<std::endl;
  }

@@ -70,7 +74,7 @@ public:
 int main (int argc, char ** argv)
 {

-  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 
+  //  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 

  Grid_init(&argc,&argv);

@@ -88,8 +92,7 @@ int main (int argc, char ** argv)
  RealD mu    = 0.0;
  int order = 11;
  ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
-
-  std::ofstream file("pooh.dat");
+  std::ofstream file("cheby.dat");
  Cheby.csv(file);

  HermOpOperatorFunction<LatticeComplex> X;
@@ -114,9 +117,9 @@ int main (int argc, char ** argv)
  }
  
  {
-    std::vector<RealD>          eval(Nm);
-    std::vector<LatticeComplex> evec(Nm,grid);
-    ChebyIRL.calc(eval,evec,src, Nconv);
+    //    std::vector<RealD>          eval(Nm);
+    //    std::vector<LatticeComplex> evec(Nm,grid);
+    //    ChebyIRL.calc(eval,evec,src, Nconv);
  }

  Grid_finalize();