
Compare commits


3 Commits

402 changed files with 12652 additions and 60147 deletions

.gitignore

```diff
@@ -92,7 +92,6 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
-.vscode
 # Eigen source #
 ################
@@ -107,10 +106,6 @@ lib/fftw/*
 m4/lt*
 m4/libtool.m4
-# github pages #
-################
-gh-pages/
 # Buck files #
 ##############
 .buck*
@@ -121,5 +116,4 @@ make-bin-BUCK.sh
 # generated sources #
 #####################
 lib/qcd/spin/gamma-gen/*.h
 lib/qcd/spin/gamma-gen/*.cc
```

.travis.yml

```diff
@@ -7,11 +7,9 @@ cache:
 matrix:
   include:
     - os: osx
-      osx_image: xcode8.3
+      osx_image: xcode7.2
       compiler: clang
     - compiler: gcc
-      dist: trusty
-      sudo: required
      addons:
        apt:
          sources:
@@ -26,8 +24,6 @@ matrix:
           - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
-     dist: trusty
-     sudo: required
      addons:
        apt:
          sources:
@@ -42,7 +38,6 @@ matrix:
           - binutils-dev
      env: VERSION=-5
    - compiler: clang
-     dist: trusty
      addons:
        apt:
          sources:
@@ -57,7 +52,6 @@ matrix:
           - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
-     dist: trusty
      addons:
        apt:
          sources:
@@ -79,15 +73,13 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
+   - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi

 install:
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    - echo $PATH
-   - which autoconf
-   - autoconf --version
-   - which automake
-   - automake --version
    - which $CC
    - $CC --version
    - which $CXX
@@ -100,15 +92,15 @@ script:
    - cd build
    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4
-   - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
+   - ./benchmarks/Benchmark_dwf --threads 1
    - echo make clean
    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
-   - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-   - make check
+   - ./benchmarks/Benchmark_dwf --threads 1
    - echo make clean
-   - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-   - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-   - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
+   - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
+   - make -j4
+   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
```

Makefile.am

```diff
@@ -3,15 +3,10 @@ SUBDIRS = lib benchmarks tests extras

 include $(top_srcdir)/doxygen.inc

-bin_SCRIPTS=grid-config
-
-.PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
-
-tests-local: all
-bench-local: all
-check-local: all
+tests: all
+	$(MAKE) -C tests tests
+
+.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)

 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
```

README.md

````diff
@@ -22,26 +22,6 @@ Last update Nov 2016.

 _Please do not send pull requests to the `master` branch which is reserved for releases._

-### Compilers
-
-Intel ICPC v16.0.3 and later
-Clang v3.5 and later (need 3.8 and later for OpenMP)
-GCC v4.9.x (recommended)
-GCC v6.3 and later
-
-### Important:
-
-Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
-
-The safety of these compiler versions cannot be guaranteed at this time. Follow Issue 100 for details and updates.
-
-GCC v5.x
-GCC v6.1, v6.2
-
 ### Bug report

 _To help us tracking and solving more efficiently issues with Grid, please report problems using the issue system of GitHub rather than sending emails to Grid developers._
@@ -52,7 +32,7 @@ When you file an issue, please go though the following checklist:
 2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output the `--version` option of your compiler.
 3. Give the exact `configure` command used.
 4. Attach `config.log`.
-5. Attach `grid.config.summary`.
+5. Attach `config.summary`.
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
@@ -115,10 +95,10 @@ install Grid. Other options are detailed in the next section, you can also use `
 `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
 customise the build.

-Finally, you can build, check, and install Grid:
+Finally, you can build and install Grid:

 ``` bash
-make; make check; make install
+make; make install
 ```

 To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
@@ -141,7 +121,7 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
 - `--enable-precision={single|double}`: set the default precision (default: `double`).
 - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
-- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
+- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
 - `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
@@ -179,6 +159,7 @@ Alternatively, some CPU codenames can be directly used:

 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
+| `KNC`       | [Intel Xeon Phi codename Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
 | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
 | `BGQ`       | Blue Gene/Q                            |
````

TODO

```diff
@@ -1,33 +1,6 @@
 TODO:
 ---------------

-Large item work list:
-
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
-2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
-4)- Precision conversion and sort out localConvert <-- partial
-  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
-
-Recent DONE
-
--- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
--- GaugeFix into central location <-- DONE
--- Scidac and Ildg metadata handling <-- DONE
--- Binary I/O MPI2 IO <-- DONE
--- Binary I/O speed up & x-strips <-- DONE
--- Cut down the exterior overhead <-- DONE
--- Interior legs from SHM comms <-- DONE
--- Half-precision comms <-- DONE
--- Merge high precision reduction into develop <-- DONE
--- BlockCG, BCGrQ <-- DONE
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
--- slice* linalg routines for multiRHS, BlockCG
------

 * Forces; the UdSdU term in gauge force term is half of what I think it should
   be. This is a consequence of taking ONLY the first term in:
@@ -48,8 +21,16 @@ Recent DONE
 This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two.
 This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.

+Policies:
+
+* Link smearing/boundary conds; Policy class based implementation ; framework more in place
+
 * Support different boundary conditions (finite temp, chem. potential ... )

+* Support different fermion representations?
+  - contained entirely within the integrator presently
+
 - Sign of force term.

 - Reversibility test.
@@ -60,6 +41,11 @@ Recent DONE

 - Audit oIndex usage for cb behaviour

+- Rectangle gauge actions.
+  Iwasaki,
+  Symanzik,
+  ... etc...
+
 - Prepare multigrid for HMC. - Alternate setup schemes.

 - Support for ILDG --- ugly, not done
@@ -69,11 +55,9 @@ Recent DONE
 - FFTnD ?

 - Gparity; hand opt use template specialisation elegance to enable the optimised paths ?

 - Gparity force term; Gparity (R)HMC.
-- Random number state save restore

 - Mobius implementation clean up to rmove #if 0 stale code sequences
 - CG -- profile carefully, kernel fusion, whole CG performance measurements.

 ================================================================
@@ -106,7 +90,6 @@ Insert/Extract
 Not sure of status of this -- reverify. Things are working nicely now though.

 * Make the Tensor types and Complex etc... play more nicely.
-  - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
   QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
   want to introduce a syntax that does not require this.
@@ -129,8 +112,6 @@ Not sure of status of this -- reverify. Things are working nicely now though.
 RECENT
 ---------------

-  - Support different fermion representations? -- DONE
-    - contained entirely within the integrator presently
   - Clean up HMC                                              -- DONE
   - LorentzScalar<GaugeField> gets Gauge link type (cleaner). -- DONE
   - Simplified the integrators a bit.                         -- DONE
@@ -142,26 +123,6 @@ RECENT
   - Parallel io improvements                                  -- DONE
   - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE

-DONE:
-  - MultiArray                         -- MultiRHS done
-  - ConjugateGradientMultiShift        -- DONE
-  - MCR                                -- DONE
-  - Remez -- Mike or Boost?            -- DONE
-  - Proto (ET)                         -- DONE
-  - uBlas                              -- DONE ; Eigen
-  - Potentially Useful Boost libraries -- DONE ; Eigen
-  - Aligned allocator; memory pool     -- DONE
-  - Multiprecision                     -- DONE
-  - Serialization                      -- DONE
-  - Regex                              -- Not needed
-  - Tokenize                           -- Why?
-
-  - Random number state save restore   -- DONE
-  - Rectangle gauge actions.           -- DONE
-    Iwasaki,
-    Symanzik,
-    ... etc...
-
 Done: Cayley, Partial , ContFrac force terms.

 DONE
@@ -246,7 +207,6 @@ Done
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
 ======================================================================================================

-* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE
 * Command line args for geometry, simd, etc. layout. Is it necessary to have
   user pass these? Is this a QCD specific?
```
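The expression behind the factor-of-two note in the first TODO hunk is elided between hunks, but the standard argument it alludes to can be sketched (this is an assumption, not a quote from the file): varying the action with respect to a link produces a term plus its adjoint,

```latex
\delta S \;=\; \mathrm{Tr}\!\left[\frac{\partial S}{\partial U}\,\delta U\right]
        \;+\; \mathrm{Tr}\!\left[\left(\frac{\partial S}{\partial U}\,\delta U\right)^{\dagger}\right]
        \;=\; 2\,\mathrm{Re}\,\mathrm{Tr}\!\left[\frac{\partial S}{\partial U}\,\delta U\right]
```

so evaluating only the first trace yields half of the true variation, which is why the Test_xxx_force routines apply the 2x by hand.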

VERSION

```diff
@@ -1,5 +1,6 @@
-Version : 0.7.0
+Version : 0.6.0

-- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
-- MPI and MPI3 comms optimisations for KNL and OPA finished
-- Half precision comms
+- AVX512, AVX2, AVX, SSE good
+- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
+- MPI and MPI3
+- HiRep, Smearing, Generic gauge group
```

benchmarks/Benchmark_comms.cc

```diff
@@ -31,32 +31,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

-struct time_statistics{
-  double mean;
-  double err;
-  double min;
-  double max;
-
-  void statistics(std::vector<double> v){
-      double sum = std::accumulate(v.begin(), v.end(), 0.0);
-      mean = sum / v.size();
-
-      std::vector<double> diff(v.size());
-      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
-      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
-
-      auto result = std::minmax_element(v.begin(), v.end());
-      min = *result.first;
-      max = *result.second;
-  }
-};
-
-void header(){
-  std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
-            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
-};
-
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
```
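The time_statistics helper deleted above is self-contained; a minimal standalone sketch of it, fed with made-up timings, shows what it reports (mean, standard error of the mean, min, max):

```cpp
// Standalone sketch of the time_statistics struct removed in the hunk above.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

struct time_statistics {
  double mean, err, min, max;
  void statistics(const std::vector<double> &v) {
    mean = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
    std::vector<double> diff(v.size());
    std::transform(v.begin(), v.end(), diff.begin(),
                   [=](double x) { return x - mean; });
    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); // std. error of the mean
    auto mm = std::minmax_element(v.begin(), v.end());
    min = *mm.first;
    max = *mm.second;
  }
};

int main() {
  std::vector<double> t_time{102.0, 98.0, 101.0, 99.0}; // made-up microseconds per loop
  time_statistics ts;
  ts.statistics(t_time);
  std::cout << "mean " << ts.mean << " err " << ts.err
            << " min " << ts.min << " max " << ts.max << "\n";
}
```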
```diff
@@ -66,21 +40,17 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-  int Nloop=100;
+  int Nloop=10;
   int nmu=0;
-  int maxlat=24;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

-  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
-  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
-
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
+  std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  int maxlat=16;
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){

       std::vector<int> latt_size  ({lat*mpi_layout[0],
                                     lat*mpi_layout[1],
@@ -88,9 +58,6 @@ int main (int argc, char ** argv)
                                     lat*mpi_layout[3]});

       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;

       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
@@ -98,8 +65,8 @@ int main (int argc, char ** argv)
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

-      for(int i=0;i<Nloop;i++){
-        double start=usecond();
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){

         std::vector<CartesianCommunicator::CommsRequest_t> requests;
@@ -135,24 +102,18 @@ int main (int argc, char ** argv)
         }
         Grid.SendToRecvFromComplete(requests);
         Grid.Barrier();
-        double stop=usecond();
-        t_time[i] = stop-start; // microseconds
       }
+      double stop=usecond();

-      timestat.statistics(t_time);
-
-      double dbytes = bytes*ppn;
-      double xbytes = dbytes*2.0*ncomm;
+      double dbytes = bytes;
+      double xbytes = Nloop*dbytes*2.0*ncomm;
       double rbytes = xbytes;
       double bidibytes = xbytes+rbytes;

-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+      double time = stop-start; // microseconds
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }
```
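Both report formats divide byte counts by microseconds, so the printed figures are MB/s. A standalone sketch of the accounting on the incoming side of the diff (sizes and time are made up; 96 bytes per HalfSpinColourVectorD is an assumption, 2 spin x 3 colour complex doubles):

```cpp
// Sketch of the halo-exchange bandwidth accounting used above. Per loop
// iteration, each communicated dimension sends one face buffer in each of two
// directions, so unidirectional traffic is 2*ncomm*bytes per iteration;
// bidirectional counts the matching receives too. bytes/microsecond == MB/s.
#include <iostream>

int main() {
  int    lat = 16, Ls = 8, ncomm = 4;       // example lattice extent, 5th dim, comm dims
  double bytes = lat*lat*lat*Ls * 96.0;     // assumed 96 B per HalfSpinColourVectorD
  double time  = 5000.0;                    // pretend total loop time in microseconds
  int    Nloop = 10;
  double xbytes    = Nloop * bytes * 2.0 * ncomm; // unidirectional bytes moved
  double bidibytes = 2.0 * xbytes;                // send + receive
  std::cout << "MB/s uni  " << xbytes    / time << "\n"
            << "MB/s bidi " << bidibytes / time << "\n";
}
```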
```diff
@@ -160,17 +121,15 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
+  std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){

       std::vector<int> latt_size  ({lat,lat,lat,lat});

       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;

       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
@@ -179,8 +138,8 @@ int main (int argc, char ** argv)
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

-      for(int i=0;i<Nloop;i++){
-        double start=usecond();
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){

         ncomm=0;
         for(int mu=0;mu<4;mu++){
@@ -219,37 +178,30 @@ int main (int argc, char ** argv)
           }
         }
         Grid.Barrier();
-        double stop=usecond();
-        t_time[i] = stop-start; // microseconds
       }
+      double stop=usecond();

-      timestat.statistics(t_time);
-
-      double dbytes = bytes*ppn;
-      double xbytes = dbytes*2.0*ncomm;
+      double dbytes = bytes;
+      double xbytes = Nloop*dbytes*2.0*ncomm;
       double rbytes = xbytes;
       double bidibytes = xbytes+rbytes;

-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+      double time = stop-start;
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }

+  Nloop=100;
+
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
+  std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){

       std::vector<int> latt_size  ({lat*mpi_layout[0],
                                     lat*mpi_layout[1],
@@ -257,9 +209,6 @@ int main (int argc, char ** argv)
                                     lat*mpi_layout[3]});

       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;

       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -267,115 +216,16 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
        xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
        rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-       bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-       bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      double dbytes;
+      double start=usecond();
       for(int i=0;i<Nloop;i++){
-        double start=usecond();
-        dbytes=0;
-        ncomm=0;

         std::vector<CartesianCommunicator::CommsRequest_t> requests;
-        for(int mu=0;mu<4;mu++){
-          if (mpi_layout[mu]>1 ) {
-            ncomm++;
-            int comm_proc=1;
-            int xmit_to_rank;
-            int recv_from_rank;
-            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            dbytes+=
-              Grid.StencilSendToRecvFromBegin(requests,
-                                              (void *)&xbuf[mu][0],
-                                              xmit_to_rank,
-                                              (void *)&rbuf[mu][0],
-                                              recv_from_rank,
-                                              bytes);
-            comm_proc = mpi_layout[mu]-1;
-            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            dbytes+=
-              Grid.StencilSendToRecvFromBegin(requests,
-                                              (void *)&xbuf[mu+4][0],
-                                              xmit_to_rank,
-                                              (void *)&rbuf[mu+4][0],
-                                              recv_from_rank,
-                                              bytes);
-          }
-        }
-        Grid.StencilSendToRecvFromComplete(requests);
-        Grid.Barrier();
-        double stop=usecond();
-        t_time[i] = stop-start; // microseconds
-      }
-
-      timestat.statistics(t_time);
-
-      dbytes=dbytes*ppn;
-      double xbytes    = dbytes*0.5;
-      double rbytes    = dbytes*0.5;
-      double bidibytes = dbytes;
-
-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-    }
-  }
-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
-
-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
-
-      std::vector<int> latt_size  ({lat*mpi_layout[0],
-                                    lat*mpi_layout[1],
-                                    lat*mpi_layout[2],
-                                    lat*mpi_layout[3]});
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
-
-      std::vector<HalfSpinColourVectorD *> xbuf(8);
-      std::vector<HalfSpinColourVectorD *> rbuf(8);
-      Grid.ShmBufferFreeAll();
-      for(int d=0;d<8;d++){
-       xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-       rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-       bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-       bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-      }
-
-      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      double dbytes;
-      for(int i=0;i<Nloop;i++){
-        double start=usecond();
-
-        std::vector<CartesianCommunicator::CommsRequest_t> requests;
-        dbytes=0;
         ncomm=0;
         for(int mu=0;mu<4;mu++){
@@ -387,52 +237,123 @@ int main (int argc, char ** argv)
            int recv_from_rank;
            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-           dbytes+=
-             Grid.StencilSendToRecvFromBegin(requests,
-                                             (void *)&xbuf[mu][0],
-                                             xmit_to_rank,
-                                             (void *)&rbuf[mu][0],
-                                             recv_from_rank,
-                                             bytes);
-           Grid.StencilSendToRecvFromComplete(requests);
-           requests.resize(0);
+           Grid.StencilSendToRecvFromBegin(requests,
+                                           (void *)&xbuf[mu][0],
+                                           xmit_to_rank,
+                                           (void *)&rbuf[mu][0],
+                                           recv_from_rank,
+                                           bytes);

            comm_proc = mpi_layout[mu]-1;
            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-           dbytes+=
-             Grid.StencilSendToRecvFromBegin(requests,
-                                             (void *)&xbuf[mu+4][0],
-                                             xmit_to_rank,
-                                             (void *)&rbuf[mu+4][0],
-                                             recv_from_rank,
-                                             bytes);
-           Grid.StencilSendToRecvFromComplete(requests);
-           requests.resize(0);
+           Grid.StencilSendToRecvFromBegin(requests,
+                                           (void *)&xbuf[mu+4][0],
+                                           xmit_to_rank,
+                                           (void *)&rbuf[mu+4][0],
+                                           recv_from_rank,
+                                           bytes);
          }
        }
+       Grid.StencilSendToRecvFromComplete(requests);
        Grid.Barrier();
-       double stop=usecond();
-       t_time[i] = stop-start; // microseconds
       }
+      double stop=usecond();

-      timestat.statistics(t_time);
-
-      dbytes=dbytes*ppn;
-      double xbytes    = dbytes*0.5;
-      double rbytes    = dbytes*0.5;
-      double bidibytes = dbytes;
+      double dbytes = bytes;
+      double xbytes = Nloop*dbytes*2.0*ncomm;
+      double rbytes = xbytes;
+      double bidibytes = xbytes+rbytes;

-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+      double time = stop-start; // microseconds
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }
+
+  Nloop=100;
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+                                    lat*mpi_layout[1],
+                                    lat*mpi_layout[2],
+                                    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+       xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){
+
+        std::vector<CartesianCommunicator::CommsRequest_t> requests;
+        ncomm=0;
+        for(int mu=0;mu<4;mu++){
+          if (mpi_layout[mu]>1 ) {
+            ncomm++;
+            int comm_proc=1;
+            int xmit_to_rank;
+            int recv_from_rank;
+            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            Grid.StencilSendToRecvFromBegin(requests,
+                                            (void *)&xbuf[mu][0],
+                                            xmit_to_rank,
+                                            (void *)&rbuf[mu][0],
+                                            recv_from_rank,
+                                            bytes);
+            //	    Grid.StencilSendToRecvFromComplete(requests);
+            //	    requests.resize(0);
+
+            comm_proc = mpi_layout[mu]-1;
+            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            Grid.StencilSendToRecvFromBegin(requests,
+                                            (void *)&xbuf[mu+4][0],
+                                            xmit_to_rank,
+                                            (void *)&rbuf[mu+4][0],
+                                            recv_from_rank,
+                                            bytes);
+            Grid.StencilSendToRecvFromComplete(requests);
+            requests.resize(0);
+          }
+        }
+        Grid.Barrier();
+      }
+      double stop=usecond();
+
+      double dbytes = bytes;
+      double xbytes = Nloop*dbytes*2.0*ncomm;
+      double rbytes = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      double time = stop-start; // microseconds
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+    }
+  }
```
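The two STENCIL benchmarks differ mainly in when completion is forced: the concurrent variant posts every direction before completing, while the sequential variant completes each exchange before posting the next. A hedged plain-MPI sketch of the two patterns (this is not Grid's CartesianCommunicator API, just the underlying idea):

```cpp
// Plain-MPI sketch of the two halo-exchange patterns benchmarked above:
// "concurrent" posts all directions up front and waits once; "sequential"
// waits on each direction before posting the next.
#include <mpi.h>
#include <vector>

void halo_concurrent(char *xbuf[2], char *rbuf[2], int to[2], int from[2],
                     int bytes, MPI_Comm comm) {
  std::vector<MPI_Request> reqs;
  for (int d = 0; d < 2; d++) {                // post both directions first
    MPI_Request r;
    MPI_Irecv(rbuf[d], bytes, MPI_CHAR, from[d], 0, comm, &r); reqs.push_back(r);
    MPI_Isend(xbuf[d], bytes, MPI_CHAR, to[d],   0, comm, &r); reqs.push_back(r);
  }
  MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
}

void halo_sequential(char *xbuf[2], char *rbuf[2], int to[2], int from[2],
                     int bytes, MPI_Comm comm) {
  for (int d = 0; d < 2; d++) {                // one direction at a time
    MPI_Request reqs[2];
    MPI_Irecv(rbuf[d], bytes, MPI_CHAR, from[d], 0, comm, &reqs[0]);
    MPI_Isend(xbuf[d], bytes, MPI_CHAR, to[d],   0, comm, &reqs[1]);
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
  }
}
```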

benchmarks/Benchmark_dwf.cc

```diff
@@ -1,22 +1,28 @@
 /*************************************************************************************

     Grid physics library, www.github.com/paboyle/Grid

     Source file: ./benchmarks/Benchmark_dwf.cc

     Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>

     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation; either version 2 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License along
     with this program; if not, write to the Free Software Foundation, Inc.,
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

     See the full license in the file "LICENSE" in the top level distribution directory

 *************************************************************************************/
 /*  END LEGAL */
@@ -42,16 +48,16 @@ typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;

 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);

   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -65,66 +71,35 @@ int main (int argc, char ** argv)

   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});

-  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

   LatticeFermion src   (FGrid); random(RNG5,src);
-#if 0
-  src = zero;
-  {
-    std::vector<int> origin({0,0,0,latt4[2]-1,0});
-    SpinColourVectorF tmp;
-    tmp=zero;
-    tmp()(0)(0)=Complex(-2.0,0.0);
-    std::cout << " source site 0 " << tmp<<std::endl;
-    pokeSite(tmp,src,origin);
-  }
-#else
-  RealD N2 = 1.0/::sqrt(norm2(src));
-  src = src*N2;
-#endif
-
   LatticeFermion result(FGrid); result=zero;
   LatticeFermion    ref(FGrid);    ref=zero;
   LatticeFermion    tmp(FGrid);
   LatticeFermion    err(FGrid);

-  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
   LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
-  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-#if 0
-  Umu=1.0;
-  for(int mu=0;mu<Nd;mu++){
-    LatticeColourMatrix ttmp(UGrid);
-    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
-    //    if (mu !=2 ) ttmp = 0;
-    //    ttmp = ttmp* pow(10.0,mu);
-    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
-  }
-  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
-#endif
+  random(RNG4,Umu);

-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  // replicate across fifth dimension
   LatticeGaugeField Umu5d(FGrid);
-  std::vector<LatticeColourMatrix> U(4,FGrid);

+  // replicate across fifth dimension
   for(int ss=0;ss<Umu._grid->oSites();ss++){
     for(int s=0;s<Ls;s++){
       Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
     }
   }

+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<LatticeColourMatrix> U(4,FGrid);
   for(int mu=0;mu<Nd;mu++){
     U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
   }
-  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;

   if (1)
   {
```
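The replication loop above copies the 4d gauge field into every fifth-dimension slice using the flattened index Ls*ss+s, that is, the fifth dimension is innermost. A tiny standalone sketch of the index arithmetic (plain arrays and made-up sizes, not Grid types):

```cpp
// Index sketch for the Umu5d replication loop above: with the 5th dimension
// innermost, 4d outer site ss and 5th coordinate s flatten to Ls*ss + s.
#include <cassert>
#include <vector>

int main() {
  const int oSites4d = 4, Ls = 8;            // made-up sizes
  std::vector<double> Umu(oSites4d);         // stand-in for the 4d links
  std::vector<double> Umu5d(oSites4d * Ls);
  for (int ss = 0; ss < oSites4d; ss++) Umu[ss] = 10.0 + ss;
  for (int ss = 0; ss < oSites4d; ss++)
    for (int s = 0; s < Ls; s++)
      Umu5d[Ls * ss + s] = Umu[ss];          // replicate across the 5th dim
  assert(Umu5d[Ls * 2 + 3] == Umu[2]);       // slice s=3 of 4d site ss=2
  return 0;
}
```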
```diff
@@ -145,7 +120,8 @@ int main (int argc, char ** argv)
   RealD M5  =1.8;

   RealD NP = UGrid->_Nprocessors;
-  RealD NN = UGrid->NodeCount();
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
@@ -155,22 +131,15 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
   if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =1000;
+  int ncall =100;

   if (1) {
     FGrid->Barrier();
     Dw.ZeroCounters();
-    Dw.Dhop(src,result,0);
-    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
     double t0=usecond();
     for(int i=0;i<ncall;i++){
       __SSC_START;
@@ -184,55 +153,16 @@ int main (int argc, char ** argv)
     double flops=1344*volume*ncall;

     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
     err = ref-result;
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    /*
-    if(( norm2(err)>1.0e-4) ) {
-      std::cout << "RESULT\n " << result<<std::endl;
-      std::cout << "REF   \n " << ref <<std::endl;
-      std::cout << "ERR   \n " << err <<std::endl;
-      FGrid->Barrier();
-      exit(-1);
-    }
-    */
     assert (norm2(err)< 1.0e-4 );
     Dw.Report();
   }

-  DomainWallFermionRL DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  if (1) {
-    FGrid->Barrier();
-    DwH.ZeroCounters();
-    DwH.Dhop(src,result,0);
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      __SSC_START;
-      DwH.Dhop(src,result,0);
-      __SSC_STOP;
-    }
-    double t1=usecond();
-    FGrid->Barrier();
-
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
-
-    std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    err = ref-result;
-    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    assert (norm2(err)< 1.0e-3 );
-    DwH.Report();
-  }

   if (1)
   {
```
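Both sides of the diff rate Dhop at 1344 floating-point operations per 5d lattice site, and usecond() timestamps are in microseconds, so flops/(t1-t0) is already Mflop/s. A standalone sketch of that accounting (example sizes and a pretend timing):

```cpp
// Sketch of the Mflop/s accounting used in the benchmark above:
// total ops = 1344 per 5d site * volume * calls; dividing ops by a time
// measured in microseconds yields Mflop/s directly.
#include <iostream>

int main() {
  int latt4[4] = {16, 16, 16, 16};           // example 4d lattice
  const int Ls = 8, ncall = 100;             // example values
  double volume = Ls;
  for (int mu = 0; mu < 4; mu++) volume *= latt4[mu];
  double flops = 1344.0 * volume * ncall;    // ops over ncall Dhop applications
  double t0 = 0.0, t1 = 2.0e6;               // pretend 2 s, in microseconds
  std::cout << "mflop/s = " << flops / (t1 - t0) << std::endl;
}
```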
```diff
@@ -241,10 +171,6 @@ int main (int argc, char ** argv)
     std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
     if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
     if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-#ifdef GRID_OMP
-    if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-    if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
     if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
     if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
     if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
@@ -256,13 +182,21 @@ int main (int argc, char ** argv)
     LatticeFermion sresult(sFGrid);
     WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);

-    localConvert(src,ssrc);
+    for(int x=0;x<latt4[0];x++){
+      for(int y=0;y<latt4[1];y++){
+        for(int z=0;z<latt4[2];z++){
+          for(int t=0;t<latt4[3];t++){
+            for(int s=0;s<Ls;s++){
+              std::vector<int> site({s,x,y,z,t});
+              SpinColourVector tmp;
+              peekSite(tmp,src,site);
+              pokeSite(tmp,ssrc,site);
+            }}}}}
     std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
     FGrid->Barrier();
-    sDw.Dhop(ssrc,sresult,0);
-    sDw.ZeroCounters();
     double t0=usecond();
+    sDw.ZeroCounters();
     for(int i=0;i<ncall;i++){
       __SSC_START;
       sDw.Dhop(ssrc,sresult,0);
@@ -276,52 +210,46 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    //    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
     sDw.Report();

-    if(0){
-      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-        sDw.Dhop(ssrc,sresult,0);
-        PerformanceCounter Counter(i);
-        Counter.Start();
-        sDw.Dhop(ssrc,sresult,0);
-        Counter.Stop();
-        Counter.Report();
-      }
-    }
+    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;

     RealD sum=0;
+    for(int x=0;x<latt4[0];x++){
+      for(int y=0;y<latt4[1];y++){
+        for(int z=0;z<latt4[2];z++){
+          for(int t=0;t<latt4[3];t++){
+            for(int s=0;s<Ls;s++){
+              std::vector<int> site({s,x,y,z,t});
+              SpinColourVector normal, simd;
+              peekSite(normal,result,site);
+              peekSite(simd,sresult,site);
+              sum=sum+norm2(normal-simd);
+              if (norm2(normal-simd) > 1.0e-6 ) {
+                std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+                std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
+                std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
+              }
+            }}}}}
+    std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+    assert (sum< 1.0e-4 );

-    err=zero;
-    localConvert(sresult,err);
-    err = err - ref;
-    sum = norm2(err);
-    std::cout<<GridLogMessage<<" difference between normal ref and simd is "<<sum<<std::endl;
-    if(sum > 1.0e-4 ){
-      std::cout<< "sD REF\n " <<ref << std::endl;
-      std::cout<< "sD ERR \n " <<err <<std::endl;
-    }
-    //    assert(sum < 1.0e-4);
-
-    err=zero;
-    localConvert(sresult,err);
-    err = err - result;
-    sum = norm2(err);
-    std::cout<<GridLogMessage<<" difference between normal result and simd is "<<sum<<std::endl;
-    if(sum > 1.0e-4 ){
-      std::cout<< "sD REF\n " <<result << std::endl;
-      std::cout<< "sD ERR \n " << err <<std::endl;
-    }
-    assert(sum < 1.0e-4);
-
-    if(1){
-      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
-      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
-      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-#ifdef GRID_OMP
-      if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-      if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
-      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   )
-        std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
-        std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm )
-        std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+    if (1) {

       LatticeFermion sr_eo(sFGrid);
       LatticeFermion ssrc_e (sFrbGrid);
       LatticeFermion ssrc_o (sFrbGrid);
       LatticeFermion sr_e (sFrbGrid);
```
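The hunks above trade Grid's localConvert for explicit per-site peekSite/pokeSite loops, and validate the SIMD layout by accumulating a per-site difference norm. A standalone sketch of that comparison pattern, with plain vectors standing in for the two lattice layouts (the tolerances mirror the diff; everything else is made up):

```cpp
// Standalone sketch of the per-site validation loop above: accumulate
// ||normal - simd||^2 site by site and flag any site over tolerance.
#include <cstdio>
#include <vector>

int main() {
  const int sites = 1000;
  std::vector<double> normal(sites, 1.0), simd(sites, 1.0);
  simd[42] = 1.0 + 2.0e-3;                       // plant one small mismatch
  double sum = 0;
  for (int i = 0; i < sites; i++) {
    double d2 = (normal[i] - simd[i]) * (normal[i] - simd[i]);
    sum += d2;
    if (d2 > 1.0e-6) std::printf("site %d differs: %g\n", i, d2);
  }
  std::printf("total difference %g\n", sum);
  return sum < 1.0e-4 ? 0 : 1;                   // same acceptance threshold
}
```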
@ -329,30 +257,39 @@ int main (int argc, char ** argv)
pickCheckerboard(Even,ssrc_e,ssrc); pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc); pickCheckerboard(Odd,ssrc_o,ssrc);
// setCheckerboard(sr_eo,ssrc_o);
// setCheckerboard(sr_eo,ssrc_e); setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
sr_e = zero; sr_e = zero;
sr_o = zero; sr_o = zero;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
FGrid->Barrier(); FGrid->Barrier();
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
sDw.ZeroCounters(); sDw.ZeroCounters();
// sDw.stat.init("DhopEO"); sDw.stat.init("DhopEO");
double t0=usecond(); double t0=usecond();
for (int i = 0; i < ncall; i++) { for (int i = 0; i < ncall; i++) {
sDw.DhopEO(ssrc_o, sr_e, DaggerNo); sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier(); FGrid->Barrier();
// sDw.stat.print(); sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2; double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
sDw.Report(); sDw.Report();
sDw.DhopEO(ssrc_o,sr_e,DaggerNo); sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -361,26 +298,22 @@ int main (int argc, char ** argv)
pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult);
ssrc_e = ssrc_e - sr_e;
RealD error = norm2(ssrc_e);
+std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
ssrc_o = ssrc_o - sr_o;
error+= norm2(ssrc_o);
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
-if(( error>1.0e-4) ) {
+if(error>1.0e-4) {
setCheckerboard(ssrc,ssrc_o);
setCheckerboard(ssrc,ssrc_e);
-std::cout<< "DIFF\n " <<ssrc << std::endl;
-setCheckerboard(ssrc,sr_o);
-setCheckerboard(ssrc,sr_e);
-std::cout<< "CBRESULT\n " <<ssrc << std::endl;
-std::cout<< "RESULT\n " <<sresult<< std::endl;
+std::cout<< ssrc << std::endl;
}
-assert(error<1.0e-4);
}
}
if (1)
@@ -391,30 +324,25 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
ref = -0.5*ref;
}
-// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
-std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
-std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
+std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
err = ref-result;
-std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
-if((norm2(err)>1.0e-4)){
-std::cout<< "DAG RESULT\n " <<ref << std::endl;
-std::cout<< "DAG sRESULT\n " <<result << std::endl;
-std::cout<< "DAG ERR \n " << err <<std::endl;
-}
+std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
+assert(norm2(err)<1.0e-4);
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
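In the benchmark's conventions the naive reference loop above assembles the daggered hopping term

D_{\rm hop}^\dagger\,\psi(x) \;=\; -\tfrac{1}{2}\sum_{\mu=0}^{3}\Big[(1+\gamma_\mu)\,U_\mu(x)\,\psi(x+\hat\mu) \;+\; (1-\gamma_\mu)\,U_\mu^\dagger(x-\hat\mu)\,\psi(x-\hat\mu)\Big],

with the shifts written as mu+1 because dimension 0 of the five-dimensional lattice is the s-direction; this is why it is checked against Dw.Dhop(src,result,1), i.e. with the dagger flag set.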
@@ -422,24 +350,18 @@ int main (int argc, char ** argv)
LatticeFermion r_eo (FGrid);
-std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
-// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-#ifdef GRID_OMP
-if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
@@ -447,7 +369,6 @@ int main (int argc, char ** argv)
{
Dw.ZeroCounters();
FGrid->Barrier();
-Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -460,7 +381,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
-std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -476,20 +396,14 @@ int main (int argc, char ** argv)
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-if((norm2(err)>1.0e-4)){
-std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
-std::cout<< "Deo REF\n " <<result << std::endl;
-std::cout<< "Deo ERR \n " << err <<std::endl;
-}
+assert(norm2(err)<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
-//assert(norm2(src_e)<1.0e-4);
-//assert(norm2(src_o)<1.0e-4);
+assert(norm2(src_e)<1.0e-4);
+assert(norm2(src_o)<1.0e-4);
Grid_finalize();
}
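For orientation, the check asserted on above reconstructs the unpreconditioned operator from its two checkerboarded halves. A sketch of the logic as a fragment in the benchmark's own variables (the DhopOE call is assumed by symmetry with the DhopEO call shown, not quoted from the source):

// odd source -> even result, even source -> odd result, then recombine
Dw.DhopEO(src_o, r_e, DaggerNo);
Dw.DhopOE(src_e, r_o, DaggerNo);
Dw.Dhop  (src,   result, DaggerNo); // full operator on the full grid
setCheckerboard(r_eo, r_e);
setCheckerboard(r_eo, r_o);
err = r_eo - result;                // norm2(err) should be rounding-level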
@@ -66,8 +66,7 @@ int main (int argc, char ** argv)
Vec tsum; tsum = zero;
-GridParallelRNG pRNG(&Grid);
-pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
+GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
std::vector<double> stop(threads);
Vector<Vec> sum(threads);
@@ -78,7 +77,8 @@ int main (int argc, char ** argv)
}
double start=usecond();
-parallel_for(int t=0;t<threads;t++){
+PARALLEL_FOR_LOOP
+for(int t=0;t<threads;t++){
sum[t] = x[t]._odata[0];
for(int i=0;i<Nloop;i++){
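The parallel_for spelling on the old side of this hunk is a thin wrapper over the pragma form it replaces; roughly, and from memory of the 0.7.0-era lib/Threads.h rather than quoted from it:

#ifdef GRID_OMP
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#else
#define PARALLEL_FOR_LOOP
#endif
#define parallel_for PARALLEL_FOR_LOOP for

so parallel_for(int t=0;t<threads;t++){ and the two-line form diffed here compile to the same loop.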
@@ -55,8 +55,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-uint64_t lmax=64;
-#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
+uint64_t lmax=44;
+#define NLOOP (1*lmax*lmax*lmax*lmax/vol)
for(int lat=4;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
@@ -65,7 +65,7 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
@@ -100,7 +100,7 @@ int main (int argc, char ** argv)
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
@@ -173,7 +173,7 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
@@ -1,134 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_staggered.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(seeds);
// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
typename ImprovedStaggeredFermionR::ImplParams params;
FermionField src (&Grid); random(pRNG,src);
FermionField result(&Grid); result=zero;
FermionField ref(&Grid); ref=zero;
FermionField tmp(&Grid); tmp=zero;
FermionField err(&Grid); tmp=zero;
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
// Only one non-zero (y)
#if 0
Umu=zero;
Complex cone(1.0,0.0);
for(int nn=0;nn<Nd;nn++){
random(pRNG,U[nn]);
if(1) {
if (nn!=2) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
// else { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
else { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
}
PokeIndex<LorentzIndex>(Umu,U[nn],nn);
}
#endif
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
ref = zero;
/*
{ // Naive wilson implementation
ref = zero;
for(int mu=0;mu<Nd;mu++){
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
tmp = U[mu]*Cshift(src,mu,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
}
ref = -0.5*ref;
*/
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;
double t0=usecond();
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Grid_finalize();
}
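As a sanity check on the deleted benchmark's flop count, the formula in its timing code unpacks as

16 \times \big(3 \times (6+8+8)\big) + 15 \times 3 \times 2 \;=\; 16 \times 66 + 90 \;=\; 1146 \ \text{flops per site},

which is the 1146 the trailing comment refers to.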
@@ -35,9 +35,8 @@ using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
-#define LMAX (64)
-int Nloop=20;
+int Nloop=1000;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
@@ -51,12 +50,12 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-for(int lat=2;lat<=LMAX;lat+=2){
+for(int lat=2;lat<=32;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeColourMatrix z(&Grid);// random(pRNG,z);
LatticeColourMatrix x(&Grid);// random(pRNG,x);
@@ -83,13 +82,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-for(int lat=2;lat<=LMAX;lat+=2){
+for(int lat=2;lat<=32;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -114,13 +113,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-for(int lat=2;lat<=LMAX;lat+=2){
+for(int lat=2;lat<=32;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -145,13 +144,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-for(int lat=2;lat<=LMAX;lat+=2){
+for(int lat=2;lat<=32;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(seeds);
-// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+// pRNG.SeedRandomDevice();
LatticeFermion src (&Grid); random(pRNG,src);
LatticeFermion result(&Grid); result=zero;
@ -1,7 +1,11 @@
include Make.inc include Make.inc
bench-local: all simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
./Benchmark_su3
./Benchmark_memory_bandwidth EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
./Benchmark_wilson
./Benchmark_dwf --dslash-unroll libsimple_su3_test_a_SOURCES = simple_su3_test.cc
libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
libsimple_simd_test_a_SOURCES = simple_simd_test.cc
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'
+EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate
@@ -1,19 +1,16 @@
AC_PREREQ([2.63])
-AC_INIT([Grid], [0.7.0], [https://github.com/paboyle/Grid], [Grid])
+AC_INIT([Grid], [0.6.0], [https://github.com/paboyle/Grid], [Grid])
AC_CANONICAL_BUILD
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
-AM_INIT_AUTOMAKE([subdir-objects 1.13])
-AM_EXTRA_RECURSIVE_TARGETS([tests bench])
+AM_INIT_AUTOMAKE(subdir-objects)
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_SRCDIR([lib/Grid.h])
AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h])
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-################ Get git info
-#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
############### Checks for programs
-CXXFLAGS="-O3 $CXXFLAGS"
AC_PROG_CXX
AC_PROG_RANLIB
@@ -27,15 +24,12 @@ AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
[version of g++ that will compile the code])
-CXXFLAGS="-g $CXXFLAGS"
############### Checks for typedefs, structures, and compiler characteristics
AC_TYPE_SIZE_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
############### OpenMP
AC_OPENMP
ac_openmp=no
if test "${OPENMP_CXXFLAGS}X" != "X"; then
@@ -66,23 +60,16 @@ AC_ARG_WITH([mpfr],
[AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
############### FFTW3
AC_ARG_WITH([fftw],
[AS_HELP_STRING([--with-fftw=prefix],
[try this for a non-standard install prefix of the FFTW3 library])],
[AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
-############### LIME
-AC_ARG_WITH([lime],
-[AS_HELP_STRING([--with-lime=prefix],
-[try this for a non-standard install prefix of the LIME library])],
-[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
-[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
############### lapack
AC_ARG_ENABLE([lapack],
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
[ac_LAPACK=${enable_lapack}], [ac_LAPACK=no])
case ${ac_LAPACK} in
@@ -96,18 +83,6 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
-############### FP16 conversions
-AC_ARG_ENABLE([sfw-fp16],
-[AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
-[ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes])
-case ${ac_SFW_FP16} in
-yes)
-AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);;
-no);;
-*)
-AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
-esac
############### MKL
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -133,7 +108,7 @@ AC_ARG_WITH([hdf5],
############### first-touch
AC_ARG_ENABLE([numa],
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
[ac_NUMA=${enable_NUMA}],[ac_NUMA=no])
case ${ac_NUMA} in
@@ -159,8 +134,8 @@ if test "${ac_MKL}x" != "nox"; then
fi
AC_SEARCH_LIBS([__gmpf_init], [gmp],
[AC_SEARCH_LIBS([mpfr_init], [mpfr],
[AC_DEFINE([HAVE_LIBMPFR], [1],
[Define to 1 if you have the `MPFR' library])]
[have_mpfr=true], [AC_MSG_ERROR([MPFR library not found])])]
[AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library])]
@@ -169,7 +144,7 @@ AC_SEARCH_LIBS([__gmpf_init], [gmp],
if test "${ac_LAPACK}x" != "nox"; then
AC_SEARCH_LIBS([LAPACKE_sbdsdc], [lapack], [],
[AC_MSG_ERROR("LAPACK enabled but library not found")])
fi
AC_SEARCH_LIBS([fftw_execute], [fftw3],
[AC_SEARCH_LIBS([fftwf_execute], [fftw3f], [],
@@ -177,18 +152,6 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3],
[AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])]
[have_fftw=true])
-AC_SEARCH_LIBS([limeCreateReader], [lime],
-[AC_DEFINE([HAVE_LIME], [1], [Define to 1 if you have the `LIME' library])]
-[have_lime=true],
-[AC_MSG_WARN(C-LIME library was not found in your system.
-In order to use ILGG file format please install or provide the correct path to your installation
-Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
-AC_SEARCH_LIBS([crc32], [z],
-[AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-[have_zlib=true],
-[AC_MSG_ERROR(zlib library was not found in your system.)])
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
[have_hdf5=true]
@@ -213,26 +176,19 @@ case ${ax_cv_cxx_compiler_vendor} in
case ${ac_SIMD} in
SSE4)
AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
-case ${ac_SFW_FP16} in
-yes)
-SIMD_FLAGS='-msse4.2';;
-no)
-SIMD_FLAGS='-msse4.2 -mf16c';;
-*)
-AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);;
-esac;;
+SIMD_FLAGS='-msse4.2';;
AVX)
AC_DEFINE([AVX1],[1],[AVX intrinsics])
-SIMD_FLAGS='-mavx -mf16c';;
+SIMD_FLAGS='-mavx';;
AVXFMA4)
AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
-SIMD_FLAGS='-mavx -mfma4 -mf16c';;
+SIMD_FLAGS='-mavx -mfma4';;
AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
-SIMD_FLAGS='-mavx -mfma -mf16c';;
+SIMD_FLAGS='-mavx -mfma';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
-SIMD_FLAGS='-mavx2 -mfma -mf16c';;
+SIMD_FLAGS='-mavx2 -mfma';;
AVX512)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
@@ -341,7 +297,7 @@ case ${ac_COMMS} in
comms_type='shmem'
;;
*)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;;
esac
case ${ac_COMMS} in
@@ -365,7 +321,7 @@ AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
############### RNG selection
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\
[Select Random Number Generator to be used])],\
-[ac_RNG=${enable_rng}],[ac_RNG=sitmo])
+[ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
case ${ac_RNG} in
ranlux48)
@@ -378,7 +334,7 @@ case ${ac_RNG} in
AC_DEFINE([RNG_SITMO],[1],[RNG_SITMO] )
;;
*)
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
@@ -395,7 +351,7 @@ case ${ac_TIMERS} in
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
@@ -407,7 +363,7 @@ case ${ac_CHROMA} in
yes|no)
;;
*)
AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
;;
esac
@@ -428,65 +384,12 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
############### Ouput
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
-GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
-GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-GRID_LIBS=$LIBS
-GRID_SHORT_SHA=`git rev-parse --short HEAD`
-GRID_SHA=`git rev-parse HEAD`
-GRID_BRANCH=`git rev-parse --abbrev-ref HEAD`
AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_SUBST([AM_LDFLAGS])
-AC_SUBST([GRID_CXXFLAGS])
-AC_SUBST([GRID_LDFLAGS])
-AC_SUBST([GRID_LIBS])
-AC_SUBST([GRID_SHA])
-AC_SUBST([GRID_BRANCH])
-git_commit=`cd $srcdir && ./scripts/configure.commit`
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Summary of configuration for $PACKAGE v$VERSION
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------ GIT VERSION -------------------------------------
-$git_commit
------ PLATFORM ----------------------------------------
-architecture (build) : $build_cpu
-os (build) : $build_os
-architecture (target) : $target_cpu
-os (target) : $target_os
-compiler vendor : ${ax_cv_cxx_compiler_vendor}
-compiler version : ${ax_cv_gxx_version}
------ BUILD OPTIONS -----------------------------------
-SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
-Threading : ${ac_openmp}
-Communications type : ${comms_type}
-Default precision : ${ac_PRECISION}
-Software FP16 conversion : ${ac_SFW_FP16}
-RNG choice : ${ac_RNG}
-GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
-LAPACK : ${ac_LAPACK}
-FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
-LIME (ILDG support) : `if test "x$have_lime" = xtrue; then echo yes; else echo no; fi`
-HDF5 : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
-build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
------ BUILD FLAGS -------------------------------------
-CXXFLAGS:
-`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-LDFLAGS:
-`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-LIBS:
-`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
--------------------------------------------------------" > grid.configure.summary
-GRID_SUMMARY="`cat grid.configure.summary`"
-AM_SUBST_NOTMAKE([GRID_SUMMARY])
-AC_SUBST([GRID_SUMMARY])
-AC_CONFIG_FILES([grid-config], [chmod +x grid-config])
AC_CONFIG_FILES(Makefile)
AC_CONFIG_FILES(lib/Makefile)
AC_CONFIG_FILES(tests/Makefile)
@@ -497,15 +400,42 @@ AC_CONFIG_FILES(tests/forces/Makefile)
AC_CONFIG_FILES(tests/hadrons/Makefile)
AC_CONFIG_FILES(tests/hmc/Makefile)
AC_CONFIG_FILES(tests/solver/Makefile)
-AC_CONFIG_FILES(tests/smearing/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile)
-AC_CONFIG_FILES(tests/testu01/Makefile)
AC_CONFIG_FILES(benchmarks/Makefile)
AC_CONFIG_FILES(extras/Makefile)
AC_CONFIG_FILES(extras/Hadrons/Makefile)
AC_OUTPUT
-echo ""
-cat grid.configure.summary
-echo ""
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Summary of configuration for $PACKAGE v$VERSION
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----- PLATFORM ----------------------------------------
+architecture (build) : $build_cpu
+os (build) : $build_os
+architecture (target) : $target_cpu
+os (target) : $target_os
+compiler vendor : ${ax_cv_cxx_compiler_vendor}
+compiler version : ${ax_cv_gxx_version}
+----- BUILD OPTIONS -----------------------------------
+SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
+Threading : ${ac_openmp}
+Communications type : ${comms_type}
+Default precision : ${ac_PRECISION}
+RNG choice : ${ac_RNG}
+GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
+LAPACK : ${ac_LAPACK}
+FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
+HDF5 : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
+build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
+----- BUILD FLAGS -------------------------------------
+CXXFLAGS:
+`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+LDFLAGS:
+`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+LIBS:
+`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+-------------------------------------------------------" > config.summary
+echo ""
+cat config.summary
+echo ""
@@ -162,8 +162,7 @@ void Application::saveParameterFile(const std::string parameterFileName)
sizeString((size)*locVol_) << " (" << sizeString(size) << "/site)"
#define DEFINE_MEMPEAK \
-GeneticScheduler<unsigned int>::ObjFunc memPeak = \
-[this](const std::vector<unsigned int> &program)\
+auto memPeak = [this](const std::vector<unsigned int> &program)\
{\
unsigned int memPeak;\
bool msg;\
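The old side of this hunk names the functor type instead of using auto; schematically, and with the template parameters and return type of Hadrons' GeneticScheduler assumed rather than quoted:

#include <functional>
#include <vector>

template <typename T>
class GeneticScheduler {
public:
    // assumed shape of the objective-function typedef used by the macro
    typedef std::function<unsigned int(const std::vector<T> &)> ObjFunc;
};

// a named std::function type can be stored and passed around uniformly,
// which a deduced lambda type cannot
GeneticScheduler<unsigned int>::ObjFunc memPeak =
    [](const std::vector<unsigned int> &program) {
        return static_cast<unsigned int>(program.size()); // placeholder body
    };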
@@ -145,15 +145,6 @@ std::string typeName(void)
return typeName(typeIdPt<T>());
}
-// default writers/readers
-#ifdef HAVE_HDF5
-typedef Hdf5Reader CorrReader;
-typedef Hdf5Writer CorrWriter;
-#else
-typedef XmlReader CorrReader;
-typedef XmlWriter CorrWriter;
-#endif
END_HADRONS_NAMESPACE
#endif // Hadrons_Global_hpp_
@@ -29,20 +29,12 @@ See the full license in the file "LICENSE" in the top level distribution directory
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
-#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
-#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
-#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
#include <Grid/Hadrons/Modules/Quark.hpp>
@@ -48,8 +48,7 @@ public:
std::string, gauge,
unsigned int, Ls,
double , mass,
-double , M5,
-std::string , boundary);
+double , M5);
};
template <typename FImpl>
@@ -117,19 +116,14 @@ void TDWF<FImpl>::execute(void)
<< par().mass << ", M5= " << par().M5 << " and Ls= "
<< par().Ls << " using gauge field '" << par().gauge << "'"
<< std::endl;
-LOG(Message) << "Fermion boundary conditions: " << par().boundary
-<< std::endl;
env().createGrid(par().Ls);
auto &U = *env().template getObject<LatticeGaugeField>(par().gauge);
auto &g4 = *env().getGrid();
auto &grb4 = *env().getRbGrid();
auto &g5 = *env().getGrid(par().Ls);
auto &grb5 = *env().getRbGrid(par().Ls);
-std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
FMat *fMatPt = new DomainWallFermion<FImpl>(U, g5, grb5, g4, grb4,
-par().mass, par().M5,
-implParams);
+par().mass, par().M5);
env().setObject(getName(), fMatPt);
}
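The boundary parameter removed here is a space-separated list of phases, one per direction, parsed with strToVec. A fragment showing the intended input format (the value "1 1 1 -1", periodic in space and antiperiodic in time, is an illustrative assumption, not taken from the diff):

// hypothetical parameter value feeding the 0.7.0-side parsing shown above
std::string boundary = "1 1 1 -1";
std::vector<Complex> phases = strToVec<Complex>(boundary); // {1,1,1,-1}
typename DomainWallFermion<FImpl>::ImplParams implParams(phases);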
@@ -46,8 +46,7 @@ class WilsonPar: Serializable
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
std::string, gauge,
-double , mass,
-std::string, boundary);
+double , mass);
};
template <typename FImpl>
@@ -113,15 +112,10 @@ void TWilson<FImpl>::execute()
{
LOG(Message) << "Setting up TWilson fermion matrix with m= " << par().mass
<< " using gauge field '" << par().gauge << "'" << std::endl;
-LOG(Message) << "Fermion boundary conditions: " << par().boundary
-<< std::endl;
auto &U = *env().template getObject<LatticeGaugeField>(par().gauge);
auto &grid = *env().getGrid();
auto &gridRb = *env().getRbGrid();
-std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
-FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass,
-implParams);
+FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass);
env().setObject(getName(), fMatPt);
}
@@ -112,7 +112,7 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
<< " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
<< par().q3 << "'" << std::endl;
-CorrWriter writer(par().output);
+XmlWriter writer(par().output);
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
PropagatorField3 &q3 = *env().template getObject<PropagatorField3>(par().q2);
@@ -1,144 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/DiscLoop.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_DiscLoop_hpp_
#define Hadrons_DiscLoop_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* DiscLoop *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
class DiscLoopPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(DiscLoopPar,
std::string, q_loop,
Gamma::Algebra, gamma,
std::string, output);
};
template <typename FImpl>
class TDiscLoop: public Module<DiscLoopPar>
{
TYPE_ALIASES(FImpl,);
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
Gamma::Algebra, gamma,
std::vector<Complex>, corr);
};
public:
// constructor
TDiscLoop(const std::string name);
// destructor
virtual ~TDiscLoop(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(DiscLoop, TDiscLoop<FIMPL>, MContraction);
/******************************************************************************
* TDiscLoop implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TDiscLoop<FImpl>::TDiscLoop(const std::string name)
: Module<DiscLoopPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q_loop};
return in;
}
template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDiscLoop<FImpl>::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDiscLoop<FImpl>::execute(void)
{
LOG(Message) << "Computing disconnected loop contraction '" << getName()
<< "' using '" << par().q_loop << "' with " << par().gamma
<< " insertion." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q_loop = *env().template getObject<PropagatorField>(par().q_loop);
LatticeComplex c(env().getGrid());
Gamma gamma(par().gamma);
std::vector<TComplex> buf;
Result result;
c = trace(gamma*q_loop);
sliceSum(c, buf, Tp);
result.gamma = par().gamma;
result.corr.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
{
result.corr[t] = TensorRemove(buf[t]);
}
write(writer, "disc", result);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_DiscLoop_hpp_
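In equations, the deleted module computes the disconnected-loop correlator, a direct transcription of the trace/sliceSum lines above:

C(t) \;=\; \sum_{\vec{x}} \mathrm{Tr}\left[\, \Gamma \, q_{\rm loop}(\vec{x},t) \,\right].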
@@ -1,170 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Gamma3pt_hpp_
#define Hadrons_Gamma3pt_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
* 3pt contraction with gamma matrix insertion.
*
* Schematic:
*
* q2 q3
* /----<------*------<----¬
* / gamma \
* / \
* i * * f
* \ /
* \ /
* \----------->----------/
* q1
*
* trace(g5*q1*adj(q2)*g5*gamma*q3)
*/
/******************************************************************************
* Gamma3pt *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
class Gamma3ptPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Gamma3ptPar,
std::string, q1,
std::string, q2,
std::string, q3,
Gamma::Algebra, gamma,
std::string, output);
};
template <typename FImpl1, typename FImpl2, typename FImpl3>
class TGamma3pt: public Module<Gamma3ptPar>
{
TYPE_ALIASES(FImpl1, 1);
TYPE_ALIASES(FImpl2, 2);
TYPE_ALIASES(FImpl3, 3);
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
Gamma::Algebra, gamma,
std::vector<Complex>, corr);
};
public:
// constructor
TGamma3pt(const std::string name);
// destructor
virtual ~TGamma3pt(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(Gamma3pt, ARG(TGamma3pt<FIMPL, FIMPL, FIMPL>), MContraction);
/******************************************************************************
* TGamma3pt implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
TGamma3pt<FImpl1, FImpl2, FImpl3>::TGamma3pt(const std::string name)
: Module<Gamma3ptPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3};
return in;
}
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
{
LOG(Message) << "Computing 3pt contractions '" << getName() << "' using"
<< " quarks '" << par().q1 << "', '" << par().q2 << "' and '"
<< par().q3 << "', with " << par().gamma << " insertion."
<< std::endl;
CorrWriter writer(par().output);
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
PropagatorField3 &q3 = *env().template getObject<PropagatorField3>(par().q3);
LatticeComplex c(env().getGrid());
Gamma g5(Gamma::Algebra::Gamma5);
Gamma gamma(par().gamma);
std::vector<TComplex> buf;
Result result;
c = trace(g5*q1*adj(q2)*(g5*gamma)*q3);
sliceSum(c, buf, Tp);
result.gamma = par().gamma;
result.corr.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
{
result.corr[t] = TensorRemove(buf[t]);
}
write(writer, "gamma3pt", result);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Gamma3pt_hpp_
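The contraction in the deleted execute() corresponds to

C_3(t) \;=\; \sum_{\vec{x}} \mathrm{Tr}\left[\, \gamma_5\, q_1(x)\, q_2^\dagger(x)\, \gamma_5\, \Gamma\, q_3(x) \,\right],

i.e. the trace(g5*q1*adj(q2)*(g5*gamma)*q3) line, with the \gamma_5 factors supplying the backward q_2 propagator via \gamma_5-hermiticity.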
@ -6,10 +6,8 @@ Source file: extras/Hadrons/Modules/MContraction/Meson.hpp
Copyright (C) 2015 Copyright (C) 2015
Copyright (C) 2016 Copyright (C) 2016
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -38,39 +36,20 @@ See the full license in the file "LICENSE" in the top level distribution directo
BEGIN_HADRONS_NAMESPACE BEGIN_HADRONS_NAMESPACE
/*
Meson contractions
-----------------------------
* options:
- q1: input propagator 1 (string)
- q2: input propagator 2 (string)
- gammas: gamma products to insert at sink & source, pairs of gamma matrices
(space-separated strings) in angled brackets (i.e. <g_sink g_src>),
in a sequence (e.g. "<Gamma5 Gamma5><Gamma5 GammaT>").
Special values: "all" - perform all possible contractions.
- mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0."),
given as multiples of (2*pi) / L.
*/
 /******************************************************************************
  *                                TMeson                                      *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)

-typedef std::pair<Gamma::Algebra, Gamma::Algebra> GammaPair;
-
 class MesonPar: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(MesonPar,
                                     std::string, q1,
                                     std::string, q2,
-                                    std::string, gammas,
-                                    std::string, mom,
-                                    std::string, output);
+                                    std::string, output,
+                                    Gamma::Algebra, gammaSource,
+                                    Gamma::Algebra, gammaSink);
 };

 template <typename FImpl1, typename FImpl2>
@@ -82,10 +61,7 @@ public:
 class Result: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                    Gamma::Algebra, gamma_snk,
-                                    Gamma::Algebra, gamma_src,
-                                    std::vector<Complex>, corr);
+    GRID_SERIALIZABLE_CLASS_MEMBERS(Result, std::vector<Complex>, corr);
 };
 public:
     // constructor
@@ -95,7 +71,6 @@ public:
     // dependencies/products
     virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
-    virtual void parseGammaString(std::vector<GammaPair> &gammaList);
     // execution
     virtual void execute(void);
 };
@@ -128,31 +103,6 @@ std::vector<std::string> TMeson<FImpl1, FImpl2>::getOutput(void)
     return output;
 }

-template <typename FImpl1, typename FImpl2>
-void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
-{
-    gammaList.clear();
-    // Determine gamma matrices to insert at source/sink.
-    if (par().gammas.compare("all") == 0)
-    {
-        // Do all contractions.
-        for (unsigned int i = 1; i < Gamma::nGamma; i += 2)
-        {
-            for (unsigned int j = 1; j < Gamma::nGamma; j += 2)
-            {
-                gammaList.push_back(std::make_pair((Gamma::Algebra)i,
-                                                   (Gamma::Algebra)j));
-            }
-        }
-    }
-    else
-    {
-        // Parse individual contractions from input string.
-        gammaList = strToVec<GammaPair>(par().gammas);
-    }
-}
-
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2>
 void TMeson<FImpl1, FImpl2>::execute(void)
@@ -161,44 +111,21 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                  << std::endl;

-    CorrWriter             writer(par().output);
+    XmlWriter              writer(par().output);
     PropagatorField1       &q1 = *env().template getObject<PropagatorField1>(par().q1);
     PropagatorField2       &q2 = *env().template getObject<PropagatorField2>(par().q2);
     LatticeComplex         c(env().getGrid());
-    Gamma                  g5(Gamma::Algebra::Gamma5);
-    std::vector<GammaPair> gammaList;
+    Gamma                  gSrc(par().gammaSource), gSnk(par().gammaSink);
+    Gamma                  g5(Gamma::Algebra::Gamma5);
     std::vector<TComplex>  buf;
-    std::vector<Result>    result;
-    std::vector<Real>      p;
-
-    p = strToVec<Real>(par().mom);
-    LatticeComplex ph(env().getGrid()), coor(env().getGrid());
-    Complex i(0.0,1.0);
-    ph = zero;
-    for(unsigned int mu = 0; mu < env().getNd(); mu++)
-    {
-        LatticeCoordinate(coor, mu);
-        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
-    }
-    ph = exp((Real)(2*M_PI)*i*ph);
-
-    parseGammaString(gammaList);
-    result.resize(gammaList.size());
-    for (unsigned int i = 0; i < result.size(); ++i)
-    {
-        Gamma gSnk(gammaList[i].first);
-        Gamma gSrc(gammaList[i].second);
-        c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
-        sliceSum(c, buf, Tp);
-        result[i].gamma_snk = gammaList[i].first;
-        result[i].gamma_src = gammaList[i].second;
-        result[i].corr.resize(buf.size());
-        for (unsigned int t = 0; t < buf.size(); ++t)
-        {
-            result[i].corr[t] = TensorRemove(buf[t]);
-        }
-    }
+    Result                 result;
+
+    c = trace(gSnk*q1*adj(gSrc)*g5*adj(q2)*g5);
+    sliceSum(c, buf, Tp);
+    result.corr.resize(buf.size());
+    for (unsigned int t = 0; t < buf.size(); ++t)
+    {
+        result.corr[t] = TensorRemove(buf[t]);
+    }
     write(writer, "meson", result);
 }
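// For reference, the phase field built in the '-' version above implements
// ph(x) = exp(2*pi*i*sum_mu p[mu]*x[mu]/L[mu]), so the mom parameter counts
// integer units of the smallest lattice momentum 2*pi/L[mu]. A minimal
// standalone sketch of the same arithmetic (a hypothetical helper, not part
// of Grid or Hadrons):
#include <cmath>
#include <complex>
#include <vector>

std::complex<double> momentumPhase(const std::vector<double> &p,
                                   const std::vector<int>    &x,
                                   const std::vector<int>    &L)
{
    // accumulate p.x in lattice units, mirroring ph = ph + p[mu]*coor/L[mu]
    double arg = 0.;
    for (unsigned int mu = 0; mu < p.size(); ++mu)
    {
        arg += p[mu]*x[mu]/static_cast<double>(L[mu]);
    }
    // exp(2*pi*i*arg), mirroring ph = exp((Real)(2*M_PI)*i*ph)
    return std::exp(std::complex<double>(0., 2.*M_PI*arg));
}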

View File

@@ -1,114 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonian_hpp_
#define Hadrons_WeakHamiltonian_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonian *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
/*******************************************************************************
* Utilities for contractions involving the Weak Hamiltonian.
******************************************************************************/
//// Sum and store correlator.
#define MAKE_DIAG(exp, buf, res, n)\
sliceSum(exp, buf, Tp);\
res.name = (n);\
res.corr.resize(buf.size());\
for (unsigned int t = 0; t < buf.size(); ++t)\
{\
res.corr[t] = TensorRemove(buf[t]);\
}
//// Contraction of mu index: use 'mu' variable in exp.
#define SUM_MU(buf,exp)\
buf = zero;\
for (unsigned int mu = 0; mu < ndim; ++mu)\
{\
buf += exp;\
}
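// A sketch of how the two macros compose at a call site, taken from the
// Eye-type module further down (expbuf, corrbuf, result and the
// S_body/S_loop fields are defined there):
//
//     SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu]))       // sum the Lorentz index
//     MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S")  // timeslice sums into result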
enum
{
i_V = 0,
i_A = 1,
n_i = 2
};
class WeakHamiltonianPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WeakHamiltonianPar,
std::string, q1,
std::string, q2,
std::string, q3,
std::string, q4,
std::string, output);
};
#define MAKE_WEAK_MODULE(modname)\
class T##modname: public Module<WeakHamiltonianPar>\
{\
public:\
TYPE_ALIASES(FIMPL,)\
class Result: Serializable\
{\
public:\
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,\
std::string, name,\
std::vector<Complex>, corr);\
};\
public:\
/* constructor */ \
T##modname(const std::string name);\
/* destructor */ \
virtual ~T##modname(void) = default;\
/* dependency relation */ \
virtual std::vector<std::string> getInput(void);\
virtual std::vector<std::string> getOutput(void);\
/* setup */ \
virtual void setup(void);\
/* execution */ \
virtual void execute(void);\
std::vector<std::string> VA_label = {"V", "A"};\
};\
MODULE_REGISTER_NS(modname, T##modname, MContraction);
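// Each concrete module header below then reduces to a single invocation,
// e.g. MAKE_WEAK_MODULE(WeakHamiltonianEye), which declares
// TWeakHamiltonianEye with the Result type above and registers it with the
// MContraction module factory.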
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonian_hpp_

View File

@@ -1,137 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian current-current contractions, Eye-type.
*
* These contractions are generated by the Q1 and Q2 operators in the physical
* basis (see e.g. Fig 3 of arXiv:1507.03094).
*
* Schematics: q4 |
* /-<-¬ |
* / \ | q2 q3
* \ / | /----<------*------<----¬
* q2 \ / q3 | / /-*-¬ \
* /-----<-----* *-----<----¬ | / / \ \
* i * H_W * f | i * \ / q4 * f
* \ / | \ \->-/ /
* \ / | \ /
* \---------->---------/ | \----------->----------/
* q1 | q1
* |
* Saucer (S) | Eye (E)
*
* S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2])
* E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2])
*/
/******************************************************************************
* TWeakHamiltonianEye implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakHamiltonianEye::TWeakHamiltonianEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianEye::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian (Eye type) contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1);
PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2);
PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3);
PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
LatticeComplex expbuf(env().getGrid());
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_eye_diag);
unsigned int ndim = env().getNd();
PropagatorField tmp1(env().getGrid());
LatticeComplex tmp2(env().getGrid());
std::vector<PropagatorField> S_body(ndim, tmp1);
std::vector<PropagatorField> S_loop(ndim, tmp1);
std::vector<LatticeComplex> E_body(ndim, tmp2);
std::vector<LatticeComplex> E_loop(ndim, tmp2);
// Setup for S-type contractions.
for (int mu = 0; mu < ndim; ++mu)
{
S_body[mu] = MAKE_SE_BODY(q1, q2, q3, GammaL(Gamma::gmu[mu]));
S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]));
}
// Perform S-type contractions.
SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu]))
MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S")
// Recycle sub-expressions for E-type contractions.
for (unsigned int mu = 0; mu < ndim; ++mu)
{
E_body[mu] = trace(S_body[mu]);
E_loop[mu] = trace(S_loop[mu]);
}
// Perform E-type contractions.
SUM_MU(expbuf, E_body[mu]*E_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")
write(writer, "HW_Eye", result);
}

View File

@@ -1,58 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonianEye_hpp_
#define Hadrons_WeakHamiltonianEye_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonianEye *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
enum
{
S_diag = 0,
E_diag = 1,
n_eye_diag = 2
};
// Saucer and Eye subdiagram contractions.
#define MAKE_SE_BODY(Q_1, Q_2, Q_3, gamma) (Q_3*g5*Q_1*adj(Q_2)*g5*gamma)
#define MAKE_SE_LOOP(Q_loop, gamma) (Q_loop*gamma)
MAKE_WEAK_MODULE(WeakHamiltonianEye)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonianEye_hpp_

View File

@@ -1,139 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian current-current contractions, Non-Eye-type.
*
* These contractions are generated by the Q1 and Q2 operators in the physical
* basis (see e.g. Fig 3 of arXiv:1507.03094).
*
* Schematic:
* q2 q3 | q2 q3
* /--<--¬ /--<--¬ | /--<--¬ /--<--¬
* / \ / \ | / \ / \
* / \ / \ | / \ / \
* / \ / \ | / \ / \
* i * * H_W * f | i * * * H_W * f
* \ * | | \ / \ /
* \ / \ / | \ / \ /
* \ / \ / | \ / \ /
* \ / \ / | \-->--/ \-->--/
* \-->--/ \-->--/ | q1 q4
* q1 q4 |
* Connected (C) | Wing (W)
*
* C: trace(q1*adj(q2)*g5*gL[mu]*q3*adj(q4)*g5*gL[mu])
* W: trace(q1*adj(q2)*g5*gL[mu])*trace(q3*adj(q4)*g5*gL[mu])
*
*/
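// The only difference between the two topologies is where the spin-colour
// trace closes: C is a single trace running through both quark loops, while
// W is the product of two separately traced loops. The implementation below
// exploits this by building C_i_side_loop/C_f_side_loop once and recycling
// their traces as the W-type factors.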
/******************************************************************************
* TWeakHamiltonianNonEye implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakHamiltonianNonEye::TWeakHamiltonianNonEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianNonEye::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian (Non-Eye type) contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1);
PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2);
PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3);
PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
LatticeComplex expbuf(env().getGrid());
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_noneye_diag);
unsigned int ndim = env().getNd();
PropagatorField tmp1(env().getGrid());
LatticeComplex tmp2(env().getGrid());
std::vector<PropagatorField> C_i_side_loop(ndim, tmp1);
std::vector<PropagatorField> C_f_side_loop(ndim, tmp1);
std::vector<LatticeComplex> W_i_side_loop(ndim, tmp2);
std::vector<LatticeComplex> W_f_side_loop(ndim, tmp2);
// Setup for C-type contractions.
for (int mu = 0; mu < ndim; ++mu)
{
C_i_side_loop[mu] = MAKE_CW_SUBDIAG(q1, q2, GammaL(Gamma::gmu[mu]));
C_f_side_loop[mu] = MAKE_CW_SUBDIAG(q3, q4, GammaL(Gamma::gmu[mu]));
}
// Perform C-type contractions.
SUM_MU(expbuf, trace(C_i_side_loop[mu]*C_f_side_loop[mu]))
MAKE_DIAG(expbuf, corrbuf, result[C_diag], "HW_C")
// Recycle sub-expressions for W-type contractions.
for (unsigned int mu = 0; mu < ndim; ++mu)
{
W_i_side_loop[mu] = trace(C_i_side_loop[mu]);
W_f_side_loop[mu] = trace(C_f_side_loop[mu]);
}
// Perform W-type contractions.
SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")
write(writer, "HW_NonEye", result);
}

View File

@@ -1,57 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
#define Hadrons_WeakHamiltonianNonEye_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonianNonEye *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
enum
{
W_diag = 0,
C_diag = 1,
n_noneye_diag = 2
};
// Wing and Connected subdiagram contractions
#define MAKE_CW_SUBDIAG(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
MAKE_WEAK_MODULE(WeakHamiltonianNonEye)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonianNonEye_hpp_

View File

@@ -1,135 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian + current contractions, disconnected topology for neutral
* mesons.
*
* These contractions are generated by operators Q_1,...,10 of the dS=1 Weak
* Hamiltonian in the physical basis and an additional current J (see e.g.
* Fig 11 of arXiv:1507.03094).
*
* Schematic:
*
* q2 q4 q3
* /--<--¬ /---<--¬ /---<--¬
* / \ / \ / \
* i * * H_W | J * * f
* \ / \ / \ /
* \--->---/ \-------/ \------/
* q1
*
* options
* - q1: input propagator 1 (string)
* - q2: input propagator 2 (string)
* - q3: input propagator 3 (string), assumed to be sequential propagator
* - q4: input propagator 4 (string), assumed to be a loop
*
* type 1: trace(q1*adj(q2)*g5*gL[mu]*loop*gL[mu])*trace(q3*g5)
* type 2: trace(q1*adj(q2)*g5*gL[mu])*trace(loop*gL[mu])*trace(q3*g5)
*/
/*******************************************************************************
* TWeakNeutral4ptDisc implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakNeutral4ptDisc::TWeakNeutral4ptDisc(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakNeutral4ptDisc::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian neutral disconnected contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1);
PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2);
PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3);
PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
LatticeComplex expbuf(env().getGrid());
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_neut_disc_diag);
unsigned int ndim = env().getNd();
PropagatorField tmp(env().getGrid());
std::vector<PropagatorField> meson(ndim, tmp);
std::vector<PropagatorField> loop(ndim, tmp);
LatticeComplex curr(env().getGrid());
// Setup for type 1 contractions.
for (int mu = 0; mu < ndim; ++mu)
{
meson[mu] = MAKE_DISC_MESON(q1, q2, GammaL(Gamma::gmu[mu]));
loop[mu] = MAKE_DISC_LOOP(q4, GammaL(Gamma::gmu[mu]));
}
curr = MAKE_DISC_CURR(q3, GammaL(Gamma::Algebra::Gamma5));
// Perform type 1 contractions.
SUM_MU(expbuf, trace(meson[mu]*loop[mu]))
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_1_diag], "HW_disc0_1")
// Perform type 2 contractions.
SUM_MU(expbuf, trace(meson[mu])*trace(loop[mu]))
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")
write(writer, "HW_disc0", result);
}

View File

@@ -1,59 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
#define Hadrons_WeakNeutral4ptDisc_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakNeutral4ptDisc *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
enum
{
neut_disc_1_diag = 0,
neut_disc_2_diag = 1,
n_neut_disc_diag = 2
};
// Neutral 4pt disconnected subdiagram contractions.
#define MAKE_DISC_MESON(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
#define MAKE_DISC_LOOP(Q_LOOP, gamma) (Q_LOOP*gamma)
#define MAKE_DISC_CURR(Q_c, gamma) (trace(Q_c*gamma))
MAKE_WEAK_MODULE(WeakNeutral4ptDisc)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakNeutral4ptDisc_hpp_

View File

@@ -65,7 +65,7 @@ void TLoad::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TLoad::execute(void)
 {
-    FieldMetaData header;
+    NerscField    header;
     std::string   fileName = par().file + "."
                              + std::to_string(env().getTrajectory());
@@ -74,5 +74,5 @@ void TLoad::execute(void)
     LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
     NerscIO::readConfiguration(U, header, fileName);
     LOG(Message) << "NERSC header:" << std::endl;
-    dump_meta_data(header, LOG(Message));
+    dump_nersc_header(header, LOG(Message));
 }

View File

@@ -1,132 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
Copyright (C) 2016
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_NoiseLoop_hpp_
#define Hadrons_NoiseLoop_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Noise loop propagator
-----------------------------
* loop_x = q_x * adj(eta_x)
* options:
- q = Result of inversion on noise source.
- eta = noise source.
*/
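// Why this estimates a loop: for noise normalised so that the expectation of
// eta_x*adj(eta_y) is delta_xy, the inversion q = M^{-1}*eta gives an
// expectation of q_x*adj(eta_x) equal to M^{-1}_xx, i.e. the diagonal of the
// all-to-all propagator. Averaging over independent hits reduces the
// stochastic error; a hedged sketch (nHits and loops[] are hypothetical, not
// fields of this module):
PropagatorField loopAvg(env().getGrid());

loopAvg = zero;
for (unsigned int h = 0; h < nHits; ++h)
{
    loopAvg += loops[h];       // loops[h] = q_h*adj(eta_h)
}
loopAvg = (1./nHits)*loopAvg;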
/******************************************************************************
* NoiseLoop *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MLoop)
class NoiseLoopPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(NoiseLoopPar,
std::string, q,
std::string, eta);
};
template <typename FImpl>
class TNoiseLoop: public Module<NoiseLoopPar>
{
public:
TYPE_ALIASES(FImpl,);
public:
// constructor
TNoiseLoop(const std::string name);
// destructor
virtual ~TNoiseLoop(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(NoiseLoop, TNoiseLoop<FIMPL>, MLoop);
/******************************************************************************
* TNoiseLoop implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TNoiseLoop<FImpl>::TNoiseLoop(const std::string name)
: Module<NoiseLoopPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TNoiseLoop<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().eta};
return in;
}
template <typename FImpl>
std::vector<std::string> TNoiseLoop<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TNoiseLoop<FImpl>::setup(void)
{
env().template registerLattice<PropagatorField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TNoiseLoop<FImpl>::execute(void)
{
PropagatorField &loop = *env().template createLattice<PropagatorField>(getName());
PropagatorField &q = *env().template getObject<PropagatorField>(par().q);
PropagatorField &eta = *env().template getObject<PropagatorField>(par().eta);
loop = q*adj(eta);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_NoiseLoop_hpp_

View File

@@ -6,7 +6,6 @@ Source file: extras/Hadrons/Modules/MSource/SeqGamma.hpp
 Copyright (C) 2015
 Copyright (C) 2016
-Copyright (C) 2017

 Author: Antonin Portelli <antonin.portelli@me.com>
@@ -150,9 +149,9 @@ void TSeqGamma<FImpl>::execute(void)
     for(unsigned int mu = 0; mu < env().getNd(); mu++)
     {
         LatticeCoordinate(coor, mu);
-        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
+        ph = ph + p[mu]*coor;
     }
-    ph = exp((Real)(2*M_PI)*i*ph);
+    ph = exp(i*ph);
     LatticeCoordinate(t, Tp);
     src = where((t >= par().tA) and (t <= par().tB), ph*(g*q), 0.*q);
 }
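// The version with the explicit 2*pi/L_mu factors (the '-' lines here) makes
// the mom parameter dimensionless in integer lattice units: p_mu = 1 is the
// smallest momentum 2*pi/L_mu allowed on a periodic lattice of extent L_mu.
// The bare exp(i*ph) variant instead expects momenta already multiplied by
// 2*pi/L_mu.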

View File

@@ -1,147 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/Wall.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WallSource_hpp_
#define Hadrons_WallSource_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Wall source
-----------------------------
* src_x = delta(x_3 - tW) * exp(i x.mom)
* options:
- tW: source timeslice (integer)
- mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0.")
*/
/******************************************************************************
* Wall *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSource)
class WallPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WallPar,
unsigned int, tW,
std::string, mom);
};
template <typename FImpl>
class TWall: public Module<WallPar>
{
public:
TYPE_ALIASES(FImpl,);
public:
// constructor
TWall(const std::string name);
// destructor
virtual ~TWall(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource);
/******************************************************************************
* TWall implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWall<FImpl>::TWall(const std::string name)
: Module<WallPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWall<FImpl>::setup(void)
{
env().template registerLattice<PropagatorField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWall<FImpl>::execute(void)
{
LOG(Message) << "Generating wall source at t = " << par().tW
<< " with momentum " << par().mom << std::endl;
PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
Lattice<iScalar<vInteger>> t(env().getGrid());
LatticeComplex ph(env().getGrid()), coor(env().getGrid());
std::vector<Real> p;
Complex i(0.0,1.0);
p = strToVec<Real>(par().mom);
ph = zero;
for(unsigned int mu = 0; mu < Nd; mu++)
{
LatticeCoordinate(coor, mu);
ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
LatticeCoordinate(t, Tp);
src = 1.;
src = where((t == par().tW), src*ph, 0.*src);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WallSource_hpp_

View File

@@ -173,7 +173,7 @@ void TQuark<FImpl>::execute(void)
                 *env().template getObject<PropagatorField>(getName());

         axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
-        axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
+        axpby_ssp_pplus(sol, 0., sol, 1., sol, 0, Ls_-1);
         ExtractSlice(tmp, sol, 0, 0);
         FermToProp(p4d, tmp, s, c);
     }
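// For context, these lines assemble the physical 4d propagator from the
// domain-wall boundaries, q4d = P_-*psi(s=0) + P_+*psi(s=Ls-1) with
// P_+- = (1 +- g5)/2: the pminus call stores the P_- piece in s-slice 0, so
// the pplus call must accumulate onto it with coefficient 1; a coefficient
// of 0 overwrites the P_- piece and drops half of the propagator.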

View File

@@ -1,7 +1,4 @@
 modules_cc =\
-  Modules/MContraction/WeakHamiltonianEye.cc \
-  Modules/MContraction/WeakHamiltonianNonEye.cc \
-  Modules/MContraction/WeakNeutral4ptDisc.cc \
   Modules/MGauge/Load.cc \
   Modules/MGauge/Random.cc \
   Modules/MGauge/Unit.cc
@@ -10,21 +7,13 @@ modules_hpp =\
   Modules/MAction/DWF.hpp \
   Modules/MAction/Wilson.hpp \
   Modules/MContraction/Baryon.hpp \
-  Modules/MContraction/DiscLoop.hpp \
-  Modules/MContraction/Gamma3pt.hpp \
   Modules/MContraction/Meson.hpp \
-  Modules/MContraction/WeakHamiltonian.hpp \
-  Modules/MContraction/WeakHamiltonianEye.hpp \
-  Modules/MContraction/WeakHamiltonianNonEye.hpp \
-  Modules/MContraction/WeakNeutral4ptDisc.hpp \
   Modules/MGauge/Load.hpp \
   Modules/MGauge/Random.hpp \
   Modules/MGauge/Unit.hpp \
-  Modules/MLoop/NoiseLoop.hpp \
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \
-  Modules/MSource/Wall.hpp \
   Modules/MSource/Z2.hpp \
   Modules/Quark.hpp

View File

@@ -20,17 +20,4 @@ The simple testcase in this directory is the submitted bug report that encapsulates the
 problem. The test case works with icpc and with clang++, but fails consistently on g++
 current variants.

 Peter
-
-************
-
-Second GCC bug reported, see Issue 100.
-
-https://wandbox.org/permlink/tzssJza6R9XnqANw
-
-https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80652
-
-Getting Travis fails under gcc-5 for Test_simd, now that I added more comprehensive testing to the
-CI test suite. The limitations of Travis runtime limits & weak cores are being shown.
-
-Travis uses 5.4.1 for g++-5.

View File

@@ -1,86 +0,0 @@
#! /bin/sh
prefix=@prefix@
exec_prefix=@exec_prefix@
includedir=@includedir@
usage()
{
cat <<EOF
Usage: grid-config [OPTION]
Known values for OPTION are:
--prefix show Grid installation prefix
--cxxflags print pre-processor and compiler flags
--ldflags print library linking flags
--libs print library linking information
--summary print full build summary
--help display this help and exit
--version output version information
--git print git revision
EOF
exit $1
}
if test $# -eq 0; then
usage 1
fi
cflags=false
libs=false
while test $# -gt 0; do
case "$1" in
-*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
*) optarg= ;;
esac
case "$1" in
--prefix)
echo $prefix
;;
--version)
echo @VERSION@
exit 0
;;
--git)
echo "@GRID_BRANCH@ @GRID_SHA@"
exit 0
;;
--help)
usage 0
;;
--cxxflags)
echo @GRID_CXXFLAGS@
;;
--ldflags)
echo @GRID_LDFLAGS@
;;
--libs)
echo @GRID_LIBS@
;;
--summary)
echo ""
echo "@GRID_SUMMARY@"
echo ""
;;
*)
usage
exit 1
;;
esac
shift
done
exit 0
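# Typical pkg-config-style use:
#   g++ `grid-config --cxxflags` -o app app.cc `grid-config --ldflags` `grid-config --libs`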

View File

@@ -39,17 +39,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>

 #include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradientShifted.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>

 // Lanczos support
-//#include <Grid/algorithms/iterative/MatrixUtils.h>
+#include <Grid/algorithms/iterative/MatrixUtils.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/CoarsenedMatrix.h>
-#include <Grid/algorithms/FFT.h>

 // Eigen/lanczos
 // EigCg

View File

@@ -1,7 +1,7 @@
-#include <Grid/GridCore.h>
+#include <Grid/Grid.h>

 namespace Grid {
@@ -13,10 +13,9 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {

   if (bytes < 4096 ) return NULL;

-#ifdef GRID_OMP
+#ifdef _OPENMP
   assert(omp_in_parallel()==0);
 #endif

   void * ret = NULL;
   int v = -1;

View File

@@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/DisableWarnings.h
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H
//disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488
#endif

View File

@@ -38,12 +38,52 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_H
 #define GRID_H

-#include <Grid/GridCore.h>
-#include <Grid/GridQCDcore.h>
-#include <Grid/qcd/action/Action.h>
-#include <Grid/qcd/utils/GaugeFix.h>
-#include <Grid/qcd/smearing/Smearing.h>
-#include <Grid/parallelIO/MetaData.h>
-#include <Grid/qcd/hmc/HMC_aggregate.h>
+///////////////////
+// Std C++ dependencies
+///////////////////
+#include <cassert>
+#include <complex>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <functional>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <ctime>
+#include <sys/time.h>
+#include <chrono>
+
+///////////////////
+// Grid headers
+///////////////////
+#include "Config.h"
+#include <Grid/Timer.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Log.h>
+#include <Grid/AlignedAllocator.h>
+#include <Grid/Simd.h>
+#include <Grid/serialisation/Serialisation.h>
+#include <Grid/Threads.h>
+#include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
+#include <Grid/Communicator.h>
+#include <Grid/Cartesian.h>
+#include <Grid/Tensors.h>
+#include <Grid/Lattice.h>
+#include <Grid/Cshift.h>
+#include <Grid/Stencil.h>
+#include <Grid/Algorithms.h>
+#include <Grid/parallelIO/BinaryIO.h>
+#include <Grid/FFT.h>
+
+#include <Grid/qcd/QCD.h>
+#include <Grid/parallelIO/NerscIO.h>
+#include <Grid/qcd/hmc/NerscCheckpointer.h>
+#include <Grid/qcd/hmc/HmcRunner.h>

 #endif
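// Net effect of this hunk: the older Grid.h (the '+' side) pulls every
// standard and Grid header into one translation unit, while the newer layout
// (the '-' side) delegates to GridCore.h and GridQCDcore.h so applications
// include only the layers they need.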

View File

@@ -1,29 +0,0 @@
#ifndef GRID_STD_H
#define GRID_STD_H
///////////////////
// Std C++ dependencies
///////////////////
#include <cassert>
#include <complex>
#include <vector>
#include <string>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <zlib.h>
///////////////////
// Grid config
///////////////////
#include "Config.h"
#endif /* GRID_STD_H */

View File

@@ -1,9 +0,0 @@
#pragma once
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#include <Grid/Eigen/Dense>
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif

View File

@@ -1,6 +1,6 @@
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

 Source file: ./lib/Init.cc
@@ -36,20 +36,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <stdint.h>
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <signal.h>
 #include <iostream>
 #include <iterator>
+#include <Grid/Grid.h>
 #include <algorithm>
 #include <iterator>
 #include <cstdlib>
 #include <memory>
-
-#include <Grid/Grid.h>
-#include <Grid/util/CompilerCompatible.h>

 #include <fenv.h>
 #ifdef __APPLE__
@@ -95,14 +92,14 @@ const std::vector<int> GridDefaultSimd(int dims,int nsimd)
       if ( nn>=2) {
         layout[d]=2;
         nn/=2;
       } else {
         layout[d]=1;
       }
     }
     assert(nn==1);
     return layout;
 }
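// In words: GridDefaultSimd spreads the nsimd vector lanes over the lattice
// dimensions in factors of two, e.g. nsimd = 8 on a four-dimensional grid
// splits three of the four directions by 2 and leaves one unsplit; the
// assert(nn==1) guarantees every lane has been assigned.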
 ////////////////////////////////////////////////////////////
 // Command line parsing assist for stock controls
 ////////////////////////////////////////////////////////////
@@ -146,7 +143,7 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
     vec.push_back(i);
     if(std::ispunct(ss.peek()))
       ss.ignore();
   }
   return;
 }
@@ -222,59 +219,8 @@ void Grid_init(int *argc,char ***argv)
     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
   }

-  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
-    Grid_debug_handler_init();
-  }
-
   CartesianCommunicator::Init(argc,argv);

-  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
-    Grid_quiesce_nodes();
-  } else {
-    FILE *fp;
-    std::ostringstream fname;
-    fname<<"Grid.stdout.";
-    fname<<CartesianCommunicator::RankWorld();
-    fp=freopen(fname.str().c_str(),"w",stdout);
-    assert(fp!=(FILE *)NULL);
-  }
-
-  ////////////////////////////////////
-  // Banner
-  ////////////////////////////////////
-
-  if ( CartesianCommunicator::RankWorld() == 0 ) {
-    std::cout <<std::endl;
-    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
-    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
-    std::cout << "__|_ | | | | | | | | | | | | _|__"<<std::endl;
-    std::cout << "__|_ _|__"<<std::endl;
-    std::cout << "__|_ GGGG RRRR III DDDD _|__"<<std::endl;
-    std::cout << "__|_ G R R I D D _|__"<<std::endl;
-    std::cout << "__|_ G R R I D D _|__"<<std::endl;
-    std::cout << "__|_ G GG RRRR I D D _|__"<<std::endl;
-    std::cout << "__|_ G G R R I D D _|__"<<std::endl;
-    std::cout << "__|_ GGGG R R III DDDD _|__"<<std::endl;
-    std::cout << "__|_ _|__"<<std::endl;
-    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
-    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
-    std::cout << " | | | | | | | | | | | | | | "<<std::endl;
-    std::cout << std::endl;
-    std::cout << std::endl;
-    std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
-    std::cout << std::endl;
-    std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
-    std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
-    std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
-    std::cout << "(at your option) any later version."<<std::endl;
-    std::cout << std::endl;
-    std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
-    std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
-    std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
-    std::cout << "GNU General Public License for more details."<<std::endl;
-    std::cout << std::endl;
-  }
-
   ////////////////////////////////////
   // Logging
   ////////////////////////////////////
@@ -284,6 +230,9 @@ void Grid_init(int *argc,char ***argv)
   GridCmdOptionCSL(defaultLog,logstreams);
   GridLogConfigure(logstreams);

+  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
+    Grid_quiesce_nodes();
+  }
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
     arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
@@ -299,74 +248,101 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
     std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
     std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
     std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+    std::cout<<GridLogMessage<<" --log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
     std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
     std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
     std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
     std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Performance:"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
-    std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
-    std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
     std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
     std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
     std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     exit(EXIT_SUCCESS);
   }

+  ////////////////////////////////////
+  // Banner
+  ////////////////////////////////////
+  std::string COL_RED    = GridLogColours.colour["RED"];
+  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
+  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
+  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
+  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
+  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
+  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
+
+  std::cout <<std::endl;
+  std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ | | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" RRRR "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_PURPLE<<" _|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_PURPLE<<" _|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_PURPLE<<" _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G GG "<<COL_RED<<" RRRR "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_GREEN <<" _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_GREEN <<" _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" R R "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_GREEN <<" _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_BLUE << " | | | | "<< "| | | "<<COL_GREEN <<" | | |"<< " | | | | "<<std::endl;
+  std::cout << std::endl;
+  std::cout << std::endl;
+  std::cout <<COL_YELLOW<< std::endl;
+  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
+  std::cout << std::endl;
+  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
+  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
+  std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
+  std::cout << "(at your option) any later version."<<std::endl;
+  std::cout << std::endl;
+  std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
+  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
+  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
+  std::cout << "GNU General Public License for more details."<<std::endl;
+  std::cout << COL_BACKGROUND <<std::endl;
+  std::cout << std::endl;
+
   ////////////////////////////////////
   // Debug and performance options
   ////////////////////////////////////
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
+    Grid_debug_handler_init();
+  }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
-    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
-    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
-    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
-    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
-  } else {
-    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-  }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
     GridLogTimestamp(0);
   } else {
     GridLogTimestamp(1);
   }
@@ -374,7 +350,7 @@ void Grid_init(int *argc,char ***argv)
		    Grid_default_latt,
		    Grid_default_mpi);
-  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
@@ -390,39 +366,30 @@ void Grid_init(int *argc,char ***argv)
  Grid_is_initialised = 1;
}
void Grid_finalize(void)
{
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
  MPI_Finalize();
  Grid_unquiesce_nodes();
#endif
#if defined (GRID_COMMS_SHMEM)
  shmem_finalize();
#endif
}
void GridLogLayout() {
  std::cout << GridLogMessage << "Grid Layout\n";
  std::cout << GridLogMessage << "\tGlobal lattice size : "<< GridCmdVectorIntToString(GridDefaultLatt()) << std::endl;
  std::cout << GridLogMessage << "\tOpenMP threads : "<< GridThread::GetThreads() <<std::endl;
  std::cout << GridLogMessage << "\tMPI tasks : "<< GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
}
void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{
-  fprintf(stderr,"Caught signal %d\n",si->si_signo);
-  fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
-  fprintf(stderr," code %d\n",si->si_code);
+  printf("Caught signal %d\n",si->si_signo);
+  printf(" mem address %llx\n",(unsigned long long)si->si_addr);
+  printf(" code %d\n",si->si_code);
  // Linux/Posix
#ifdef __linux__
  // And x86 64bit
#ifdef __x86_64__
  ucontext_t * uc= (ucontext_t *)ptr;
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
-  fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
+  printf(" instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
  REG(rdi);
  REG(rsi);
@@ -445,11 +412,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  REG(r15);
#endif
#endif
-  fflush(stderr);
-  BACKTRACEFP(stderr);
-  fprintf(stderr,"Called backtrace\n");
-  fflush(stdout);
-  fflush(stderr);
+  BACKTRACE();
  exit(0);
  return;
};
@@ -462,12 +425,9 @@ void Grid_debug_handler_init(void)
  sa.sa_flags = SA_SIGINFO;
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);
  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
  sigaction(SIGFPE,&sa,NULL);
  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);
}
}
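Taken together, the option parsing and the init/finalise routines above define the library's start-up contract. A minimal driver sketch (an illustration assembled from functions visible in this diff, not code from either commit):

#include <Grid/Grid.h>

int main(int argc, char **argv) {
  Grid::Grid_init(&argc, &argv);   // consumes Grid flags such as --dslash-asm --comms-overlap --lebesgue
  // ... lattice application body ...
  Grid::Grid_finalize();           // MPI_Finalize()/shmem_finalize(), depending on the comms build
  return 0;
}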

View File

@@ -1,6 +1,6 @@
/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/Init.h
@@ -46,7 +46,6 @@ namespace Grid {
  const int &GridThreads(void) ;
  void GridSetThreads(int t) ;
  void GridLogTimestamp(int);
  void GridLogLayout();
  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);

View File

@@ -29,11 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
-#include <Grid/GridCore.h>
+#include <Grid/Grid.h>
#include <Grid/util/CompilerCompatible.h>
#include <cxxabi.h>
#include <memory>
namespace Grid {

View File

@@ -110,8 +110,8 @@ public:
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
    if ( log.active ) {
-      stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
-      stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
+      stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : ";
+      stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : ";
      if ( log.timestamp ) {
        StopWatch.Stop();
        GridTime now = StopWatch.Elapsed();

BIN
lib/Old/Endeavour.tgz Normal file

Binary file not shown.

lib/Old/Tensor_peek.h Normal file
View File

@@ -0,0 +1,154 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Old/Tensor_peek.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MATH_PEEK_H
#define GRID_MATH_PEEK_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////
// Peek on a specific index; returns a scalar in that index, tensor inherits rest
//////////////////////////////////////////////////////////////////////////////
// If we hit the right index, return scalar with no further recursion
//template<int Level> inline ComplexF peekIndex(const ComplexF arg) { return arg;}
//template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;}
//template<int Level> inline RealF peekIndex(const RealF arg) { return arg;}
//template<int Level> inline RealD peekIndex(const RealD arg) { return arg;}
#if 0
// Scalar peek, no indices
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg) -> iScalar<vtype>
{
return arg;
}
// Vector peek, one index
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg,int i) -> iScalar<vtype> // Index matches
{
iScalar<vtype> ret; // return scalar
ret._internal = arg._internal[i];
return ret;
}
// Matrix peek, two indices
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) -> iScalar<vtype>
{
iScalar<vtype> ret; // return scalar
ret._internal = arg._internal[i][j];
return ret;
}
/////////////
// No match peek for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg) -> iScalar<decltype(peekIndex<Level>(arg._internal))>
{
iScalar<decltype(peekIndex<Level>(arg._internal))> ret;
ret._internal= peekIndex<Level>(arg._internal);
return ret;
}
template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg,int i) -> iScalar<decltype(peekIndex<Level>(arg._internal,i))>
{
iScalar<decltype(peekIndex<Level>(arg._internal,i))> ret;
ret._internal=peekIndex<Level>(arg._internal,i);
return ret;
}
template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg,int i,int j) -> iScalar<decltype(peekIndex<Level>(arg._internal,i,j))>
{
iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> ret;
ret._internal=peekIndex<Level>(arg._internal,i,j);
return ret;
}
// vector
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg) -> iVector<decltype(peekIndex<Level>(arg._internal[0])),N>
{
iVector<decltype(peekIndex<Level>(arg._internal[0])),N> ret;
for(int ii=0;ii<N;ii++){
ret._internal[ii]=peekIndex<Level>(arg._internal[ii]);
}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg,int i) -> iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N>
{
iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret;
for(int ii=0;ii<N;ii++){
ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i);
}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg,int i,int j) -> iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N>
{
iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> ret;
for(int ii=0;ii<N;ii++){
ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i,j);
}
return ret;
}
// matrix
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N>
{
iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> ret;
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj]);// Could avoid this because peeking a scalar is dumb
}}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N>
{
iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N> ret;
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i);
}}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N>
{
iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> ret;
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i,j);
}}
return ret;
}
#endif
}
#endif
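As a reading aid for the (compiled-out) overloads above, a hypothetical call with assumed tensor-level numbering, not taken from the source:

iVector<ComplexD, 4> v;                    // assume this level of nesting matches Level 1
iScalar<ComplexD> s = peekIndex<1>(v, 2);  // matching level: strip the vector index, wrap component 2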

lib/Old/Tensor_poke.h Normal file
View File

@@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Old/Tensor_poke.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MATH_POKE_H
#define GRID_MATH_POKE_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////
// Poke a specific index;
//////////////////////////////////////////////////////////////////////////////
#if 0
// Scalar poke
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<vtype> &arg)
{
ret._internal = arg._internal;
}
// Vector poke, one index
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
void pokeIndex(iVector<vtype,N> &ret, const iScalar<vtype> &arg,int i)
{
ret._internal[i] = arg._internal;
}
//Matrix poke, two indices
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j)
{
ret._internal[i][j] = arg._internal;
}
/////////////
// No match poke for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal))> &arg)
{
pokeIndex<Level>(ret._internal,arg._internal);
}
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0))> &arg, int i)
{
pokeIndex<Level>(ret._internal,arg._internal,i);
}
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0,0))> &arg,int i,int j)
{
pokeIndex<Level>(ret._internal,arg._internal,i,j);
}
// Vector
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iVector<vtype,N> &ret, iVector<decltype(peekIndex<Level>(ret._internal)),N> &arg)
{
for(int ii=0;ii<N;ii++){
pokeIndex<Level>(ret._internal[ii],arg._internal[ii]);
}
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i)
{
for(int ii=0;ii<N;ii++){
pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i);
}
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg,int i,int j)
{
for(int ii=0;ii<N;ii++){
pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i,j);
}
}
// Matrix
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal)),N> &arg)
{
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj]);
}}
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i)
{
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i);
}}
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg, int i,int j)
{
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i,j);
}}
}
#endif
}
#endif
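And the matching poke direction under the same assumed level numbering (illustration only):

iVector<ComplexD, 4> v;
iScalar<ComplexD> s;
pokeIndex<1>(v, s, 2);   // write the scalar payload back into component 2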

View File

@@ -26,8 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>
+#include <Grid/Grid.h>
+#include <Grid/PerfCount.h>
namespace Grid {

View File

@@ -172,7 +172,7 @@ public:
    const char * name = PerformanceCounterConfigs[PCT].name;
    fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
    if (fd == -1) {
-      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
    int norm = PerformanceCounterConfigs[PCT].normalisation;
@@ -181,7 +181,7 @@ public:
    name = PerformanceCounterConfigs[norm].name;
    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
    if (cyclefd == -1) {
-      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
#endif
@@ -205,14 +205,13 @@ public:
    void Stop(void) {
      count=0;
      cycles=0;
-      size_t ign;
#ifdef __linux__
+      ssize_t ign;
      if ( fd!= -1) {
        ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
        ign=::read(fd, &count, sizeof(long long));
-        ign+=::read(cyclefd, &cycles, sizeof(long long));
-        assert(ign=2*sizeof(long long));
+        ign=::read(cyclefd, &cycles, sizeof(long long));
      }
      elapsed = cyclecount() - begin;
#else

View File

@@ -172,8 +172,8 @@ namespace Grid {
};
-#include <Grid/simd/Grid_vector_types.h>
-#include <Grid/simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_types.h"
+#include "simd/Grid_vector_unops.h"
namespace Grid {
  // Default precision

View File

@@ -1,9 +1,11 @@
-#include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>
-#include <Grid/perfmon/Stat.h>
+#include <Grid/Grid.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Stat.h>
namespace Grid {
bool PmuStat::pmu_initialized=false;

File diff suppressed because it is too large

View File

@@ -37,9 +37,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_OMP
#include <omp.h>
#ifdef GRID_NUMA
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
#else
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
#endif
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#define PARALLEL_REGION _Pragma("omp parallel")
#define PARALLEL_CRITICAL _Pragma("omp critical")
@@ -51,9 +55,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define PARALLEL_CRITICAL
#endif
#define parallel_for PARALLEL_FOR_LOOP for
#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
namespace Grid {
  // Introduce a class to gain deterministic bit reproducible reduction.
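For orientation (an illustration, not part of either commit): the parallel_for macros above let call sites write

parallel_for(int ss=0; ss<grid->oSites(); ss++){ /* site work */ }

which the preprocessor expands to PARALLEL_FOR_LOOP for(...), an omp parallel for over the outer site loop; the other side of the comparison instead spells PARALLEL_FOR_LOOP out before each loop, as the CoarsenedMatrix hunks below show. With schedule(runtime) the OpenMP schedule can be chosen at run time, e.g. through the OMP_SCHEDULE environment variable.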

View File

@@ -267,7 +267,8 @@ namespace Grid {
      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,compressor);
-      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
+      PARALLEL_FOR_LOOP
+      for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
        siteVector nbr;
        int ptype;
@@ -379,7 +380,8 @@ namespace Grid {
        Subspace.ProjectToSubspace(oProj,oblock);
        // blockProject(iProj,iblock,Subspace.subspace);
        // blockProject(oProj,oblock,Subspace.subspace);
-        parallel_for(int ss=0;ss<Grid()->oSites();ss++){
+        PARALLEL_FOR_LOOP
+        for(int ss=0;ss<Grid()->oSites();ss++){
          for(int j=0;j<nbasis;j++){
            if( disp!= 0 ) {
              A[p]._odata[ss](j,i) = oProj._odata[ss](j);
@@ -425,7 +427,7 @@ namespace Grid {
        A[p]=zero;
      }
-      GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
+      GridParallelRNG RNG(Grid()); RNG.SeedRandomDevice();
      Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
      Complex one(1.0);

View File

@@ -235,7 +235,7 @@ namespace Grid {
      Field tmp(in._grid);
      _Mat.MeooeDag(in,tmp);
      _Mat.MooeeInvDag(tmp,out);
      _Mat.MeooeDag(out,tmp);
      _Mat.MooeeDag(in,out);

View File

View File

@@ -197,9 +197,8 @@ namespace Grid {
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
      GridBase *grid=in._grid;
-      //std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
-      //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
+      // std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl; //<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
      int vol=grid->gSites();

View File

@@ -25,7 +25,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include <Grid/GridCore.h>
+#include <Grid/Grid.h>
namespace Grid {
double MultiShiftFunction::approx(double x)

View File

@@ -16,7 +16,7 @@
#define INCLUDED_ALG_REMEZ_H
#include <stddef.h>
-#include <Grid/GridStd.h>
+#include <Config.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"

View File

@@ -1,593 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
Copyright (C) 2017
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
int blockDim ;
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
{};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Cholesky from Eigen
// There exists a ldlt that is documented as more stable
////////////////////////////////////////////////////////////////////////////////////////////////////
Eigen::MatrixXcd L = m_rr.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
// Q_j = R_i Cinv(i,j)
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
// FIXME:: make a sliceMulMatrix to avoid zero vector
sliceMulMatrix(Q,Cinv,R,Orthog);
}
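  // Note (annotation, not in the committed file): concretely, with
  //   m_rr = R^dag R = L L^dag   and   C = L^dag,
  // the Q computed above satisfies
  //   Q^dag Q = C^{-dag} (R^dag R) C^{-1} = L^{-1} (L L^dag) L^{-dag} = 1,
  // i.e. sliceMulMatrix returns slice-orthonormal search directions.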
////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations
////////////////////////////////////////////////////////////////////////////////////////////////////
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == BlockCG ) {
BlockCGsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
}
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
conformable(X, B);
Field tmp(B);
Field Q(B);
Field D(B);
Field Z(B);
Field AD(B);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD);
tmp = B - AD;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
sliceMaddMatrix(X,m_tmp, D,X,Orthog);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_alpha = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_beta = Eigen::MatrixXcd::Zero(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
/************************************************************************
* Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
************************************************************************
* O'Leary : R = B - A X
* O'Leary : P = M R ; preconditioner M = 1
* O'Leary : alpha = PAP^{-1} RMR
* O'Leary : beta = RMR^{-1}_old RMR_new
* O'Leary : X=X+Palpha
* O'Leary : R_new=R_old-AP alpha
* O'Leary : P=MR_new+P beta
*/
R = Src - AP;
P = R;
sliceInnerProductMatrix(m_rr,R,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
m_pAp_inv = m_pAp.inverse();
m_alpha = m_pAp_inv * m_rr ;
// Psi, R update
sliceMaddTimer.Start();
sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
m_rr_inv = m_rr.inverse();
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_rr,R,R,Orthog);
sliceInnerTimer.Stop();
m_beta = m_rr_inv *m_rr;
// Search update
sliceMaddTimer.Start();
sliceMaddMatrix(AP,m_beta,P,R,Orthog);
sliceMaddTimer.Stop();
P= AP;
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
RealD rr;
for(int b=0;b<Nblock;b++){
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes
//////////////////////////////////////////////////////////////////////////
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
std::vector<ComplexD> v_pAp(Nblock);
std::vector<RealD> v_rr (Nblock);
std::vector<RealD> v_rr_inv(Nblock);
std::vector<RealD> v_alpha(Nblock);
std::vector<RealD> v_beta(Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
R = Src - AP;
P = R;
sliceNorm(v_rr,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch sliceNormTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductVector(v_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
for(int b=0;b<Nblock;b++){
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
}
// Psi, R update
sliceMaddTimer.Start();
sliceMaddVector(Psi,v_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddVector(R ,v_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
for(int b=0;b<Nblock;b++){
v_rr_inv[b] = 1.0/v_rr[b];
}
sliceNormTimer.Start();
sliceNorm(v_rr,R,Orthog);
sliceNormTimer.Stop();
for(int b=0;b<Nblock;b++){
v_beta[b] = v_rr_inv[b] *v_rr[b];
}
// Search update
sliceMaddTimer.Start();
sliceMaddVector(P,v_beta,P,R,Orthog);
sliceMaddTimer.Stop();
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
for(int b=0;b<Nblock;b++){
RealD rr = v_rr[b]/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << sliceNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
#endif
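For readers scanning this large deletion, a hypothetical usage sketch of the removed class (the fermion field type and the HermOp operator are assumptions; the constructor and call signatures are those of the file above):

// Solve A x_i = b_i for the Nblock right-hand sides laid out along dimension 0.
BlockConjugateGradient<LatticeFermion> BCGrQ(BlockCGrQ, /*Orthog=*/0, 1.0e-8, 10000);
BCGrQ(HermOp, src, sol);   // BlockCG and CGmultiRHS select the other two variants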

View File

@@ -45,8 +45,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
  // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Tolerance(tol),
        MaxIterations(maxit),
@@ -78,12 +76,18 @@ class ConjugateGradient : public OperatorFunction<Field> {
    cp = a;
    ssq = norm2(src);
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl;
-    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: src " << ssq << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: mmp " << b << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: p " << a << std::endl;
    RealD rsq = Tolerance * Tolerance * ssq;
@@ -93,7 +97,8 @@ }
    }
    std::cout << GridLogIterative << std::setprecision(4)
-              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
+              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq
+              << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
@@ -123,11 +128,8 @@ class ConjugateGradient : public OperatorFunction<Field> {
      p = p * b + r;
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
      std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
      std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
      // Stopping condition
      if (cp <= rsq) {
@@ -135,33 +137,31 @@ class ConjugateGradient : public OperatorFunction<Field> {
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
        RealD mmpnorm = sqrt(norm2(mmp));
        RealD psinorm = sqrt(norm2(psi));
        RealD srcnorm = sqrt(norm2(src));
        RealD resnorm = sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
-        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
-        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
-        std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
-        std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-        std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
-        std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
-        std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage
+                  << "ConjugateGradient: Converged on iteration " << k << std::endl;
+        std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
+                  << " true residual " << true_residual << " target "
+                  << Tolerance << std::endl;
+        std::cout << GridLogMessage << "Time elapsed: Iterations "
+                  << SolverTimer.Elapsed() << " Matrix "
+                  << MatrixTimer.Elapsed() << " Linalg "
+                  << LinalgTimer.Elapsed();
+        std::cout << std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
        IterationsToComplete = k;
        return;
      }
    }
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
};
}
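For orientation (an illustration; the field and operator types are assumed): the solver is driven as

ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
CG(HermOp, src, psi);

and the IterationsToComplete member, where present, records how many iterations the finished solve took.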

View File

@@ -35,7 +35,6 @@ namespace Grid {
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:
    RealD Tolerance;
    RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid; //Grid for single-precision fields
@@ -43,16 +42,12 @@ namespace Grid {
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.), guesser(NULL){ };
    void useGuesser(LinearFunction<FieldF> &g){
@@ -60,8 +55,9 @@ }
    }
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
-      TotalInnerIterations = 0;
+      (*this)(src_d_in,sol_d,NULL);
+    }
+    void operator() (const FieldD &src_d_in, FieldD &sol_d, RealD *shift){
      GridStopWatch TotalTimer;
      TotalTimer.Start();
@@ -81,7 +77,7 @@ namespace Grid {
      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
-      RealD inner_tol = InnerTolerance;
+      RealD inner_tol = Tolerance;
      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;
@@ -89,18 +85,17 @@ namespace Grid {
      FieldF sol_f(SinglePrecGrid);
      sol_f.checkerboard = cb;
-      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
      CG_f.ErrorOnNoConverge = false;
      GridStopWatch InnerCGtimer;
      GridStopWatch PrecChangeTimer;
-      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
-      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d, tmp_d);
        if(shift) axpy(tmp_d,*shift,sol_d,tmp_d);
        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
@@ -124,9 +119,8 @@ namespace Grid {
        //Inner CG
        CG_f.Tolerance = inner_tol;
        InnerCGtimer.Start();
-        CG_f(Linop_f, src_f, sol_f);
+        CG_f(Linop_f, src_f, sol_f,shift);
        InnerCGtimer.Stop();
        TotalInnerIterations += CG_f.IterationsToComplete;
        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
@@ -139,13 +133,11 @@ namespace Grid {
      //Final trial CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
-      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
-      CG_d(Linop_d, src_d_in, sol_d);
+      ConjugateGradientShifted<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d,shift);
      TotalFinalStepIterations = CG_d.IterationsToComplete;
      TotalTimer.Stop();
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
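The restart loop above is standard mixed-precision defect correction; as a reading aid (not text from either commit), each outer iteration does

r_k = b - A x_k          (double precision, Linop_d)
A e_k ~ r_k              (single-precision inner CG, Linop_f)
x_{k+1} = x_k + e_k      (accumulated in double precision)

so the inner solve only has to beat the current residual, and the final double-precision CG polishes the result to the outer tolerance.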

View File

@@ -45,6 +45,7 @@ public:
  Integer MaxIterations;
  int verbose;
  MultiShiftFunction shifts;
  int iter;
  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
    MaxIterations(maxit),
@@ -60,6 +61,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  std::vector<Field> results(nshift,grid);
  (*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
  int nshift = shifts.order;
@@ -105,11 +107,12 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  RealD a,b,c,d;
  RealD cp,bp,qq; //prev
  int cb=src.checkerboard;
  // Matrix mult fields
  Field r(grid);
-  Field p(grid);
+  Field p(grid); p.checkerboard = src.checkerboard;
  Field tmp(grid);
-  Field mmp(grid);
+  Field mmp(grid);mmp.checkerboard = src.checkerboard;
  // Check lightest mass
  for(int s=0;s<nshift;s++){
@@ -132,6 +135,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  p=src;
  //MdagM+m[0]
  std::cout << "p.checkerboard " << p.checkerboard
            << "mmp.checkerboard " << mmp.checkerboard << std::endl;
  Linop.HermOpAndNorm(p,mmp,d,qq);
  axpy(mmp,mass[0],p,mmp);
  RealD rn = norm2(p);
@@ -269,6 +275,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
      RealD cn = norm2(src);
      std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
    }
    iter = k;
    return;
  }
}
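A hypothetical usage sketch (field type, grid and operator are assumptions; the constructor and operator() signatures appear in the hunks above): one Krylov sequence yields the solution for every shift.

MultiShiftFunction shifts;                                  // e.g. filled from a rational approximation
ConjugateGradientMultiShift<LatticeFermion> MSCG(10000, shifts);
std::vector<LatticeFermion> results(shifts.order, grid);
MSCG(HermOp, src, results, psi);                            // per-shift solutions land in results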

View File

@@ -0,0 +1,404 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@quark.phy.bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
namespace Grid {
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF
//, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0
//, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0
>
class MixedPrecisionConjugateGradientMultiShift : public LinearFunction<FieldD> {
public:
// RealD Tolerance;
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
MultiShiftFunction shifts;
Integer iter;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
// LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradientMultiShift(GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d,
Integer maxinnerit, MultiShiftFunction &_shifts ) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
MaxInnerIterations(maxinnerit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), shifts(_shifts) {};
void operator() (const FieldD &src_d_in, FieldD &sol_d){
assert(0); // not yet implemented
}
void operator() (const FieldD &src_d_in, std::vector<FieldD> &sol_d){
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
int nshift = shifts.order;
assert(nshift == sol_d.size());
for(int i=0;i<nshift;i++) sol_d[i].checkerboard = cb;
RealD src_norm = norm2(src_d_in);
// RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid); tmp_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid); tmp2_d.checkerboard = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
// RealD inner_tol = Tolerance;
FieldD psi_d(DoublePrecGrid);psi_d.checkerboard = cb;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
std::vector<FieldF> sol_f(nshift,SinglePrecGrid);
for(int i=0;i<nshift;i++) sol_f[i].checkerboard = cb;
// ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
ConjugateGradientMultiShift<FieldF> MSCG(MaxInnerIterations,shifts);
// CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
{
// std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
// if(norm < OuterLoopNormMult * stop){
// std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
// break;
// }
// while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
// zeroit(sol_f);
//Inner CG
InnerCGtimer.Start();
int if_relup = 0;
#if 0
MSCG(Linop_f,src_f,sol_f);
#else
{
GridBase *grid = SinglePrecGrid;
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.);
std::vector<FieldF> ps(nshift,grid);// Search directions
assert(sol_f.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
int cb=src_f.checkerboard;
// Matrix mult fields
FieldF r(grid); r.checkerboard = src_f.checkerboard;
FieldF p(grid); p.checkerboard = src_f.checkerboard;
FieldF tmp(grid); tmp.checkerboard = src_f.checkerboard;
FieldF mmp(grid);mmp.checkerboard = src_f.checkerboard;
FieldF psi(grid);psi.checkerboard = src_f.checkerboard;
std::cout.precision(12);
std::cout<<GridLogMessage<<"norm2(psi_d)= "<<norm2(psi_d)<<std::endl;
std::cout<<GridLogMessage<<"norm2(psi)= "<<norm2(psi)<<std::endl;
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_f);
Real c_relup = cp;
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src_f;
}
// r and p for primary
r=src_f;
p=src_f;
//MdagM+m[0]
std::cout << "p.checkerboard " << p.checkerboard
<< "mmp.checkerboard " << mmp.checkerboard << std::endl;
Linop_f.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
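// Annotation: z[s] is the ratio of the shifted to the unshifted residual
// polynomial at the current step, so the shifted residuals are r_s = z[s]*r
// and the shifted step sizes bs[s] = b*z[s][iz] follow from the primary
// recurrence with no additional matrix applications.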
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
axpby(psi,0.,-bs[0],src_f,src_f);
for(int s=0;s<nshift;s++) {
axpby(sol_f[s],0.,-bs[s]*alpha[s],src_f,src_f);
}
// Iteration loop
int k;
// inefficient zeroing, please replace!
// RealD sol_norm = axpy_norm(sol_d[0],-1.,sol_d[0],sol_d[0]);
zeroit(sol_d[0]);
std::cout<<GridLogMessage<<"norm(sol_d[0])= "<<norm2(sol_d[0])<<std::endl;
int all_converged = 1;
RealD tmp1,tmp2;
for (k=1;k<=MaxOuterIterations;k++){
a = c /cp;
axpy(p,a,p,r);
// Note to self - direction ps is iterated separately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
}
}
}
cp=c;
Linop_f.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
c=axpy_norm(r,b,mmp,r);
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
axpy(psi,-bs[0],ps[0],psi);
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load sol_f[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load sol_f[0], vector of coeffs, vector of pointers ps
// If we can predict the coefficient bs then we can fuse these and avoid a write/re-read cycle
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) {
axpy(sol_f[ss],-bs[s]*alpha[s],ps[s],sol_f[ss]);
}
}
if (k%MaxInnerIterations==0){
// if (c < 1e-4*c_relup){
RealD c_f=c;
precisionChange(tmp_d,psi);
RealD sol_norm =axpy_norm (psi_d,1.,tmp_d,psi_d);
tmp1 = norm2(psi);
zeroit(psi);
tmp2 = norm2(psi);
std::cout<<GridLogMessage<<"k= "<<k<<" norm2(sol)= "<<sol_norm<<" "<<tmp1<<" "<<tmp2<<std::endl;
// precisionChange(sol_d[0],sol_f[0]);
Linop_d.HermOpAndNorm(psi_d,tmp_d,tmp1,tmp2);
axpy(tmp2_d,mass[0],psi_d,tmp_d);
axpy(tmp_d,-1.,tmp2_d,src_d);
precisionChange(r,tmp_d);
c_relup = norm2(r);
std::cout<<GridLogMessage<<"k= "<<k<<" norm2(r)= "<<c<<" "<<c_relup<<" "<<c_f<<std::endl;
if_relup=1;
}
// Convergence checks
all_converged=1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
if (k%MaxInnerIterations==0)
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has not converged "<<css<<">="<<rsq[s]<<std::endl;
all_converged=0;
}
}
}
#if 0
if ( all_converged ){
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
#else
if ( converged[0] ){
std::cout<<GridLogMessage<< "CGMultiShift: Shift 0 has converged, terminating at iteration "<<k<<std::endl;
#endif
#if 1
for(int s=1; s < nshift; s++) {
Linop_f.HermOpAndNorm(sol_f[s],mmp,d,qq);
axpy(tmp,mass[s],sol_f[s],mmp);
axpy(r,-alpha[s],src_f,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src_f);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
#endif
iter = k;
break;
}
}
// ugly hack
if ( !all_converged )
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
#endif
InnerCGtimer.Stop();
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
sol_d[0]=psi_d;
for(int i=1;i<nshift;i++)precisionChange(sol_d[i], sol_f[i]);
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
RealD tmp1,tmp2;
Linop_d.HermOpAndNorm(sol_d[s],tmp_d,tmp1,tmp2);
axpy(tmp2_d,shifts.poles[s],sol_d[s],tmp_d);
axpy(tmp_d,-1.,src_d,tmp2_d);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(norm2(tmp_d)/norm2(src_d))<<std::endl;
}
PrecChangeTimer.Stop();
}
//Final trial CG
// std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
}
#endif
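A hedged usage sketch for the class above; the grids, operators and shift function below are placeholders rather than definitions from this header:
// MultiShiftFunction shifts;   // poles in shifts.poles, per-pole tolerances
// MixedPrecisionConjugateGradientMultiShift<LatticeFermionD,LatticeFermionF>
//   MSCG(sp_grid, HermOp_f, HermOp_d, 10000, shifts);
// std::vector<LatticeFermionD> sols(shifts.order, dp_grid);
// MSCG(src_d, sols);           // one double-precision solution per pole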

View File

@ -0,0 +1,168 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradient.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_SHIFTED_H
#define GRID_CONJUGATE_GRADIENT_SHIFTED_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradientShifted : public OperatorFunction<Field> {
public:
bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
RealD Tolerance;
Integer MaxIterations;
ConjugateGradientShifted(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv) {
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi ){
(*this)(Linop,src,psi,NULL);
}
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi, RealD *shift){
psi.checkerboard = src.checkerboard;
conformable(psi,src);
RealD cp,c,a,d,b,ssq,qq,b_pred;
Field p(src);
Field mmp(src);
Field r(src);
//Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess)==0);
Linop.HermOpAndNorm(psi,mmp,d,b);
if(shift) axpy(mmp,*shift,psi,mmp);
RealD rn = norm2(psi);
if(shift) d += rn*(*shift);
RealD d2 = real(innerProduct(psi,mmp));
b= norm2(mmp);
RealD src_norm=norm2(src);
r= src-mmp;
p= r;
a =norm2(p);
cp =a;
ssq=norm2(src);
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: src "<<ssq <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mp "<<d <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mmp "<<b <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: cp,r "<<cp <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: p "<<a <<std::endl;
RealD rsq = Tolerance* Tolerance*ssq;
//Check if guess is really REALLY good :)
if ( cp <= rsq ) {
return;
}
std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k=1;k<=MaxIterations;k++){
c=cp;
MatrixTimer.Start();
Linop.HermOpAndNorm(p,mmp,d,qq);
MatrixTimer.Stop();
LinalgTimer.Start();
if(shift) axpy(mmp,*shift,p,mmp);
RealD rn = norm2(p);
if(shift) d += rn*(*shift);
RealD d2 = real(innerProduct(p,mmp));
qq = norm2(mmp);
if (k%10==1) std::cout<< std::setprecision(4)<< "d: "<<d<<" d2= "<<d2<<std::endl;
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);
a = c/d;
b_pred = a*(a*qq-d)/c;
cp = axpy_norm(r,-a,mmp,r);
b = cp/c;
if (k%10==1) std::cout<< std::setprecision(4)<<"k= "<<k<<" src: "<<src_norm<<" r= "<<cp<<std::endl;
// Fuse these loops ; should be really easy
psi= a*p+psi;
p = p*b+r;
LinalgTimer.Stop();
std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
// Stopping condition
if ( cp <= rsq ) {
SolverTimer.Stop();
Linop.HermOpAndNorm(psi,mmp,d,qq);
if(shift) mmp = mmp + (*shift) * psi;
p=mmp-src;
RealD mmpnorm = sqrt(norm2(mmp));
RealD psinorm = sqrt(norm2(psi));
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm/srcnorm;
std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
<<" computed residual "<<sqrt(cp/ssq)
<<" true residual " <<true_residual
<<" target "<<Tolerance<<std::endl;
std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
std::cout<<std::endl;
if(ErrorOnNoConverge)
assert(true_residual/Tolerance < 1000.0);
return;
}
}
std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
// assert(0);
}
};
}
#endif
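A hedged usage sketch (operator and field names are placeholders); passing NULL for the shift pointer recovers the behaviour of the plain ConjugateGradient:
// ConjugateGradientShifted<LatticeFermion> CG(1.0e-8, 10000);
// RealD shift = 0.1;
// CG(HermOp, src, psi, &shift);   // solves (MdagM + shift) psi = src
// CG(HermOp, src, psi);           // NULL shift: ordinary CG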

View File

@ -0,0 +1,137 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/DenseMatrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DENSE_MATRIX_H
#define GRID_DENSE_MATRIX_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Matrix utils
/////////////////////////////////////////////////////////////
template<class T> using DenseVector = std::vector<T>;
template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
template<class T> void Size(DenseVector<T> & vec, int &N)
{
N= vec.size();
}
template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M)
{
N= mat.size();
M= mat[0].size();
}
template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
{
int M; Size(mat,N,M);
assert(N==M);
}
template<class T> void Resize(DenseVector<T > & mat, int N) {
mat.resize(N);
}
template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) {
mat.resize(N);
for(int i=0;i<N;i++){
mat[i].resize(M);
}
}
template<class T> void Fill(DenseMatrix<T> & mat, T&val) {
int N,M;
Size(mat,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
mat[i][j] = val;
}}
}
/** Transpose of a matrix **/
template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
int N,M;
Size(mat,N,M);
DenseMatrix<T> C; Resize(C,M,N);
for(int i=0;i<M;i++){
for(int j=0;j<N;j++){
C[i][j] = mat[j][i];
}}
return C;
}
/** Set DenseMatrix to unit matrix **/
template<class T> void Unity(DenseMatrix<T> &A){
int N; SizeSquare(A,N);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
if ( i==j ) A[i][j] = 1;
else A[i][j] = 0;
}
}
}
/** Add C * I to matrix **/
template<class T>
void PlusUnit(DenseMatrix<T> & A,T c){
int dim; SizeSquare(A,dim);
for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
}
/** return the Hermitian conjugate of matrix **/
template<class T>
DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
int dim; SizeSquare(mat,dim);
DenseMatrix<T> C; Resize(C,dim,dim);
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
C[i][j] = conj(mat[j][i]);
}
}
return C;
}
/**Get a square submatrix**/
template <class T>
DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
H[i-row_st][j-col_st]=A[i][j];
}}
return H;
}
}
#include "Householder.h"
#include "Francis.h"
#endif
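A minimal sketch exercising the helpers above, assuming this header is reachable through the main Grid include and that RealD is Grid's double alias:
#include <Grid.h>
using namespace Grid;
int main(void) {
  DenseMatrix<RealD> A; Resize(A, 3, 3);
  Unity(A);                           // A = identity
  PlusUnit(A, 2.0);                   // diagonal becomes 3
  DenseMatrix<RealD> At = Transpose(A);
  int N; SizeSquare(At, N);           // asserts squareness; N == 3
  return 0;
}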

View File

@ -0,0 +1,81 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/EigenSort.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_EIGENSORT_H
#define GRID_EIGENSORT_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Eigen sorter to begin with
/////////////////////////////////////////////////////////////
template<class Field>
class SortEigen {
private:
//hacking for testing for now
private:
static bool less_lmd(RealD left,RealD right){
return left > right;
}
static bool less_pair(std::pair<RealD,Field const*>& left,
std::pair<RealD,Field const*>& right){
return left.first > (right.first);
}
public:
void push(DenseVector<RealD>& lmd,
DenseVector<Field>& evec,int N) {
DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
for(int i=0;i<lmd.size();++i)
emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
for(int i=0;i<N;++i){
lmd[i]=it->first;
evec[i]=*(it->second);
++it;
}
}
void push(DenseVector<RealD>& lmd,int N) {
std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
}
bool saturated(RealD lmd, RealD thrs) {
return fabs(lmd) > fabs(thrs);
}
};
}
#endif
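A hedged sketch of the scalar push(), which partially sorts in the descending order defined by less_lmd; the Field parameter (here a placeholder lattice type) only matters for the eigenvector overload:
// SortEigen<LatticeFermion> sorter;
// DenseVector<RealD> lmd = {0.1, 0.9, 0.5};
// sorter.push(lmd, (int)lmd.size());   // lmd becomes {0.9, 0.5, 0.1}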

View File

@ -0,0 +1,525 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Francis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef FRANCIS_H
#define FRANCIS_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
//#include <timer.h>
//#include <lapacke.h>
//#include <Eigen/Dense>
namespace Grid {
template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
template <class T> int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
/**
Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
H =
x x x x x x x x x
x x x x x x x x x
0 x x x x x x x x
0 0 x x x x x x x
0 0 0 x x x x x x
0 0 0 0 x x x x x
0 0 0 0 0 x x x x
0 0 0 0 0 0 x x x
0 0 0 0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.
**/
template <class T>
int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
DenseMatrix<T> H = Hin;
int N ; SizeSquare(H,N);
int M = N;
Fill(evals,0);
Fill(evecs,0);
T s,t,x=0,y=0,z=0;
T u,d;
T apd,amd,bc;
DenseVector<T> p(N,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N); Unity(P);
DenseVector<int> trows(N,0);
/// Check if the matrix is really hessenberg, if not abort
RealD sth = 0;
for(int j=0;j<N;j++){
for(int i=j+2;i<N;i++){
sth = abs(H[i][j]);
if(sth > small){
std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
exit(1);
}
}
}
do{
std::cout << "Francis QR Step N = " << N << std::endl;
/** Check for convergence
x x x x x
0 x x x x
0 0 x x x
0 0 x x x
0 0 0 0 x
for this matrix l = 4
**/
do{
l = Chop_subdiag(H,nrm,e,small);
r = 0; ///May have converged on more than one eval
///Single eval
if(l == N-1){
evals[e] = H[l][l];
N--; e++; r++; it = 0;
}
///RealD eval
if(l == N-2){
trows[l+1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l+1][l+1];
amd = H[l][l] - H[l+1][l+1];
bc = (T)4.0*H[l+1][l]*H[l][l+1];
evals[e] = (T)0.5*( apd + sqrt(amd*amd + bc) );
evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
N-=2; e+=2; r++; it = 0;
}
} while(r>0);
if(N ==0) break;
DenseVector<T > ck; Resize(ck,3);
DenseVector<T> v; Resize(v,3);
for(int m = N-3; m >= l; m--){
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0){
s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
///Starting vector implicit Q theorem
else{
s = (H[N-2][N-2] + H[N-1][N-1]);
t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
ck[0] = x; ck[1] = y; ck[2] = z;
if(m == l) break;
/** Some stupid thing from Numerical Recipes, seems to work **/
// PAB.. for heaven's sake quote page, purpose, evidence it works.
// what sort of comment is that!?!?!?
u=abs(H[m][m-1])*(abs(y)+abs(z));
d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
if ((T)abs(u+d) == (T)abs(d) ){
l = m; break;
}
//if (u < small){l = m; break;}
}
if(it > 100000){
std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, 2, v, beta);
Householder_mult<T >(H,v,beta,0,l,l+2,0);
Householder_mult<T >(H,v,beta,0,l,l+2,1);
///Accumulate eigenvector
Householder_mult<T >(P,v,beta,0,l,l+2,1);
int sw = 0; ///Are we on the last row?
for(int k=l;k<N-2;k++){
x = H[k+1][k];
y = H[k+2][k];
z = (T)0.0;
if(k+3 <= N-1){
z = H[k+3][k];
} else{
sw = 1;
v[2] = (T)0.0;
}
ck[0] = x; ck[1] = y; ck[2] = z;
normalize(ck);
Householder_vector<T >(ck, 0, 2-sw, v, beta);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
///Accumulate eigenvector
Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp; Resize(tmp,N);
for(int i=0;i<N;i++){
tmp[i] = evals[N-i-1];
}
evals = tmp;
UTeigenvectors(H, trows, evals, evecs);
for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
return tot_it;
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
/**
Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
H =
x x 0 0 0 0
x x x 0 0 0
0 x x x 0 0
0 0 x x x 0
0 0 0 x x x
0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary. **/
return my_Wilkinson(Hin, evals, evecs, small, small);
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
{
int N; SizeSquare(Hin,N);
int M = N;
/// I don't want to modify the input, but matrices must be passed by reference
//Scale a matrix by its "norm"
//RealD Hnorm = abs( Hin.LargestDiag() ); H = H*(1.0/Hnorm);
DenseMatrix<T> H; H = Hin;
RealD Hnorm = abs(Norm(Hin));
H = H * (1.0 / Hnorm);
// TODO use openmp and memset
Fill(evals,0);
Fill(evecs,0);
T s, t, x = 0, y = 0, z = 0;
T u, d;
T apd, amd, bc;
DenseVector<T> p; Resize(p,N); Fill(p,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N);
Unity(P);
DenseVector<int> trows(N, 0);
/// Check if the matrix is really symm tridiag
RealD sth = 0;
for(int j = 0; j < N; ++j)
{
for(int i = j + 2; i < N; ++i)
{
if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
{
std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
std::cout << "Warning tridiagonalize and call again" << std::endl;
// exit(1); // see what is going on
//return;
}
}
}
do{
do{
//Jasper
//Check if the subdiagonal term is small enough (<small)
//if true then it is converged.
//check start from H.dim - e - 1
//How do we deal with the case where more than 2 have converged?
//What if Chop_symm_subdiag returns something in the middle?
//--------------
l = Chop_symm_subdiag(H,nrm, e, small);
r = 0; ///May have converged on more than one eval
//Jasper
//In this case
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x x 0
// 0 0 0 x x 0
// 0 0 0 0 0 x <- l
//--------------
///Single eval
if(l == N - 1)
{
evals[e] = H[l][l];
N--;
e++;
r++;
it = 0;
}
//Jasper
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x 0 0
// 0 0 0 0 x x <- l
// 0 0 0 0 x x
//--------------
///RealD eval
if(l == N - 2)
{
trows[l + 1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l + 1][ l + 1];
amd = H[l][l] - H[l + 1][l + 1];
bc = (T) 4.0 * H[l + 1][l] * H[l][l + 1];
evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
N -= 2;
e += 2;
r++;
it = 0;
}
}while(r > 0);
//Jasper
//Already converged
//--------------
if(N == 0) break;
DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
for(int m = N - 3; m >= l; m--)
{
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0)
{
t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
x = H[m][m] - t;
z = H[m + 1][m];
} else {
///Starting vector implicit Q theorem
d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
t = H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2]
/ (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
x = H[m][m] - t;
z = H[m + 1][m];
}
//Jasper
//why is it here????
//-----------------------
if(m == l)
break;
u = abs(H[m][m - 1]) * (abs(y) + abs(z));
d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
if ((T)abs(u + d) == (T)abs(d))
{
l = m;
break;
}
}
//Jasper
if(it > 1000000)
{
std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
//
T s, c;
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, l, l + 1, c, -s, 0);
Givens_mult<T>(H, l, l + 1, c, s, 1);
Givens_mult<T>(P, l, l + 1, c, s, 1);
//
for(int k = l; k < N - 2; ++k)
{
x = H[k + 1][k];
z = H[k + 2][k];
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
Givens_mult<T>(H, k + 1, k + 2, c, s, 1);
Givens_mult<T>(P, k + 1, k + 2, c, s, 1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp(N);
for(int i = 0; i < N; ++i)
tmp[i] = evals[N-i-1];
evals = tmp;
//
UTeigenvectors(H, trows, evals, evecs);
//UTSymmEigenvectors(H, trows, evals, evecs);
for(int i = 0; i < evals.size(); ++i)
{
evecs[i] = P * evecs[i];
normalize(evecs[i]);
evals[i] = evals[i] * Hnorm;
}
// // FIXME this is to test
// Hin.write("evecs3", evecs);
// Hin.write("evals3", evals);
// // check rsd
// for(int i = 0; i < M; i++) {
// vector<T> Aevec = Hin * evecs[i];
// RealD norm2(0.);
// for(int j = 0; j < M; j++) {
// norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
// }
// }
return tot_it;
}
template <class T>
void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
/**
turn a matrix A =
x x x x x
x x x x x
x x x x x
x x x x x
x x x x x
into
x x x x x
x x x x x
0 x x x x
0 0 x x x
0 0 0 x x
with householder rotations
Slow.
*/
int N ; SizeSquare(A,N);
DenseVector<T > p; Resize(p,N); Fill(p,0);
for(int k=start;k<N-2;k++){
//cerr << "hess" << k << std::endl;
DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
for(int i=k+1;i<N;i++){ck[i-k-1] = A[i][k];} ///kth column
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, ck.size()-1, v, beta); ///Householder vector
Householder_mult<T>(A,v,beta,start,k+1,N-1,0); ///A -> PA
Householder_mult<T >(A,v,beta,start,k+1,N-1,1); ///PA -> PAP^H
///Accumulate eigenvector
Householder_mult<T >(Q,v,beta,start,k+1,N-1,1); ///Q -> QP^H
}
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,k,l);
}
}*/
}
template <class T>
void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
///Tridiagonalize a matrix
int N; SizeSquare(A,N);
Hess(A,Q,start);
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,l,k);
}
}*/
}
template <class T>
void ForceTridiagonal(DenseMatrix<T> &A){
///Tridiagonalize a matrix
int N ; SizeSquare(A,N);
for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A[l][k]=0;
A[k][l]=0;
}
}
}
template <class T>
int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a symmetric eigensystem, not necessarily in tridiagonal form
int N; SizeSquare(Ain,N);
DenseMatrix<T > A; A = Ain;
DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
Tri(A,Q,0);
int it = my_Wilkinson<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
template <class T>
int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_Wilkinson(Ain, evals, evecs, small);
}
template <class T>
int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_SymmEigensystem(Ain, evals, evecs, small);
}
template <class T>
int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a general eigensystem, not necessarily in tridiagonal form
int N; SizeSquare(Ain,N);
DenseMatrix<T > A; A = Ain;
DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
Hess(A,Q,0);
int it = QReigensystem<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
}
#endif
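A hedged sketch of the symmetric entry point on a 2x2 example whose exact eigenvalues are 3 and 1; this assumes the DenseMatrix helpers above compile as declared:
// DenseMatrix<RealD> A; Resize(A,2,2);
// A[0][0]=2.0; A[0][1]=1.0; A[1][0]=1.0; A[1][1]=2.0;
// DenseVector<RealD> evals(2);
// DenseVector<DenseVector<RealD> > evecs(2);
// for(int i=0;i<2;i++) Resize(evecs[i],2);
// int its = SymmEigensystem(A, evals, evecs, 1.0e-12);  // evals ~ {3,1}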

View File

@ -0,0 +1,242 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Householder.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef HOUSEHOLDER_H
#define HOUSEHOLDER_H
#define TIMER(A) std::cout << GridLogMessage << __func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define ENTER() std::cout << GridLogMessage << "ENTRY "<<__func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define LEAVE() std::cout << GridLogMessage << "EXIT "<<__func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
namespace Grid {
/** Comparison function for finding the max element in a vector **/
template <class T> bool cf(T i, T j) {
return abs(i) < abs(j);
}
/**
Calculate a real Givens angle
**/
template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
RealD mz = (RealD)abs(z);
if(mz==0.0){
c = 1; s = 0;
return; // nothing to rotate; falling through would divide by z below
}
if(mz >= (RealD)abs(y)){
T t = -y/z;
s = (T)1.0 / sqrt ((T)1.0 + t * t);
c = s * t;
} else {
T t = -z/y;
c = (T)1.0 / sqrt ((T)1.0 + t * t);
s = c * t;
}
}
template <class T> inline void Givens_mult(DenseMatrix<T> &A, int i, int k, T c, T s, int dir)
{
int q ; SizeSquare(A,q);
if(dir == 0){
for(int j=0;j<q;j++){
T nu = A[i][j];
T w = A[k][j];
A[i][j] = (c*nu + s*w);
A[k][j] = (-s*nu + c*w);
}
}
if(dir == 1){
for(int j=0;j<q;j++){
T nu = A[j][i];
T w = A[j][k];
A[j][i] = (c*nu - s*w);
A[j][k] = (s*nu + c*w);
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
P | x | | x | k = 0
| x | | 0 |
| x | = | 0 |
| x | | 0 | j = 3
| x | | x |
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
{
int N ; Size(input,N);
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
if(abs(v[k]) > 0.0) v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
else v[k] = -alpha;
} else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
Px = alpha*e_dir
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
{
int N = input.size();
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T>);
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
else v[dir] = -alpha;
}else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1 of v are nonzero
**/
template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
{
int N ; SizeSquare(A,N);
if(abs(beta) > 0.0){
for(int p=l; p<N; p++){
T s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
s *= beta;
for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
} else {
for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
s *= beta;
for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
}
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1 of v are nonzero
A is tridiagonal
**/
template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
{
if(abs(beta) > 0.0){
int N ; SizeSquare(A,N);
DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0);
T s;
for(int p=l; p<M; p++){
s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
}else{
for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
}
s = beta*s;
if(trans==0){
for(int i=k;i<j+1;i++) tmp[i][p] = tmp[i][p] - s*v[i-k];
}else{
for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
}
}
for(int p=l; p<M; p++){
if(trans==0){
for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
}else{
for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
}
}
}
}
}
#endif
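A sketch of the Givens helpers, mirroring the call pattern used in the Wilkinson routine: compute the angle from the pivot column, rotate rows to zero the subdiagonal entry, then rotate columns to restore the similarity transform.
// DenseMatrix<RealD> A; Resize(A,2,2);
// A[0][0]=3.0; A[0][1]=1.0; A[1][0]=4.0; A[1][1]=2.0;
// RealD c, s;
// Givens_calc(A[0][0], A[1][0], c, s);  // angle annihilating A[1][0]
// Givens_mult(A, 0, 1, c, -s, 0);       // row rotation: A[1][0] -> 0
// Givens_mult(A, 0, 1, c,  s, 1);       // column rotation: A -> G A G^H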

File diff suppressed because it is too large

View File

@ -0,0 +1,453 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Matrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef MATRIX_H
#define MATRIX_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <vector>
#include <iostream>
#include <iomanip>
#include <complex>
#include <typeinfo>
#include <Grid.h>
/** Sign function **/
template <class T> T sign(T p){return ( p/abs(p) );}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////// Hijack STL containers for our wicked means /////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class T> using Vector = std::vector<T>;
template<class T> using Matrix = Vector<Vector<T> >;
template<class T> void Resize(Vector<T > & vec, int N) { vec.resize(N); }
template<class T> void Resize(Matrix<T > & mat, int N, int M) {
mat.resize(N);
for(int i=0;i<N;i++){
mat[i].resize(M);
}
}
template<class T> void Size(Vector<T> & vec, int &N)
{
N= vec.size();
}
template<class T> void Size(Matrix<T> & mat, int &N,int &M)
{
N= mat.size();
M= mat[0].size();
}
template<class T> void SizeSquare(Matrix<T> & mat, int &N)
{
int M; Size(mat,N,M);
assert(N==M);
}
template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1)
{
int N2,M2;
Size(mat1,N1,M1);
Size(mat2,N2,M2);
assert(N1==N2);
assert(M1==M2);
}
//*****************************************
//* (Complex) Vector operations *
//*****************************************
/**Conj of a Vector **/
template <class T> Vector<T> conj(Vector<T> p){
Vector<T> q(p.size());
for(int i=0;i<p.size();i++){q[i] = conj(p[i]);}
return q;
}
/** Norm of a Vector**/
template <class T> T norm(Vector<T> p){
T sum = 0;
for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
return abs(sqrt(sum));
}
/** Norm squared of a Vector **/
template <class T> T norm2(Vector<T> p){
T sum = 0;
for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
return abs((sum));
}
/** Sum elements of a Vector **/
template <class T> T trace(Vector<T> p){
T sum = 0;
for(int i=0;i<p.size();i++){sum = sum + p[i];}
return sum;
}
/** Fill a Vector with constant c **/
template <class T> void Fill(Vector<T> &p, T c){
for(int i=0;i<p.size();i++){p[i] = c;}
}
/** Normalize a Vector **/
template <class T> void normalize(Vector<T> &p){
T m = norm(p);
if( abs(m) > 0.0) for(int i=0;i<p.size();i++){p[i] /= m;}
}
/** Vector by scalar **/
template <class T, class U> Vector<T> times(Vector<T> p, U s){
for(int i=0;i<p.size();i++){p[i] *= s;}
return p;
}
template <class T, class U> Vector<T> times(U s, Vector<T> p){
for(int i=0;i<p.size();i++){p[i] *= s;}
return p;
}
/** inner product of a and b = conj(a) . b **/
template <class T> T inner(Vector<T> a, Vector<T> b){
T m = 0.;
for(int i=0;i<a.size();i++){m = m + conj(a[i])*b[i];}
return m;
}
/** sum of a and b = a + b **/
template <class T> Vector<T> add(Vector<T> a, Vector<T> b){
Vector<T> m(a.size());
for(int i=0;i<a.size();i++){m[i] = a[i] + b[i];}
return m;
}
/** sum of a and b = a - b **/
template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){
Vector<T> m(a.size());
for(int i=0;i<a.size();i++){m[i] = a[i] - b[i];}
return m;
}
/**
*********************************
* Matrices *
*********************************
**/
template<class T> void Fill(Matrix<T> & mat, T&val) {
int N,M;
Size(mat,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
mat[i][j] = val;
}}
}
/** Transpose of a matrix **/
template<class T> Matrix<T> Transpose(Matrix<T> & mat){
int N,M;
Size(mat,N,M);
Matrix<T> C; Resize(C,M,N);
for(int i=0;i<M;i++){
for(int j=0;j<N;j++){
C[i][j] = mat[j][i];
}}
return C;
}
/** Set Matrix to unit matrix **/
template<class T> void Unity(Matrix<T> &mat){
int N; SizeSquare(mat,N);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
if ( i==j ) mat[i][j] = 1;
else mat[i][j] = 0;
}
}
}
/** Add C * I to matrix **/
template<class T>
void PlusUnit(Matrix<T> & A,T c){
int dim; SizeSquare(A,dim);
for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
}
/** return the Hermitian conjugate of matrix **/
template<class T> Matrix<T> HermitianConj(Matrix<T> &mat){
int dim; SizeSquare(mat,dim);
Matrix<T> C; Resize(C,dim,dim);
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
C[i][j] = conj(mat[j][i]);
}
}
return C;
}
/** return diagonal entries as a Vector **/
template<class T> Vector<T> diag(Matrix<T> &A)
{
int dim; SizeSquare(A,dim);
Vector<T> d; Resize(d,dim);
for(int i=0;i<dim;i++){
d[i] = A[i][i];
}
return d;
}
/** Left multiply by a Vector **/
template<class T> Vector<T> operator *(Vector<T> &B,Matrix<T> &A)
{
int K,M,N;
Size(B,K);
Size(A,M,N);
assert(K==M);
Vector<T> C; Resize(C,N);
for(int j=0;j<N;j++){
T sum = 0.0;
for(int i=0;i<M;i++){
sum += B[i] * A[i][j];
}
C[j] = sum;
}
return C;
}
/** return 1/diagonal entries as a Vector **/
template<class T> Vector<T> inv_diag(Matrix<T> & A){
int dim; SizeSquare(A,dim);
Vector<T> d; Resize(d,dim);
for(int i=0;i<dim;i++){
d[i] = 1.0/A[i][i];
}
return d;
}
/** Matrix Addition **/
template<class T> inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B)
{
int N,M ; SizeSame(A,B,N,M);
Matrix<T> C; Resize(C,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
C[i][j] = A[i][j] + B[i][j];
}
}
return C;
}
/** Matrix Subtraction **/
template<class T> inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){
int N,M ; SizeSame(A,B,N,M);
Matrix<T> C; Resize(C,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
C[i][j] = A[i][j] - B[i][j];
}}
return C;
}
/** Matrix scalar multiplication **/
template<class T> inline Matrix<T> operator* (Matrix<T> & A,T c){
int N,M; Size(A,N,M);
Matrix<T> C; Resize(C,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
C[i][j] = A[i][j]*c;
}}
return C;
}
/** Matrix Matrix multiplication **/
template<class T> inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){
int K,L,N,M;
Size(A,K,L);
Size(B,N,M); assert(L==N);
Matrix<T> C; Resize(C,K,M);
for(int i=0;i<K;i++){
for(int j=0;j<M;j++){
T sum = 0.0;
for(int k=0;k<N;k++) sum += A[i][k]*B[k][j];
C[i][j] =sum;
}
}
return C;
}
/** Matrix Vector multiplication **/
template<class T> inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){
int M,N,K;
Size(A,N,M);
Size(B,K); assert(K==M);
Vector<T> C; Resize(C,N);
for(int i=0;i<N;i++){
T sum = 0.0;
for(int j=0;j<M;j++) sum += A[i][j]*B[j];
C[i] = sum;
}
return C;
}
/** Some version of Matrix norm **/
/*
inline T Norm(){ // this is not a usual L2 norm
T norm = 0;
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
norm += abs(A[i][j]);
}}
return norm;
}
*/
/** Some version of Matrix norm **/
template<class T> T LargestDiag(Matrix<T> &A)
{
int dim ; SizeSquare(A,dim);
T ld = abs(A[0][0]);
for(int i=1;i<dim;i++){
T cf = abs(A[i][i]);
if(abs(cf) > abs(ld) ){ld = cf;}
}
return ld;
}
/** Look for entries on the leading subdiagonal that are smaller than 'small' **/
template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small)
{
int dim; SizeSquare(A,dim);
for(int l = dim - 1 - offset; l >= 1; l--) {
if((U)abs(A[l][l - 1]) < (U)small) {
A[l][l-1]=(U)0.0;
return l;
}
}
return 0;
}
/** Look for entries on the leading subdiagonal that are smaller than 'small' **/
template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small)
{
int dim; SizeSquare(A,dim);
for(int l = dim - 1 - offset; l >= 1; l--) {
if((U)abs(A[l][l - 1]) < (U)small) {
A[l][l - 1] = (U)0.0;
A[l - 1][l] = (U)0.0;
return l;
}
}
return 0;
}
/**Assign a submatrix to a larger one. NB: Vectors of Vectors are transposes of the matrices they represent **/
template<class T>
void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
{
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
A[i][j] = S[i - row_st][j - col_st];
}
}
}
/**Get a square submatrix**/
template <class T>
Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
Matrix<T> H; Resize(row_end - row_st,col_end-col_st);
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
H[i-row_st][j-col_st]=A[i][j];
}}
return H;
}
/** compute b_i A_ij b_j **/ // surprised no Conj
template<class T> T proj(Matrix<T> A, Vector<T> B){
int dim; SizeSquare(A,dim);
int dimB; Size(B,dimB);
assert(dimB==dim);
T C = 0;
for(int i=0;i<dim;i++){
T sum = 0.0;
for(int j=0;j<dim;j++){
sum += A[i][j]*B[j];
}
C += B[i]*sum; // No conj?
}
return C;
}
/*
*************************************************************
*
* Matrix Vector products
*
*************************************************************
*/
// Instead make a linop and call my CG;
/// q -> q Q
template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q)
{
int M; SizeSquare(Q,M);
int N; Size(q,N);
assert(M==N);
times(q,Q,N);
}
/// q -> q Q
template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q, int N)
{
GridBase *grid = q[0]._grid;
int M; SizeSquare(Q,M);
int K; Size(q,K);
assert(N<=M);
assert(N<=K);
Vector<Fermion> S(N,grid );
for(int j=0;j<N;j++){
S[j] = zero;
for(int k=0;k<N;k++){
S[j] = S[j] + q[k]* Q[k][j];
}
}
for(int j=0;j<q.size();j++){
q[j] = S[j];
}
}
#endif

View File

@ -2,13 +2,11 @@
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Grid.h Source file: ./lib/algorithms/iterative/MatrixUtils.h
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -27,34 +25,51 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// #ifndef GRID_MATRIX_UTILS_H
// Grid.h #define GRID_MATRIX_UTILS_H
// simd
//
// Created by Peter Boyle on 09/05/2014.
// Copyright (c) 2014 University of Edinburgh. All rights reserved.
//
#ifndef GRID_BASE_H namespace Grid {
#define GRID_BASE_H
#include <Grid/GridStd.h> namespace MatrixUtils {
#include <Grid/perfmon/Timer.h> template<class T> inline void Size(Matrix<T>& A,int &N,int &M){
#include <Grid/perfmon/PerfCount.h> N=A.size(); assert(N>0);
#include <Grid/log/Log.h> M=A[0].size();
#include <Grid/allocator/AlignedAllocator.h> for(int i=0;i<N;i++){
#include <Grid/simd/Simd.h> assert(A[i].size()==M);
#include <Grid/serialisation/Serialisation.h> }
#include <Grid/threads/Threads.h> }
#include <Grid/util/Util.h>
#include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h>
#include <Grid/tensors/Tensors.h>
#include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
template<class T> inline void SizeSquare(Matrix<T>& A,int &N)
{
int M;
Size(A,N,M);
assert(N==M);
}
template<class T> inline void Fill(Matrix<T>& A,T & val)
{
int N,M;
Size(A,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
A[i][j]=val;
}}
}
template<class T> inline void Diagonal(Matrix<T>& A,T & val)
{
int N;
SizeSquare(A,N);
for(int i=0;i<N;i++){
A[i][i]=val;
}
}
template<class T> inline void Identity(Matrix<T>& A)
{
Fill(A,0.0);
Diagonal(A,1.0);
}
};
}
#endif #endif

View File

@ -0,0 +1,15 @@
- ConjugateGradientMultiShift
- MCR
- Potentially Useful Boost libraries
- MultiArray
- Aligned allocator; memory pool
- Remez -- Mike or Boost?
- Multiprecision
- quaternions
- Tokenize
- Serialization
- Regex
- Proto (ET)
- uBlas

View File

@ -0,0 +1,122 @@
#include <math.h>
#include <stdlib.h>
#include <vector>
struct Bisection {
static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
{
int i,j;
std::vector<RealD> evec1(row_num+3);
std::vector<RealD> evec2(row_num+3);
RealD eps2;
ALPHA[1]=0.;
BETA[1]=0.;
// NB: A below refers to matrix storage assumed to be defined elsewhere
for(i=0;i<row_num-1;i++) {
ALPHA[i+1] = A[i*(row_num+1)].real();
BETA[i+2] = A[i*(row_num+1)+1].real();
}
ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
bisec(ALPHA,BETA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
bisec(ALPHA,BETA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
// Do we really need to sort here?
int begin=1;
int end = row_num;
int swapped=1;
while(swapped) {
swapped=0;
for(i=begin;i<end;i++){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
end--;
for(i=end-1;i>=begin;i--){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
begin++;
}
for(i=0;i<row_num;i++){
for(j=0;j<row_num;j++) {
if(i==j) H[i*row_num+j]=evec2[i+1];
else H[i*row_num+j]=0.;
}
}
}
static void bisec(std::vector<RealD> &c,
std::vector<RealD> &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh,
std::vector<RealD> &x,
RealD &eps2)
{
std::vector<RealD> wu(n+2);
RealD h,q,x1,xu,x0,xmin,xmax;
int i,a,k;
b[1]=0.0;
xmin=c[n]-fabs(b[n]);
xmax=c[n]+fabs(b[n]);
for(i=1;i<n;i++){
h=fabs(b[i])+fabs(b[i+1]);
if(c[i]+h>xmax) xmax= c[i]+h;
if(c[i]-h<xmin) xmin= c[i]-h;
}
xmax *=2.;
eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
if(eps1<=0.0) eps1=eps2;
eps2=0.5*eps1+7.0*(eps2);
x0=xmax;
for(i=m1;i<=m2;i++){
x[i]=xmax;
wu[i]=xmin;
}
for(k=m2;k>=m1;k--){
xu=xmin;
i=k;
do{
if(xu<wu[i]){
xu=wu[i];
i=m1-1;
}
i--;
}while(i>=m1);
if(x0>x[k]) x0=x[k];
while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){
x1=(xu+x0)/2;
a=0;
q=1.0;
for(i=1;i<=n;i++){
q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh);
if(q<0) a++;
}
// printf("x1=%e a=%d\n",x1,a);
if(a<k){
if(a<m1){
xu=x1;
wu[m1]=x1;
}else {
xu=x1;
wu[a+1]=x1;
if(x[a]>x1) x[a]=x1;
}
}else x0=x1;
}
x[k]=(x0+xu)/2;
}
}
};
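A hedged sketch of bisec() on a 1-indexed symmetric tridiagonal problem (diagonal in c[1..n], off-diagonal in b[2..n]); note that get_eig2 above additionally assumes external matrix storage A and H that this fragment does not define:
// int n = 3;                                // tridiagonal {2,1; 1,2,1; 1,2}
// std::vector<RealD> c(n+1,2.0), b(n+1,0.0), x(n+1,0.0);
// b[2] = 1.0; b[3] = 1.0;                   // b[1] is forced to zero inside
// RealD eps2;
// Bisection::bisec(c, b, n, 1, n, 1.0e-12, 1.0e-12, x, eps2);
// // x[1..3] ~ 2-sqrt(2), 2, 2+sqrt(2), ascending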

View File

@ -0,0 +1 @@

View File

@ -6,9 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -50,9 +49,10 @@ public:
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
@ -62,12 +62,13 @@ public:
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
std::vector<int> _slice_block; // subslice information
std::vector<int> _slice_stride;
std::vector<int> _slice_nblock;
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend  ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
// Might need these at some point
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
// std::vector<int> _lend;   // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
public:
@ -98,7 +99,7 @@ public:
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
@ -120,11 +121,6 @@ public:
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
@ -133,7 +129,6 @@ public:
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
@ -151,15 +146,15 @@ public:
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
@ -174,50 +169,26 @@ public:
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd    (void) const { return _ndimension;};
inline const std::vector<int> LocalStarts(void)             { return _lstart; };
inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
@ -225,9 +196,9 @@ public:
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld  = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@ -236,16 +207,16 @@ public:
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
std::vector<int> cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
}
*/
i_idx= iIndex(lcoor);
i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
@ -267,7 +238,7 @@ public:
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
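
All of the conversions above are instances of the same lexicographic fold over the dimension vector; here is a small self-contained sketch of the forward and inverse maps (free functions standing in for the Lexicographic helpers, with dimension 0 running fastest):

#include <vector>
// idx = sum_mu coor[mu] * prod_{nu<mu} dims[nu]
static int IndexFromCoor(const std::vector<int> &coor,const std::vector<int> &dims){
  int idx=0, mult=1;
  for(size_t mu=0;mu<dims.size();mu++){ idx += mult*coor[mu]; mult *= dims[mu]; }
  return idx;
}
// Inverse map: peel off one dimension at a time.
static void CoorFromIndex(std::vector<int> &coor,int idx,const std::vector<int> &dims){
  coor.resize(dims.size());
  for(size_t mu=0;mu<dims.size();mu++){ coor[mu] = idx % dims[mu]; idx /= dims[mu]; }
}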

View File

@ -76,8 +76,6 @@ public:
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
@ -96,10 +94,8 @@ public:
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d]   = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {

View File

@ -151,8 +151,6 @@ public:
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
@ -171,8 +169,6 @@ public:
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
}
_ldimensions[d] = _gdimensions[d]/_processors[d];
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d]   = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];

View File

View File

@ -25,8 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/Grid.h>
namespace Grid {
///////////////////////////////////////////////////////////////
@ -34,7 +33,6 @@ namespace Grid {
///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
/////////////////////////////////
// Alloc, free shmem region
@ -60,7 +58,6 @@ void CartesianCommunicator::ShmBufferFreeAll(void) {
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::Dimensions(void) { return _ndimension; };
int CartesianCommunicator::IsBoss(void)     { return _processor==0; };
int CartesianCommunicator::BossRank(void)   { return 0; };
int CartesianCommunicator::ThisRank(void)   { return _processor; };
@ -91,10 +88,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
@ -102,7 +96,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int bytes)
{
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{

View File

@ -116,12 +116,6 @@ class CartesianCommunicator {
// Implemented in Communicator_base.C // Implemented in Communicator_base.C
///////////////////////////////// /////////////////////////////////
static void * ShmCommBuf; static void * ShmCommBuf;
// Isend/Irecv/Wait, or Sendrecv blocking
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
size_t heap_top; size_t heap_top;
size_t heap_bytes; size_t heap_bytes;
@ -148,15 +142,12 @@ class CartesianCommunicator {
int RankFromProcessorCoor(std::vector<int> &coor); int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor); void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int Dimensions(void) ;
int IsBoss(void) ; int IsBoss(void) ;
int BossRank(void) ; int BossRank(void) ;
int ThisRank(void) ; int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ; const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ; const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ; int ProcessorCount(void) ;
int NodeCount(void) ;
int RankCount(void) ;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
@ -177,8 +168,6 @@ class CartesianCommunicator {
void GlobalSumVector(ComplexF *c,int N); void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c); void GlobalSum(ComplexD &c);
void GlobalSumVector(ComplexD *c,int N); void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
@ -211,7 +200,7 @@ class CartesianCommunicator {
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,

View File

@ -25,9 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/Grid.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
@ -41,13 +39,9 @@ MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
MPI_Init(argc,argv);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
@ -83,14 +77,6 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
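
The GlobalXOR reductions being removed here are thin wrappers over MPI_Allreduce with the MPI_BXOR operation; their typical use is cross-rank checksum agreement. A self-contained sketch of the same pattern (the checksum value is illustrative only):

#include <mpi.h>
#include <cstdint>
#include <cstdio>
// All ranks receive the XOR of every rank's input word.
static void global_xor(uint32_t &u, MPI_Comm comm) {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,comm);
  if (ierr != MPI_SUCCESS) MPI_Abort(comm,ierr);
}
int main(int argc,char **argv){
  MPI_Init(&argc,&argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  uint32_t csum = 0xdeadbeefu ^ (uint32_t)rank; // stand-in for a data checksum
  global_xor(csum,MPI_COMM_WORLD);
  if(rank==0) printf("global xor = %08x\n",csum);
  MPI_Finalize();
  return 0;
}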
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
@ -166,34 +152,24 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
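
The two code paths above implement the same halo exchange with different progress semantics: the Concurrent policy posts non-blocking Isend/Irecv pairs and completes them later, while the Sequential policy hands the CPU to a blocking MPI_Sendrecv immediately. A stripped-down sketch of the pattern, outside any Grid class (names are illustrative):

#include <mpi.h>
#include <vector>
#include <cassert>
enum Policy { Concurrent, Sequential };
// One exchange; 'reqs' collects pending requests in the Concurrent case.
void exchange(Policy p, std::vector<MPI_Request> &reqs,
              void *xmit, int dest, void *recv, int from,
              int bytes, int me, MPI_Comm comm) {
  if (p == Concurrent) {
    MPI_Request xrq, rrq;
    int ierr  = MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &rrq);
    ierr     |= MPI_Isend(xmit, bytes, MPI_CHAR, dest, me,   comm, &xrq);
    assert(ierr == 0);
    reqs.push_back(xrq);
    reqs.push_back(rrq);
  } else {
    // Blocking path: MPI progresses the message before returning.
    int ierr = MPI_Sendrecv(xmit, bytes, MPI_CHAR, dest, me,
                            recv, bytes, MPI_CHAR, from, from,
                            comm, MPI_STATUS_IGNORE);
    assert(ierr == 0);
  }
}
void complete(std::vector<MPI_Request> &reqs) {
  if (reqs.empty()) return;
  assert(MPI_Waitall((int)reqs.size(), &reqs[0], MPI_STATUSES_IGNORE) == 0);
  reqs.clear();
}

The design point is latency hiding: the Concurrent form lets the caller compute between exchange() and complete(), at the cost of requiring an MPI that makes asynchronous progress.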
void CartesianCommunicator::Barrier(void)

View File

@ -1,4 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -25,23 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/Grid.h>
#include <mpi.h>
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
//#include <zlib.h>
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
@ -64,11 +50,6 @@ std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
int CartesianCommunicator::RankCount(void) { return WorldSize;};
#undef FORCE_COMMS
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
@ -76,9 +57,6 @@ void *CartesianCommunicator::ShmBufferSelf(void)
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
@ -87,13 +65,7 @@ void *CartesianCommunicator::ShmBuffer(int rank)
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = GroupRanks[rank];
assert(gpeer!=ShmRank); // never send to self
assert(rank!=WorldRank);// never send to self
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
@ -104,27 +76,16 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
// mtrace();
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
MPI_Init(argc,argv);
assert (provided == MPI_THREAD_MULTIPLE);
}
Grid_quiesce_nodes();
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
if ( WorldRank == 0 ) {
std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl;
}
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
@ -170,6 +131,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
@ -179,6 +141,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
@ -189,113 +152,38 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(ShmComm);
ShmCommBuf = 0;
ShmCommBufs.resize(ShmSize);
#if 1
char shm_name [NAME_MAX];
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
}
MPI_Barrier(ShmComm);
if ( ShmRank != 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
}
#else
std::vector<int> shmids(ShmSize);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key   = 0x4545 + r;
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
int errsv = errno;
printf("Errno %d\n",errsv);
perror("shmget");
exit(1);
}
printf("shmid: 0x%x\n", shmids[r]);
}
}
MPI_Barrier(ShmComm);
MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (ShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
printf("shmaddr: %p\n", ShmCommBufs[r]);
}
MPI_Barrier(ShmComm);
// Mark for clean up
for(int r=0;r<ShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(ShmComm);
#endif
ShmCommBuf = ShmCommBufs[ShmRank];
MPI_Barrier(ShmComm);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GroupRank;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GroupRank);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
MPI_Barrier(ShmComm);
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
// KNL hack -- force to numa-domain 1 in flat
#if 0
//#include <numaif.h>
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}
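
The replacement allocator above swaps MPI_Win_allocate_shared for POSIX shared memory: rank 0 of each node creates one named region per local rank with shm_open/ftruncate, every local rank then maps all of them, and the 0x5A5A5A check pattern verifies the mappings really alias the same pages. A minimal single-region sketch of the mechanism (the region name is hypothetical; on Linux link with -lrt):

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cassert>

// Creator: make and map a named shared region.
void *create_region(const char *name, size_t size) {
  shm_unlink(name);                                // drop any stale segment
  int fd = shm_open(name, O_RDWR | O_CREAT, 0666);
  assert(fd >= 0);
  assert(ftruncate(fd, (off_t)size) == 0);
  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  assert(ptr != MAP_FAILED);
  close(fd);                                       // the mapping keeps it alive
  return ptr;
}
// Peer: attach to an existing region by name.
void *attach_region(const char *name, size_t size) {
  int fd = shm_open(name, O_RDWR, 0666);
  assert(fd >= 0);
  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  assert(ptr != MAP_FAILED);
  close(fd);
  return ptr;
}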
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
@ -304,7 +192,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< " with "<< ShmSize << " ranks-per-node "<<std::endl;
std::cout<< ShmSize << " with ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
@ -319,6 +207,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
@ -327,21 +216,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl<<std::flush;
else std::cout<<"}"<<std::endl;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor; // my coord
assert(std::abs(shift) <_processors[dim]);
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
@ -351,32 +242,28 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
dest = LexicographicToWorldRank[dest];
}
}// rank is world rank.
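
ShiftedRanks computes neighbour ranks by shifting one processor coordinate with periodic wrap-around and translating the result through the lexicographic-to-world table. The wrap itself is the usual add-the-modulus idiom, sketched here as a pure function (no Grid types):

#include <vector>
// Neighbour coordinate in dimension 'dim', shifted by 'shift' with periodic wrap.
// Adding procs[dim] before the modulus keeps the result non-negative for
// negative shifts, valid as long as |shift| < procs[dim] (hence the assert above).
std::vector<int> shifted_coor(const std::vector<int> &coor,
                              const std::vector<int> &procs,
                              int dim, int shift) {
  std::vector<int> out = coor;
  out[dim] = (coor[dim] + shift + procs[dim]) % procs[dim];
  return out;
}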
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
return rank;
}// rank is world rank
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
int lr=-1;
for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor
if( LexicographicToWorldRank[r]==rank) lr = r;
}
assert(lr!=-1);
Lexicographic::CoorFromIndex(coor,lr,_processors);
Lexicographic::CoorFromIndex(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;
communicator=communicator_world;
_ndimension = processors.size();
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
@ -388,22 +275,24 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
}
}
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
std::vector<int> WorldDims = processors;
ShmDims.resize  (_ndimension,1);
GroupDims.resize(_ndimension);
ShmCoor.resize  (_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%_ndimension;
}
@ -415,29 +304,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
GroupDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Verbose
////////////////////////////////////////////////////////////////
#if 0
std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl;
std::cout<< GridLogMessage << "SHM ";
for(int d=0;d<_ndimension;d++){
std::cout<< ShmDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage << "Group ";
for(int d=0;d<_ndimension;d++){
std::cout<< GroupDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage<<"World ";
for(int d=0;d<_ndimension;d++){
std::cout<< WorldDims[d] <<" ";
}
std::cout<< std::endl;
#endif
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
@ -451,57 +317,29 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
//
////////////////////////////////////////////////////////////////
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
for(int d=0;d<_ndimension;d++){
WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
}
_processor_coor = WorldCoor;
_processor = WorldRank;
int lexico;
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico]=WorldRank;
_processor = lexico;
///////////////////////////////////////////////////////////////////
// global sum Lexico to World mapping
///////////////////////////////////////////////////////////////////
int lexico;
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico] = WorldRank;
ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
for(int i=0;i<WorldSize;i++){
int wr = LexicographicToWorldRank[i];
// int wr = i;
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(wr,coor); // from world rank
int ck = RankFromProcessorCoor(coor);
assert(ck==wr);
if ( wr == WorldRank ) {
for(int j=0;j<coor.size();j++) {
assert(coor[j] == _processor_coor[j]);
}
}
/*
std::cout << GridLogMessage<< " Lexicographic "<<i;
std::cout << " MPI rank "<<wr;
std::cout << " Coor ";
for(int j=0;j<coor.size();j++) std::cout << coor[j];
std::cout<< std::endl;
*/
/////////////////////////////////////////////////////
// Check everyone agrees on everyone elses coords
/////////////////////////////////////////////////////
std::vector<int> mcoor = coor;
this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
for(int d = 0 ; d< _ndimension; d++) {
assert(coor[d] == mcoor[d]);
}
}
};
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -510,14 +348,6 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
@ -537,6 +367,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
@ -545,14 +377,10 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
@ -569,6 +397,7 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@ -577,29 +406,95 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
#if 0
this->StencilBarrier();
MPI_Request xrq;
MPI_Request rrq;
static int sequence;
int ierr;
int tag;
int check;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme   = GroupRanks[_processor];
sequence++;
char *from_ptr = (char *)ShmCommBufs[ShmRank];
int small = (bytes<MAX_MPI_SHM_BYTES);
typedef uint64_t T;
int words = bytes/sizeof(T);
assert(((size_t)bytes &(sizeof(T)-1))==0);
assert(gme == ShmRank);
if ( small && (gdest !=MPI_UNDEFINED) ) {
char *to_ptr = (char *)ShmCommBufs[gdest];
assert(gme != gdest);
T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
} else {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
this->StencilBarrier();
if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr;
T *op = (T *)recv;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&from_ptr[bytes] ,&tag  ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
assert(tag==from);
} else {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
this->StencilBarrier();
#else
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
#endif
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
@ -610,63 +505,57 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
MPI_Request rrq;
int ierr;
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme   = GroupRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme  == ShmRank);
double off_node_bytes=0.0;
#ifdef FORCE_COMMS
gdest = MPI_UNDEFINED;
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list);
}
return off_node_bytes;
MPI_Request rrq;
int ierr;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme   = GroupRanks[_processor];
assert(gme == ShmRank);
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
SendToRecvFromComplete(list);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier  (ShmComm);
MPI_Win_sync (ShmWindow);
MPI_Barrier  (ShmComm);
MPI_Win_sync (ShmWindow);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
@ -676,11 +565,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator);
assert(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,

View File

@ -27,7 +27,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
//#include <numaif.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
@ -43,27 +42,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
typedef sem_t *Grid_semaphore;
#error /*THis is deprecated*/
#if 0
#define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 );
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
#else
#define SEM_INIT(S) ;
#define SEM_INIT_EXCL(S) ;
#define SEM_POST(S) ;
#define SEM_WAIT(S) ;
#endif
#include <sys/mman.h>
namespace Grid {
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
struct Descriptor {
uint64_t buf;
@ -71,12 +62,6 @@ struct Descriptor {
int rank;
int tag;
int command;
uint64_t xbuf;
uint64_t rbuf;
int xtag;
int rtag;
int src;
int dest;
MPI_Request request;
};
@ -109,14 +94,18 @@ public:
void SemInit(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
// printf("SEM_NAME: %s \n",sem_name);
SEM_INIT(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
// printf("SEM_NAME: %s \n",sem_name);
SEM_INIT(sem_tail);
}
void SemInitExcl(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
// printf("SEM_INIT_EXCL: %s \n",sem_name);
SEM_INIT_EXCL(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
// printf("SEM_INIT_EXCL: %s \n",sem_name);
SEM_INIT_EXCL(sem_tail);
}
void WakeUpDMA(void) {
@ -136,13 +125,6 @@ public:
while(1){
WaitForCommand();
// std::cout << "Getting command "<<std::endl;
#if 0
_mm_monitor((void *)&state->head,0,0);
int s=state->start;
if ( s != state->head ) {
_mm_mwait(0,0);
}
#endif
Event();
}
}
@ -150,7 +132,6 @@ public:
int Event (void) ;
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
void WaitAll() {
// std::cout << "Queueing WAIT command "<<std::endl;
@ -160,7 +141,7 @@ public:
// std::cout << "Waiting from semaphore "<<std::endl;
WaitForComplete();
// std::cout << "Checking FIFO is empty "<<std::endl;
while ( state->tail != state->head );
assert ( state->tail == state->head );
}
};
@ -215,12 +196,6 @@ public:
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
Slaves[slave].WakeUpDMA();
}
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
@ -251,28 +226,6 @@ public:
return;
};
static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
uint8_t * cxbuf = (uint8_t *) xbuf;
uint8_t * crbuf = (uint8_t *) rbuf;
static int rrp=0;
int procs = VerticalSize-1;
int myoff=0;
int mywork=bytes;
QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
rrp = rrp+1;
if ( rrp == (VerticalSize-1) ) rrp = 0;
}
static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
uint8_t * cxbuf = (uint8_t *) xbuf;
uint8_t * crbuf = (uint8_t *) rbuf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
}
};
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
@ -322,7 +275,6 @@ std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
int CartesianCommunicator::NodeCount(void) { return HorizontalSize;};
int MPIoffloadEngine::ShmSetup = 0;
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
@ -418,22 +370,12 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
ftruncate(fd, size);
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( VerticalShmBufs[r] == MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
/*
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
*/
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
check[0] = WorldRank;
check[1] = r;
@ -462,7 +404,7 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
assert(check[0]== WorldRank);
assert(check[1]== r);
// std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
#endif
@@ -600,8 +542,6 @@ int Slave::Event (void) {
  static int head_last;
  static int start_last;
  int ierr;
-  MPI_Status stat;
-  static int i=0;
  ////////////////////////////////////////////////////
  // Try to advance the start pointers
@@ -610,6 +550,11 @@ int Slave::Event (void) {
  if ( s != state->head ) {
    switch ( state->Descrs[s].command ) {
    case COMMAND_ISEND:
+      /*
+      std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
+               << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
+               << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
+      */
      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
                       state->Descrs[s].bytes,
                       MPI_CHAR,
@@ -623,6 +568,11 @@ int Slave::Event (void) {
      break;
    case COMMAND_IRECV:
+      /*
+      std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
+               << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
+               << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
+      */
      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
                     state->Descrs[s].bytes,
                     MPI_CHAR,
@@ -638,32 +588,10 @@ int Slave::Event (void) {
      return 1;
      break;
-    case COMMAND_SENDRECV:
-      // fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
-      ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
-                        (void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
-                        MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
-      assert(ierr==0);
-      // fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
-      // MPI_Barrier(MPIoffloadEngine::HorizontalComm);
-      // fprintf(stderr,"Barrier\n");
-      i++;
-      state->start = PERI_PLUS(s);
-      return 1;
-      break;
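
The deleted COMMAND_SENDRECV case folded both directions into one blocking MPI_Sendrecv and bumped each tag by i*10 per invocation, so back-to-back exchanges between the same pair of ranks cannot cross-match. Schematically (the wrapper and its name are illustrative only):

    #include <mpi.h>
    #include <cassert>

    // One combined exchange; `seq` plays the role of the deleted static counter i.
    void exchange_once(void *xbuf, void *rbuf, int bytes,
                       int dest, int xtag, int src, int rtag,
                       int seq, MPI_Comm comm)
    {
      int ierr = MPI_Sendrecv(xbuf, bytes, MPI_CHAR, dest, xtag + seq * 10,
                              rbuf, bytes, MPI_CHAR, src,  rtag + seq * 10,
                              comm, MPI_STATUS_IGNORE);
      assert(ierr == MPI_SUCCESS);
    }
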
    case COMMAND_WAITALL:
      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
-        if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
-          MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
-        }
+        MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
      };
      s=PERI_PLUS(s);
      state->start = s;
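
The start/head/tail bookkeeping in Event() is one side of a single-producer, single-consumer ring. A stripped-down sketch of the protocol (the depth of 48 and the member names are assumptions):

    // Producer publishes at head; the slave issues from start and retires at
    // tail, so COMMAND_WAITALL only has to wait on the tail..start window.
    struct RingSketch {
      static const int pool = 48;                 // assumed queue depth
      volatile int head = 0, start = 0, tail = 0;
      static int peri_plus(int a) { return (a + 1) % pool; }
      void push() {                               // producer side
        int next = peri_plus(head);
        while (tail == next) ;                    // spin while the ring is full
        head = next;                              // publish the descriptor
      }
    };

Only head is written by the producer and only tail by the consumer, so no lock is needed; a weakly ordered machine would additionally need the msync/fence the original comments allude to.
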
@@ -685,45 +613,6 @@ int Slave::Event (void) {
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////
-void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
-{
-  int head =state->head;
-  int next = PERI_PLUS(head);
-  // Set up descriptor
-  int worldrank;
-  int hashtag;
-  MPI_Comm communicator;
-  MPI_Request request;
-  uint64_t relative;
-  relative = (uint64_t)xbuf - base;
-  state->Descrs[head].xbuf = relative;
-  relative= (uint64_t)rbuf - base;
-  state->Descrs[head].rbuf = relative;
-  state->Descrs[head].bytes = bytes;
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
-  state->Descrs[head].dest = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].xtag = hashtag;
-  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
-  state->Descrs[head].src = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
-  state->Descrs[head].rtag = hashtag;
-  state->Descrs[head].command= COMMAND_SENDRECV;
-  // Block until FIFO has space
-  while( state->tail==next );
-  // Msync on weak order architectures
-  // Advance pointer
-  state->head = next;
-};
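
Note what the deleted QueueSendRecv stored in its descriptor: buffer offsets relative to base, not raw pointers, because the shared window is mapped at a different virtual address in each process. The convention in miniature (helper names invented):

    #include <cstdint>

    // Pointers are only meaningful inside one process; offsets travel.
    inline uint64_t to_offset(const void *p, uint64_t base) {
      return (uint64_t)p - base;
    }
    inline void *from_offset(uint64_t off, uint64_t base) {
      return (void *)(base + off);
    }
    // Producer:  Descrs[head].xbuf = to_offset(xbuf, base);
    // Consumer:  MPI_Isend(from_offset(Descrs[s].buf, base), ...);
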
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
  /////////////////////////////////////////
@@ -923,22 +812,19 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
  assert(from!=_processor);
  assert(dest!=_processor);
-  MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
-  //MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
-  //MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
-  //MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
+  MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
+  MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  MPIoffloadEngine::WaitAll();
-  //this->Barrier();
}
-void CartesianCommunicator::StencilBarrier(void) { }
+void CartesianCommunicator::StencilBarrier(void)
+{
+}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{

Some files were not shown because too many files have changed in this diff.