Const correctness patch

Hadrons: 3-index RO access to Eigen disk vector
Merge branch 'feature/contractor' into develop
2026-05-12 13:14:31 +01:00 · 2018-11-19 10:38:36 +00:00 · 2018-10-16 14:44:14 +01:00 · 2018-10-16 11:29:37 +01:00 · 2018-10-15 17:25:08 +01:00 · 2018-10-15 15:51:45 +01:00
1213 changed files with 204238 additions and 24879 deletions
@@ -1,8 +1,116 @@
-# Exclude directories
+# Compiled Object files #
-_site
+#########################
-.sass-cache
+*.slo
-.jekyll-metadata
+*.lo
-pdf
+*.o
 *.obj
-# Exclude backup files
+# Editor files #
 ################
 *~
 *#
 *.sublime-*
 # Precompiled Headers #
 #######################
 *.gch
 *.pch
 # Compiled Dynamic libraries #
 ##############################
 *.so
 *.dylib
 *.dll
 # Fortran module files #
 ########################
 *.mod
 # Compiled Static libraries #
 #############################
 *.lai
 *.la
 *.a
 *.lib
 # Executables #
 ###############
 *.exe
 *.out
 *.app
 # http://www.gnu.org/software/automake #
 ########################################
 Makefile.in
 Makefile
 Config.h
 Config.h.in
 config.log
 config.status
 .deps
 Make.inc
 eigen.inc
 Eigen.inc
 # http://www.gnu.org/software/autoconf #
 ########################################
 autom4te.cache
 aclocal.m4
 compile
 configure
 depcomp
 install-sh
 missing
 stamp-h1
 config.sub
 config.guess
 INSTALL
 .dirstamp
 ltmain.sh
 # Logs and databases #
 ######################
 *.log
 *.sql
 *.sqlite
 # OS generated files #
 ######################
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
 .dirstamp
 # build directory #
 ###################
 build*/*
 # IDE related files #
 #####################
 *.xcodeproj/*
 build.sh
 .vscode
 *.code-workspace
 # Eigen source #
 ################
 Grid/Eigen
 Eigen/*
 # libtool macros #
 ##################
 m4/lt*
 m4/libtool.m4
 # github pages #
 ################
 gh-pages/
 # generated sources #
 #####################
 Grid/qcd/spin/gamma-gen/*.h
 Grid/qcd/spin/gamma-gen/*.cc
@@ -0,0 +1,61 @@
 # Travis CI configuration: builds the project with clang on macOS, once per
 # floating-point precision (PREC=single and PREC=double).
 language: cpp
 # Preserve the clang/ directory between builds.
 cache:
  directories:
    - clang
 # Build matrix: two osx jobs that differ only in PREC.
 matrix:
  include:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=single
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=double
 before_install:
    # Remember the checkout root; used below to locate the clang/ directory.
    - export GRIDDIR=`pwd`
    # Linux + clang only: download and unpack a clang toolchain into clang/
    # unless clang/bin already exists, then put it on PATH / LD_LIBRARY_PATH.
    # NOTE(review): the matrix above lists only osx jobs and CLANG_LINK is not
    # defined in this file, so these three steps do nothing as configured.
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    # macOS: refresh Homebrew and install the libmpc and openssl packages.
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
 install:
    # CWD is the directory the later steps build their paths from.
    - export CWD=`pwd`
    - echo $CWD
    # Append VERSION to the compiler names.
    # NOTE(review): VERSION is not set anywhere in this file; presumably it
    # expands to an empty string here - confirm.
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    # Log the toolchain in use to make build failures easier to diagnose.
    - echo $PATH
    - which autoconf
    - autoconf  --version
    - which automake
    - automake  --version
    - which $CC
    - $CC  --version
    - which $CXX
    - $CXX --version
    # macOS: add /usr/local/lib to the linker search path and pass the openssl
    # location to the final configure step through EXTRACONF.
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
 script:
    - ./bootstrap.sh
    - mkdir build
    - cd build
    # Build c-lime 1.3.2 from source and install it under build/lime/install.
    - mkdir lime
    - cd lime
    - mkdir build
    - cd build
    # Fetch over HTTPS: the tarball is configured, built and installed below,
    # so it should not be downloaded over an unauthenticated connection.
    - wget https://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
    - tar xf lime-1.3.2.tar.gz
    - cd lime-1.3.2
    - ./configure --prefix=$CWD/build/lime/install
    - make -j4
    - make install
    # Configure and build the project itself against the lime install above,
    # using the precision selected by the matrix entry (PREC).
    - cd $CWD/build
    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4
    # Run one benchmark single-threaded, then the test suite.
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
@@ -0,0 +1,4 @@
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@MacBook-Pro.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -1,571 +0,0 @@
 ## [3.4.8](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.8)
 ### Enhancements
 - Improve type readability for larger viewports by bumping up base `font-size`. [#533](https://github.com/mmistakes/minimal-mistakes/issues/533)
 - Update Portuguese localized UI text. [#541](https://github.com/mmistakes/minimal-mistakes/pull/541)
 - Add `page.title` and via parameter to Twitter share link. [#538](https://github.com/mmistakes/minimal-mistakes/pull/538)
 ### Bug Fixes
 - Fix Last.fm author profile URL. [#540](https://github.com/mmistakes/minimal-mistakes/pull/540)
 ### Maintenance
 - Move Brazilian Portuguese localized text under `pt-BR` key.
 ## [3.4.7](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.7)
 ### Enhancements
 - Add `layout` based and user-defined class names to `<body>` element for added CSS hooks. [#526](https://github.com/mmistakes/minimal-mistakes/pull/526)
 - Add simplified Chinese localized UI text. [#532](https://github.com/mmistakes/minimal-mistakes/pull/532)
 ### Bug Fixes
 - Remove duplicate include of `base_path` in category-list.html [#522](https://github.com/mmistakes/minimal-mistakes/pull/522)
 ## [3.4.6](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.6)
 ### Enhancements
 - Add Italian "comments" related localized UI text. [#514](https://github.com/mmistakes/minimal-mistakes/pull/514)
 ### Bug Fixes
 - Disable `compress` HTML layout by default. To enable add `layout: compress` to `_layouts/default.html`.
 ## [3.4.5](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.5)
 ### Enhancements
 - Improve line numbered code block styling when using `{% highlight linenos %}` tag. [#513](https://github.com/mmistakes/minimal-mistakes/issues/513)
 - Add English fallback to "Follow" button label. [#496](https://github.com/mmistakes/minimal-mistakes/pull/496)
 ### Bug Fixes
 - Fix Firefox alignment issues with code blocks generated with the `{% highlight %}` tag. [#512](https://github.com/mmistakes/minimal-mistakes/issues/512)
 ### Maintenance
 - Clarified comment for `author.stackoverflow` value used in author sidebar links. [#487](https://github.com/mmistakes/minimal-mistakes/pull/487)
 - Add list of localized text strings. [#488](https://github.com/mmistakes/minimal-mistakes/pull/488)
 - Add `{% highlight %}` code block examples to demo site.
 - Add documentation for using custom sidebar navigation menus. [#476](https://github.com/mmistakes/minimal-mistakes/issues/476)
 ## [3.4.4](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.4)
 ### Enhancements
 - Add French "comments" related localized UI text. [#472](https://github.com/mmistakes/minimal-mistakes/pull/472)
 ### Bug Fixes
 - Exclude `vendor` in Jekyll config file.
 - Fix Liquid syntax error for offending parenthesis. [#479](https://github.com/mmistakes/minimal-mistakes/issues/479)
 ### Maintenance
 - Update gems: `colorator` (1.1.0), `forwardable-extended` (2.6.0), `github-pages` (93), `jekyll` (= 3.2.1), `minima` (= 1.0.1).
 ## [3.4.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.3)
 ### Enhancements
 - Make ["honeypot" `input`](https://github.com/mmistakes/minimal-mistakes/commit/06a8249a69a37dddda7e2a5bfbe32056c1a9a607) in Staticman comment form less obvious to spam bots
 - Add padding to `.highlight` code blocks to better [align `overflow` scrollbar](https://github.com/mmistakes/minimal-mistakes/commit/e4abec0a6f7f8cff72505ca0754615df294fd5b3) to the bottom.
 - Add additional image options for Twitter card social sharing meta tags. [#466](https://github.com/mmistakes/minimal-mistakes/pull/466)
 - Add structured data markup for Staticman comments. [#458](https://github.com/mmistakes/minimal-mistakes/issues/458)
 ### Bug Fixes
 - Format `og:locale` tag with `_` instead of `-`. [#462](https://github.com/mmistakes/minimal-mistakes/issues/462)
 ### Maintenance
 - Add note to docs about using `url: http://localhost:4000` when working locally.
 ## [3.4.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.2)
 ### Enhancements
 - Improve UX of static comment forms. [#448](https://github.com/mmistakes/minimal-mistakes/issues/448)
 ## [3.4.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.1)
 ### Enhancements
 - Add `staticman.filename` configuration with UNIX timestamp for sorting data files. example ~> `comment-1470943149`.
 ### Bug Fixes
 - Don't add `<a>` to author name if URL is blank.
 ## [3.4.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.4.0)
 ### Enhancements
 - Support static-based commenting via [Staticman](https://staticman.net/) for sites hosted with GitHub Pages. [#424](https://github.com/mmistakes/minimal-mistakes/issues/424)
 ## [3.3.7](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.7)
 ### Bug Fixes
 - Re-enabled Jekyll plugins in `_config.yml` in case they aren't autoloaded in `Gemfile`. [#417](https://github.com/mmistakes/minimal-mistakes/issues/417)
 ### Enhancements
 - Fallback to `site.github.url` for use in `{{ base_path }}` when `site.url` is `nil`.
 - Replace Sass and Autoprefixer `npm` build scripts with [Jekyll's built-in asset support](https://jekyllrb.com/docs/assets/). [#333](https://github.com/mmistakes/minimal-mistakes/issues/333)
 ### Maintenance
 - Document `site.repository` and its role with [`github-metadata`](https://github.com/jekyll/github-metadata) gem.
 - Add sample [archive page with content](https://mmistakes.github.io/minimal-mistakes/archive-layout-with-content/) for testing styles on demo site.
 ## [3.3.6](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.6)
 ### Bug Fixes
 - Fix blank `site.teaser` bug. [#412](https://github.com/mmistakes/minimal-mistakes/issues/412)
 ## [3.3.5](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.5)
 ### Enhancements
 - Add English default text `site.locale` strings. [#407](https://github.com/mmistakes/minimal-mistakes/issues/407)
 - Add Portuguese localized UI text. [#411](https://github.com/mmistakes/minimal-mistakes/pull/411)
 - Add Italian localized UI text. [#409](https://github.com/mmistakes/minimal-mistakes/pull/409)
 ### Maintenance
 - Remove unused Google AdSense variables in `_config.yml`. [#404](https://github.com/mmistakes/minimal-mistakes/issues/404)
 - Update `Gemfile` instructions for using `github-pages` vs. native `jekyll` gems.
 - Disable `gems:` in `_config.yml` and enable plugins with Bundler instead.
 - Add `repository` to `_config.yml` to suppress GitHub Pages error `Liquid Exception: No repo name found.`
 ## [3.3.4](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.4)
 ### Enhancements
 - Add support for configurable feed URL to use a service like FeedBurner instead of linking directly to `feed.xml` in `<head>` and the site footer. [#378](https://github.com/mmistakes/minimal-mistakes/issues/378), [#379](https://github.com/mmistakes/minimal-mistakes/pull/379), [#406](https://github.com/mmistakes/minimal-mistakes/pull/406)
 - Add Turkish localized UI text. [#403](https://github.com/mmistakes/minimal-mistakes/pull/403)
 ### Maintenance
 - Update gems: `activesupport` (4.2.7), `ffi` (1.9.14), `github-pages` (88), `jekyll-redirect-from` (0.11.0), `jekyll-watch` (1.5.0).
 ## [3.3.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.3)
 ### Enhancements
 - Make footer stick to the bottom of the page.
 ### Bug Fixes
 - Fix `gallery` size bug [#402](https://github.com/mmistakes/minimal-mistakes/issues/402)
 ### Maintenance
 - Set default `lang` to `en`.
 ## [3.3.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.2)
 ### Bug Fixes
 - Fix JavaScript that triggers "sticky" sidebar to avoid layout issues on screen sizes < `1024px`. [#396](https://github.com/mmistakes/minimal-mistakes/issues/396)
 ## [3.3.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.3.1)
 ### Enhancements
 - Enable image popup on < 500px wide screens. [#385](https://github.com/mmistakes/minimal-mistakes/issues/385)
 - Indicate the relationship between component URLs in a paginated series by applying `rel="prev"` and `rel="next"` to pages that use `site.paginator`. [#253](https://github.com/mmistakes/minimal-mistakes/issues/253)
 - Improve link posts in archive listings. [#276](https://github.com/mmistakes/minimal-mistakes/issues/276)
 ### Maintenance
 - Update gems: `github-pages` (86), `ffi` 1.9.13, `jekyll-mentions` 1.1.3, and `rouge` 1.11.1
 - Fix note about custom sidebar content appearing below author profile. [#388](https://github.com/mmistakes/minimal-mistakes/issues/388)
 ## [3.2.13](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.13)
 ### Enhancements
 - Add English default UI text for Canada, Great Britain, and Australia. [#377](https://github.com/mmistakes/minimal-mistakes/issues/377)
 - Switch default locale from `en-US` to `en`.
 ## [3.2.12](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.12)
 ### Enhancements
 - Remove window width "magic number" from sticky sidebar check in `main.js` for improved flexibility. [#375](https://github.com/mmistakes/minimal-mistakes/pull/375)
 ### Bug Fixes
 - Fix author override conditional where a missing `authors.yml` would show broken sidebar content. Defaults to `site.author`. [#376](https://github.com/mmistakes/minimal-mistakes/pull/376)
 ## [3.2.11](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.11)
 ### Bug Fixes
 - Fix disappearing author sidebar links [#372](https://github.com/mmistakes/minimal-mistakes/issues/372)
 ### Maintenance
 - Update gems: `github-pages` (84), `jekyll-github-metadata` 2.0.2, and `kramdown` 1.11.1
 - Update vendor JavaScript: jQuery 1.12.4, Stickyfill.js 1.1.4
 - Update Font Awesome 4.6.3
 ## [3.2.10](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.10)
 ### Maintenance
 - Add `CONTRIBUTING.md`
 ## [3.2.9](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.9)
 ### Enhancements
 - Add support for [header overlay images](https://mmistakes.github.io/minimal-mistakes/docs/layouts/#header-overlay) for Open Graph images. [#358](https://github.com/mmistakes/minimal-mistakes/pull/358)
 ### Bug Fixes
 - Fix `Person` typo Schema.org type [#358](https://github.com/mmistakes/minimal-mistakes/pull/358)
 ### Maintenance
 - Update `github-pages` gem and dependencies.
 - Remove `minutes_read` to avoid awkward reading time wording [#356](https://github.com/mmistakes/minimal-mistakes/issues/356)
 ## [3.2.8](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.8)
 ### Bug Fixes
 - Remove `cursor: pointer` that appears on white-space surrounding author side list items and links. [#354](https://github.com/mmistakes/minimal-mistakes/pull/354)
 ### Maintenance
 - Add contributing information to `README.md`. [#357](https://github.com/mmistakes/minimal-mistakes/issues/357)
 ## [3.2.7](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.7)
 ### Enhancements
 - Add French localized UI text. [#346](https://github.com/mmistakes/minimal-mistakes/pull/346)
 ### Bug Fixes
 - Fix branch logic for Yandex and Alexa in `seo.html`. [#348](https://github.com/mmistakes/minimal-mistakes/pull/348)
 ## [3.2.6](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.6)
 ### Bug Fixes
 - Fix error `Liquid Exception: divided by 0 in _includes/archive-single.html, included in _layouts/single.html` caused by null `words_per_minute` in `_config.yml`. [#345](https://github.com/mmistakes/minimal-mistakes/pull/345)
 ## [3.2.5](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.5)
 ### Bug Fixes
 - Fix link color in hero overlay to be white.
 - Remove underlines from archive item titles.
 ## [3.2.4](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.4)
 ### Enhancements
 - Improve text alignment of masthead, hero overlay, page footer to be flush left and remove awkward white-space gaps. [#342](https://github.com/mmistakes/minimal-mistakes/issues/342)
 - Add Spanish localized UI text. [#338](https://github.com/mmistakes/minimal-mistakes/pull/338)
 ### Bug Fixes
 - Fix alignment of icons in author sidebar [#341](https://github.com/mmistakes/minimal-mistakes/issues/341)
 ### Maintenance
 - Add background color to page footer to set it apart from main content. [#342](https://github.com/mmistakes/minimal-mistakes/issues/342)
 - Add terms and privacy policy to theme's demo site. [#343](https://github.com/mmistakes/minimal-mistakes/issues/343)
 - Update screenshots found in theme documentation.
 ## [3.2.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.3)
 ### Enhancements
 - Add [Discourse](https://www.discourse.org/) as a commenting provider. [#335](https://github.com/mmistakes/minimal-mistakes/pull/335)
 ## [3.2.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.2)
 ### Enhancements
 - Add support for image captions in Magnific Popup overlays via the [`gallery`](https://mmistakes.github.io/minimal-mistakes/docs/helpers/#gallery) helper. [#334](https://github.com/mmistakes/minimal-mistakes/issues/334)
 ## [3.2.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.1)
 ### Bug Fixes
 - Remove need for "double tapping" masthead menu links on iOS devices. [#315](https://github.com/mmistakes/minimal-mistakes/issues/315)
 ### Maintenance
 - Add `ISSUE_TEMPLATE.md` to improve the issue submission process.
 ## [3.2.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.2.0)
 ### Bug Fixes
 - Fix missing category/tag links in post footer due to possible conflict with `site.tags` and `site.categories`. [#329](https://github.com/mmistakes/minimal-mistakes/issues/329#issuecomment-222375568)
 ## [3.1.8](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.8)
 ### Bug Fixes
 - Fix `Liquid Exception: undefined method 'gsub' for nil:NilClass in _layouts/single.html` error when `page.title` is null. `<h1>` element is now conditional if `title: ` is not set for a `page` or collection item. [#312](https://github.com/mmistakes/minimal-mistakes/issues/312)
 ### Maintenance
 - Remove duplicate `fa-twitter` and `fa-twitter-square` classes from `_utilities.scss`. [#302](https://github.com/mmistakes/minimal-mistakes/issues/302)
 - Document installing additional Jekyll gem dependencies when using `gem "jekyll"` instead of `gem "github-pages"` to avoid any errors on run. [#305](https://github.com/mmistakes/minimal-mistakes/issues/305)
 ## [3.1.7](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.7)
 ### Enhancements
 - Add translation key for "Recent Posts" used in home page `index.html`. [#316](https://github.com/mmistakes/minimal-mistakes/pull/316)
 ### Maintenance
 - Small fix to avoid underlining the whitespace between icons and related text when hovering. [#303](https://github.com/mmistakes/minimal-mistakes/pull/303)
 ## [3.1.6](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.6)
 ### Maintenance
 - Update gem dependencies. Run `bundle` to update `Gemfile.lock`.
 ## [3.1.5](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.5)
 ### Maintenance
 - Fix `www` and `https` links in author profile include [#293](https://github.com/mmistakes/minimal-mistakes/pull/293)
 ## [3.1.4](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.4)
 ### Enhancements
 - Add overlay_filter param to hero headers [#298](https://github.com/mmistakes/minimal-mistakes/pull/298)
 ## [3.1.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.3)
 ### Enhancements
 - Improve `site.locale` documentation [#284](https://github.com/mmistakes/minimal-mistakes/issues/284)
 - Remove ProTip note about protocol-less `site.url` as it is an anti-pattern [#288](https://github.com/mmistakes/minimal-mistakes/issues/288)
 ### Bug Fixes
 - Fix `og_image` URL in seo.html [#277](https://github.com/mmistakes/minimal-mistakes/issues/277)
 - Fix `author_profile` toggle when assigned in a `_layout` [#285](https://github.com/mmistakes/minimal-mistakes/issues/285)
 - Fix typo in `build:all` npm script [#283](https://github.com/mmistakes/minimal-mistakes/pull/283)
 - Fix URL typo documentation [#287](https://github.com/mmistakes/minimal-mistakes/issues/287)
 - SEO author bug. If `twitter.username` is set and `author.twitter` is `nil` bad things happen. [#289](https://github.com/mmistakes/minimal-mistakes/issues/289)
 ## [3.1.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.2)
 ### Enhancements
 - Explain how to use `nav_list` helper in [documentation](https://mmistakes.github.io/minimal-mistakes/docs/helpers/#navigation-list).
 - Reduce left/right padding on smaller screens to increase width of main content column.
 ### Bug Fixes
 - Fix alignment issues with related posts [#273](https://github.com/mmistakes/minimal-mistakes/issues/273) and "Follow" button in author profile [#274](https://github.com/mmistakes/minimal-mistakes/issues/274).
 ## [3.1.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.1)
 ### Bug Fix
 - Fixed reading time bug when `words_per_minute` wasn't set in `_config.yml` [#271](https://github.com/mmistakes/minimal-mistakes/issues/271)
 ## [3.1.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.1.0)
 ### Enhancements
 - Updated [Font Awesome](https://fortawesome.github.io/Font-Awesome/whats-new/) to version 4.6.1
 - Added optional GitHub and Bitbucket links to footer if set on `site.author` in `_config.yml`.
 ### Bug Fixes
 - Fixed Bitbucket URL typo in author sidebar.
 ## [3.0.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/3.0.3)
 ### Enhancements
 - Rebuilt the entire theme: layouts, includes, stylesheets, scripts, you name it.
 - Refreshed the look and feel while staying true to the original design of the theme (author sidebar/main content).
 - Replaced grid system with [Susy](http://susy.oddbird.net/).
 - Replaced Grunt tasks with `npm` scripts.
 - Removed Google Fonts and replaced with system fonts to improve performance (they can be [added back](https://mmistakes.github.io/minimal-mistakes/docs/stylesheets/) if desired)
 - Greatly improved [theme documentation](https://mmistakes.github.io/minimal-mistakes/docs/quick-start-guide/).
 - Increased the amount of sample posts, sample pages, and sample collections to thoroughly test the theme and edge-cases.
 - Moved all sample content and assets out of `master` to keep it as clean as possible for forking.
 - Added new layouts for `splash` pages, archives for [`jekyll-archives`](https://github.com/jekyll/jekyll-archives) if enabled, and [`compress.html`](https://github.com/penibelst/jekyll-compress-html) to improve performance.
 - Added taxonomy links to posts (tags and categories).
 - Added optional "reading time" meta data.
 - Improved Liquid used for Twitter Cards and Open Graph data in `<head>`.
 - Improved `gallery` include helper and added `feature_row` for use with splash page layout.
 - Added Keybase.io, author web URI, and Bitbucket optional links to sidebar.
 - Add `feed.xml` link to footer.
 - Added a [UI text data file](https://mmistakes.github.io/minimal-mistakes/docs/ui-text/) to easily change all text found in the theme.
 - Added LinkedIn to optional social share buttons.
 - Added Facebook, Google+, and custom commenting options in addition to Disqus.
 - Added optional breadcrumb links.
 ## [2.2.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/2.2.1)
 ## [2.2.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/2.2.0)
 ### Enhancements
 - Add support for Jekyll 3.0
 - Minor updates to syntax highlighting CSS and theme documentation
 ## [2.1.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/2.1.3)
 ### Enhancements
 - Cleaner print styles that remove the top navigation, social sharing buttons, and other elements not needed when printed.
 ## [2.1.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/2.1.2)
 ### Enhancements
 - Add optional CodePen icon/url to author side bar [#156](https://github.com/mmistakes/minimal-mistakes/pull/156)
 - Documented Stackoverflow username explanation in `_config.yml` [#157](https://github.com/mmistakes/minimal-mistakes/pull/157)
 - Simplified Liquid in `post-index.html` to better handle year listings [#166](https://github.com/mmistakes/minimal-mistakes/pull/166)
 ### Bug Fixes
 - Cleanup Facebook related Open Graph meta tags [#149](https://github.com/mmistakes/minimal-mistakes/issues/149)
 - Corrected minor typos [#158](https://github.com/mmistakes/minimal-mistakes/pull/158) [#175](https://github.com/mmistakes/minimal-mistakes/issues/175)
 ## [2.1.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/2.1.1)
 ### Enhancements
 - Add optional XING profile link to author sidebar
 - Include open graph meta tags for feature image (if assigned) [#149](https://github.com/mmistakes/minimal-mistakes/issues/149)
 - Create an include for feed footer
 ### Bug Fixes
 - Remove http protocol from Google search form on sample 404 page
 - Only show related posts if there are one or more available
 - Fix alignment of email address link in author sidebar
 ## [2.1.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/2.1.0)
 ### Enhancements
 - Add optional social sharing buttons ([#42](https://github.com/mmistakes/minimal-mistakes/issues/42))
 ![social sharing buttons](https://cloud.githubusercontent.com/assets/1376749/5860522/d9f28a96-a22f-11e4-9b83-940a3a9a766a.png)
 - Add Soundcloud, YouTube ([#95](https://github.com/mmistakes/minimal-mistakes/pull/95)), Flickr ([#119](https://github.com/mmistakes/minimal-mistakes/pull/119)), and Weibo ([#116](https://github.com/mmistakes/minimal-mistakes/pull/116)) icons for use in author sidebar.
 - Fix typos in posts and documentation and remove references to Less
 - Include note about Octopress gem being optional
 - Post author override support extended to the Atom feed ([#71](https://github.com/mmistakes/minimal-mistakes/pull/71))
 - Only include email address in feed if specified in `_config.yml` or author `_data`
 - Wrap all page content in `#main` to harmonize article and post index styles ([#86](https://github.com/mmistakes/minimal-mistakes/issues/86))
 - Include new sample feature images for posts and pages
 - Table of contents improvements: fix collapse toggle, indent nested elements, show on small screens, and create an `_include` for reusing in posts and pages.
 - Include note about running Jekyll with `bundle exec` when using Bundler
 - Fix home page path in top navigation
 - Remove Google Authorship ([#120](https://github.com/mmistakes/minimal-mistakes/issues/120))
 - Remove duplicate author content that displayed in `div.article-author-bottom`
 - Removed unused `_sass/print.scss` styles
 - Improve comments in `.scss` files
 ## [2.0.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/v2.0)
 ## [1.3.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.3.3)
 ### Enhancements
 - Added new icons and profile links for Stackoverflow, Dribbble, Pinterest, Foursquare, and Steam to the author bio sidebar.
 - Cleaned up the Kramdown auto table of contents styling to be more readable
 - Removed page width specific .less stylesheets and created mixins for easier updating
 - Removed Modernizr since it wasn't being used
 - Added pages to sitemap.xml
 - Added category: to rake new_post task
 - Minor typographic changes
 ### Bug Fixes
 - Corrected various broken links in README and Theme Setup.
 ## [1.3.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.3.1)
 ### Enhancements
 - Cleaned up table of contents styling
 - Reworked top navigation to be a better experience on small screens. Nav items now display vertically when the menu button is tapped, revealing links with larger touch targets.
 ![menu animation](https://camo.githubusercontent.com/3fbd8c1326485f4b1ab32c0005c0fca7660b5d31/68747470733a2f2f662e636c6f75642e6769746875622e636f6d2f6173736574732f313337363734392f323136343037352f31653366303663322d393465372d313165332d383961612d6436623636376562306564662e676966)
 ## [1.2.0](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.2.0)
 ### Bug Fixes
 - Tables weren't filling the entire width of the content container. They now scale at 100%. Thanks [@dhruvbhatia](https://github.com/dhruvbhatia)
 ### Enhancements
 - Decreased spacing between Markdown footnotes
 - Removed dark background on footer
 - Removed UPPERCASE styling on post titles in the index listing
 ## [1.1.4](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.1.4)
 ### Bug Fixes
 - Fix top navigation bug issue ([#10](https://github.com/mmistakes/minimal-mistakes/issues/10)) for real this time. Remember to clear your floats, kids.
 ## [1.1.3](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.1.3)
 ### Bug Fixes
 - Fix top navigation links that weren't clickable on small viewports (Issue [#10](https://github.com/mmistakes/minimal-mistakes/issues/10)).
 - Remove line wrap from top navigation links that may span multiple lines.
 ## [1.1.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.1.2)
 ### Enhancements
 - Added Grunt build script for compiling Less/JavaScript and optimizing image assets.
 - Added support for large image summary Twitter card.
 - Stylesheet adjustments
 ## [1.1.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/1.1.1)
 ### Bug Fixes
 - Removed [Typeplate](http://typeplate.com/) styles. Was [causing issues with newer versions of Less](https://github.com/typeplate/typeplate.github.io/issues/108) and is no longer maintained.
 ### Enhancements
 - Added [image attribution](http://mmistakes.github.io/minimal-mistakes/theme-setup/#feature-images) for post and page feature images.
 - Added [404 page](http://mmistakes.github.io/minimal-mistakes/404.html).
 - Cleaned up various Less variables to better align with naming conventions used in other MM Jekyll themes.
 - Removed Chrome Frame references.
 - Added global CSS3 transitions to text and block elements.
 - Improved typography in a few places.
 ## [1.0.2](https://github.com/mmistakes/minimal-mistakes/releases/tag/v1.0.2)
 ### Enhancements
 - Google Analytics, Google Authorship, webmaster verifies, and Twitter card meta are now optional.
 ## [1.0.1](https://github.com/mmistakes/minimal-mistakes/releases/tag/v1.0.1)
@@ -1,30 +1,14 @@
---
+                   GNU GENERAL PUBLIC LICENSE
-layout: splash
+                       Version 2, June 1991
 title : "GRID license"
 author_profile: false
 excerpt: "Grid is licensed under GPL 2.0"
 permalink: /license/
 header:
  overlay_color: "#333"
  cta_label: "GPL licenses FAQs"
  cta_url: "https://www.gnu.org/licenses/gpl-faq.html"
 ---
-{% include base_path %}
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
-GNU General Public License
+                            Preamble
 ==========================
-_Version 2, June 1991_  
+  The licenses for most software are designed to take away your
 _Copyright © 1989, 1991 Free Software Foundation, Inc.,_  
 _51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA_
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
 ### Preamble
 The licenses for most software are designed to take away your
 freedom to share and change it.  By contrast, the GNU General Public
 License is intended to guarantee your freedom to share and change free
 software--to make sure the software is free for all its users.  This
@@ -34,55 +18,56 @@ using it.  (Some other Free Software Foundation software is covered by
 the GNU Lesser General Public License instead.)  You can apply it to
 your programs, too.
-When we speak of free software, we are referring to freedom, not
+  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
 this service if you wish), that you receive source code or can get it
 if you want it, that you can change the software or use pieces of it
 in new free programs; and that you know you can do these things.
-To protect your rights, we need to make restrictions that forbid
+  To protect your rights, we need to make restrictions that forbid
 anyone to deny you these rights or to ask you to surrender the rights.
 These restrictions translate to certain responsibilities for you if you
 distribute copies of the software, or if you modify it.
-For example, if you distribute copies of such a program, whether
+  For example, if you distribute copies of such a program, whether
 gratis or for a fee, you must give the recipients all the rights that
 you have.  You must make sure that they, too, receive or can get the
 source code.  And you must show them these terms so they know their
 rights.
-We protect your rights with two steps: **(1)** copyright the software, and
+  We protect your rights with two steps: (1) copyright the software, and
-**(2)** offer you this license which gives you legal permission to copy,
+(2) offer you this license which gives you legal permission to copy,
 distribute and/or modify the software.
-Also, for each author's protection and ours, we want to make certain
+  Also, for each author's protection and ours, we want to make certain
 that everyone understands that there is no warranty for this free
 software.  If the software is modified by someone else and passed on, we
 want its recipients to know that what they have is not the original, so
 that any problems introduced by others will not reflect on the original
 authors' reputations.
-Finally, any free program is threatened constantly by software
+  Finally, any free program is threatened constantly by software
 patents.  We wish to avoid the danger that redistributors of a free
 program will individually obtain patent licenses, in effect making the
 program proprietary.  To prevent this, we have made it clear that any
 patent must be licensed for everyone's free use or not licensed at all.
-The precise terms and conditions for copying, distribution and
+  The precise terms and conditions for copying, distribution and
 modification follow.
-### TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+                    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-**0.** This License applies to any program or other work which contains
+  0. This License applies to any program or other work which contains
 a notice placed by the copyright holder saying it may be distributed
-under the terms of this General Public License.  The “Program”, below,
+under the terms of this General Public License.  The "Program", below,
-refers to any such program or work, and a “work based on the Program”
+refers to any such program or work, and a "work based on the Program"
 means either the Program or any derivative work under copyright law:
 that is to say, a work containing the Program or a portion of it,
 either verbatim or with modifications and/or translated into another
 language.  (Hereinafter, translation is included without limitation in
-the term “modification”.)  Each licensee is addressed as “you”.
+the term "modification".)  Each licensee is addressed as "you".
 Activities other than copying, distribution and modification are not
 covered by this License; they are outside its scope.  The act of
@@ -91,7 +76,7 @@ is covered only if its contents constitute a work based on the
 Program (independent of having been made by running the Program).
 Whether that is true depends on what the Program does.
-**1.** You may copy and distribute verbatim copies of the Program's
+  1. You may copy and distribute verbatim copies of the Program's
 source code as you receive it, in any medium, provided that you
 conspicuously and appropriately publish on each copy an appropriate
 copyright notice and disclaimer of warranty; keep intact all the
@@ -102,27 +87,29 @@ along with the Program.
 You may charge a fee for the physical act of transferring a copy, and
 you may at your option offer warranty protection in exchange for a fee.
-**2.** You may modify your copy or copies of the Program or any portion
+  2. You may modify your copy or copies of the Program or any portion
 of it, thus forming a work based on the Program, and copy and
 distribute such modifications or work under the terms of Section 1
 above, provided that you also meet all of these conditions:
-* **a)** You must cause the modified files to carry prominent notices
+    a) You must cause the modified files to carry prominent notices
-stating that you changed the files and the date of any change.
+    stating that you changed the files and the date of any change.
-* **b)** You must cause any work that you distribute or publish, that in
+
-whole or in part contains or is derived from the Program or any
+    b) You must cause any work that you distribute or publish, that in
-part thereof, to be licensed as a whole at no charge to all third
+    whole or in part contains or is derived from the Program or any
-parties under the terms of this License.
+    part thereof, to be licensed as a whole at no charge to all third
-* **c)** If the modified program normally reads commands interactively
+    parties under the terms of this License.
-when run, you must cause it, when started running for such
+
-interactive use in the most ordinary way, to print or display an
+    c) If the modified program normally reads commands interactively
-announcement including an appropriate copyright notice and a
+    when run, you must cause it, when started running for such
-notice that there is no warranty (or else, saying that you provide
+    interactive use in the most ordinary way, to print or display an
-a warranty) and that users may redistribute the program under
+    announcement including an appropriate copyright notice and a
-these conditions, and telling the user how to view a copy of this
+    notice that there is no warranty (or else, saying that you provide
-License.  (Exception: if the Program itself is interactive but
+    a warranty) and that users may redistribute the program under
-does not normally print such an announcement, your work based on
+    these conditions, and telling the user how to view a copy of this
-the Program is not required to print an announcement.)
+    License.  (Exception: if the Program itself is interactive but
    does not normally print such an announcement, your work based on
    the Program is not required to print an announcement.)
 These requirements apply to the modified work as a whole.  If
 identifiable sections of that work are not derived from the Program,
@@ -144,24 +131,26 @@ with the Program (or with a work based on the Program) on a volume of
 a storage or distribution medium does not bring the other work under
 the scope of this License.
-**3.** You may copy and distribute the Program (or a work based on it,
+  3. You may copy and distribute the Program (or a work based on it,
 under Section 2) in object code or executable form under the terms of
 Sections 1 and 2 above provided that you also do one of the following:
-* **a)** Accompany it with the complete corresponding machine-readable
+    a) Accompany it with the complete corresponding machine-readable
-source code, which must be distributed under the terms of Sections
+    source code, which must be distributed under the terms of Sections
-1 and 2 above on a medium customarily used for software interchange; or,
+    1 and 2 above on a medium customarily used for software interchange; or,
-* **b)** Accompany it with a written offer, valid for at least three
+
-years, to give any third party, for a charge no more than your
+    b) Accompany it with a written offer, valid for at least three
-cost of physically performing source distribution, a complete
+    years, to give any third party, for a charge no more than your
-machine-readable copy of the corresponding source code, to be
+    cost of physically performing source distribution, a complete
-distributed under the terms of Sections 1 and 2 above on a medium
+    machine-readable copy of the corresponding source code, to be
-customarily used for software interchange; or,
+    distributed under the terms of Sections 1 and 2 above on a medium
-* **c)** Accompany it with the information you received as to the offer
+    customarily used for software interchange; or,
-to distribute corresponding source code.  (This alternative is
+
-allowed only for noncommercial distribution and only if you
+    c) Accompany it with the information you received as to the offer
-received the program in object code or executable form with such
+    to distribute corresponding source code.  (This alternative is
-an offer, in accord with Subsection b above.)
+    allowed only for noncommercial distribution and only if you
    received the program in object code or executable form with such
    an offer, in accord with Subsection b above.)
 The source code for a work means the preferred form of the work for
 making modifications to it.  For an executable work, complete source
@@ -180,7 +169,7 @@ access to copy the source code from the same place counts as
 distribution of the source code, even though third parties are not
 compelled to copy the source along with the object code.
-**4.** You may not copy, modify, sublicense, or distribute the Program
+  4. You may not copy, modify, sublicense, or distribute the Program
 except as expressly provided under this License.  Any attempt
 otherwise to copy, modify, sublicense or distribute the Program is
 void, and will automatically terminate your rights under this License.
@@ -188,7 +177,7 @@ However, parties who have received copies, or rights, from you under
 this License will not have their licenses terminated so long as such
 parties remain in full compliance.
-**5.** You are not required to accept this License, since you have not
+  5. You are not required to accept this License, since you have not
 signed it.  However, nothing else grants you permission to modify or
 distribute the Program or its derivative works.  These actions are
 prohibited by law if you do not accept this License.  Therefore, by
@@ -197,7 +186,7 @@ Program), you indicate your acceptance of this License to do so, and
 all its terms and conditions for copying, distributing or modifying
 the Program or works based on it.
-**6.** Each time you redistribute the Program (or any work based on the
+  6. Each time you redistribute the Program (or any work based on the
 Program), the recipient automatically receives a license from the
 original licensor to copy, distribute or modify the Program subject to
 these terms and conditions.  You may not impose any further
@@ -205,7 +194,7 @@ restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties to
 this License.
-**7.** If, as a consequence of a court judgment or allegation of patent
+  7. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
@@ -237,7 +226,7 @@ impose that choice.
 This section is intended to make thoroughly clear what is believed to
 be a consequence of the rest of this License.
-**8.** If the distribution and/or use of the Program is restricted in
+  8. If the distribution and/or use of the Program is restricted in
 certain countries either by patents or by copyrighted interfaces, the
 original copyright holder who places the Program under this License
 may add an explicit geographical distribution limitation excluding
@@ -245,20 +234,20 @@ those countries, so that distribution is permitted only in or among
 countries not thus excluded.  In such case, this License incorporates
 the limitation as if written in the body of this License.
-**9.** The Free Software Foundation may publish revised and/or new versions
+  9. The Free Software Foundation may publish revised and/or new versions
 of the General Public License from time to time.  Such new versions will
 be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.
 Each version is given a distinguishing version number.  If the Program
-specifies a version number of this License which applies to it and “any
+specifies a version number of this License which applies to it and "any
-later version”, you have the option of following the terms and conditions
+later version", you have the option of following the terms and conditions
 either of that version or of any later version published by the Free
 Software Foundation.  If the Program does not specify a version number of
 this License, you may choose any version ever published by the Free Software
 Foundation.
-**10.** If you wish to incorporate parts of the Program into other free
+  10. If you wish to incorporate parts of the Program into other free
 programs whose distribution conditions are different, write to the author
 to ask for permission.  For software which is copyrighted by the Free
 Software Foundation, write to the Free Software Foundation; we sometimes
@@ -266,19 +255,19 @@ make exceptions for this.  Our decision will be guided by the two goals
 of preserving the free status of all derivatives of our free software and
 of promoting the sharing and reuse of software generally.
-### NO WARRANTY
+                            NO WARRANTY
-**11.** BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
 OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
-PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
 OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
 TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
 PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
 REPAIR OR CORRECTION.
-**12.** IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
 REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
 INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
@@ -288,18 +277,18 @@ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
 PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGES.
-END OF TERMS AND CONDITIONS
+                     END OF TERMS AND CONDITIONS
-### How to Apply These Terms to Your New Programs
+            How to Apply These Terms to Your New Programs
-If you develop a new program, and you want it to be of the greatest
+  If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
 free software which everyone can redistribute and change under these terms.
-To do so, attach the following notices to the program.  It is safest
+  To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
 convey the exclusion of warranty; and each file should have at least
-the “copyright” line and a pointer to where the full notice is found.
+the "copyright" line and a pointer to where the full notice is found.
    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>
@@ -328,13 +317,13 @@ when it starts in an interactive mode:
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.
-The hypothetical commands `show w` and `show c` should show the appropriate
+The hypothetical commands `show w' and `show c' should show the appropriate
 parts of the General Public License.  Of course, the commands you use may
-be called something other than `show w` and `show c`; they could even be
+be called something other than `show w' and `show c'; they could even be
 mouse-clicks or menu items--whatever suits your program.
 You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a “copyright disclaimer” for the program, if
+school, if any, to sign a "copyright disclaimer" for the program, if
 necessary.  Here is a sample; alter the names:
  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
@@ -349,13 +338,3 @@ consider it more useful to permit linking proprietary applications with the
 library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.
 {% for post in paginator.posts %}
  {% include archive-single.html %}
 {% endfor %}
 {% include paginator.html %}
@@ -1,27 +0,0 @@
 source "https://rubygems.org"
 # Hello! This is where you manage which Jekyll version is used to run.
 # When you want to use a different version, change it below, save the
 # file and run `bundle install`. Run Jekyll with `bundle exec`, like so:
 #
 #     bundle exec jekyll serve
 #
 # This will help ensure the proper Jekyll version is running.
 # Happy Jekylling!
 gem "github-pages", group: :jekyll_plugins
 # If you want to use Jekyll native, uncomment the line below.
 # To upgrade, run `bundle update`.
 # gem "jekyll"
 gem "wdm", "~> 0.1.0" if Gem.win_platform?
 # If you have any plugins, put them here!
 group :jekyll_plugins do
  # gem "jekyll-archives"
 gem 'jekyll-octicons'
 end
@@ -1,155 +0,0 @@
 GEM
  remote: https://rubygems.org/
  specs:
    activesupport (4.2.7)
      i18n (~> 0.7)
      json (~> 1.7, >= 1.7.7)
      minitest (~> 5.1)
      thread_safe (~> 0.3, >= 0.3.4)
      tzinfo (~> 1.1)
    addressable (2.4.0)
    coffee-script (2.4.1)
      coffee-script-source
      execjs
    coffee-script-source (1.10.0)
    colorator (1.1.0)
    ethon (0.9.1)
      ffi (>= 1.3.0)
    execjs (2.7.0)
    faraday (0.9.2)
      multipart-post (>= 1.2, < 3)
    ffi (1.9.14)
    ffi (1.9.14-x64-mingw32)
    forwardable-extended (2.6.0)
    gemoji (2.1.0)
    github-pages (104)
      activesupport (= 4.2.7)
      github-pages-health-check (= 1.2.0)
      jekyll (= 3.3.0)
      jekyll-avatar (= 0.4.2)
      jekyll-coffeescript (= 1.0.1)
      jekyll-feed (= 0.8.0)
      jekyll-gist (= 1.4.0)
      jekyll-github-metadata (= 2.2.0)
      jekyll-mentions (= 1.2.0)
      jekyll-paginate (= 1.1.0)
      jekyll-redirect-from (= 0.11.0)
      jekyll-sass-converter (= 1.3.0)
      jekyll-seo-tag (= 2.1.0)
      jekyll-sitemap (= 0.12.0)
      jekyll-swiss (= 0.4.0)
      jemoji (= 0.7.0)
      kramdown (= 1.11.1)
      liquid (= 3.0.6)
      listen (= 3.0.6)
      mercenary (~> 0.3)
      minima (= 2.0.0)
      rouge (= 1.11.1)
      terminal-table (~> 1.4)
    github-pages-health-check (1.2.0)
      addressable (~> 2.3)
      net-dns (~> 0.8)
      octokit (~> 4.0)
      public_suffix (~> 1.4)
      typhoeus (~> 0.7)
    html-pipeline (2.4.2)
      activesupport (>= 2)
      nokogiri (>= 1.4)
    i18n (0.7.0)
    jekyll (3.3.0)
      addressable (~> 2.4)
      colorator (~> 1.0)
      jekyll-sass-converter (~> 1.0)
      jekyll-watch (~> 1.1)
      kramdown (~> 1.3)
      liquid (~> 3.0)
      mercenary (~> 0.3.3)
      pathutil (~> 0.9)
      rouge (~> 1.7)
      safe_yaml (~> 1.0)
    jekyll-avatar (0.4.2)
      jekyll (~> 3.0)
    jekyll-coffeescript (1.0.1)
      coffee-script (~> 2.2)
    jekyll-feed (0.8.0)
      jekyll (~> 3.3)
    jekyll-gist (1.4.0)
      octokit (~> 4.2)
    jekyll-github-metadata (2.2.0)
      jekyll (~> 3.1)
      octokit (~> 4.0, != 4.4.0)
    jekyll-mentions (1.2.0)
      activesupport (~> 4.0)
      html-pipeline (~> 2.3)
      jekyll (~> 3.0)
    jekyll-octicons (3.0.1)
      jekyll (~> 3.1)
      octicons (~> 3.0)
    jekyll-paginate (1.1.0)
    jekyll-redirect-from (0.11.0)
      jekyll (>= 2.0)
    jekyll-sass-converter (1.3.0)
      sass (~> 3.2)
    jekyll-seo-tag (2.1.0)
      jekyll (~> 3.3)
    jekyll-sitemap (0.12.0)
      jekyll (~> 3.3)
    jekyll-swiss (0.4.0)
    jekyll-watch (1.5.0)
      listen (~> 3.0, < 3.1)
    jemoji (0.7.0)
      activesupport (~> 4.0)
      gemoji (~> 2.0)
      html-pipeline (~> 2.2)
      jekyll (>= 3.0)
    json (1.8.3)
    kramdown (1.11.1)
    liquid (3.0.6)
    listen (3.0.6)
      rb-fsevent (>= 0.9.3)
      rb-inotify (>= 0.9.7)
    mercenary (0.3.6)
    mini_portile2 (2.1.0)
    minima (2.0.0)
    minitest (5.9.1)
    multipart-post (2.0.0)
    net-dns (0.8.0)
    nokogiri (1.6.8.1)
      mini_portile2 (~> 2.1.0)
    nokogiri (1.6.8.1-x64-mingw32)
      mini_portile2 (~> 2.1.0)
    octicons (3.0.1)
      nokogiri (>= 1.6.3.1)
    octokit (4.4.1)
      sawyer (~> 0.7.0, >= 0.5.3)
    pathutil (0.14.0)
      forwardable-extended (~> 2.6)
    public_suffix (1.5.3)
    rb-fsevent (0.9.8)
    rb-inotify (0.9.7)
      ffi (>= 0.5.0)
    rouge (1.11.1)
    safe_yaml (1.0.4)
    sass (3.4.22)
    sawyer (0.7.0)
      addressable (>= 2.3.5, < 2.5)
      faraday (~> 0.8, < 0.10)
    terminal-table (1.7.3)
      unicode-display_width (~> 1.1.1)
    thread_safe (0.3.5)
    typhoeus (0.8.0)
      ethon (>= 0.8.0)
    tzinfo (1.2.2)
      thread_safe (~> 0.1)
    unicode-display_width (1.1.1)
 PLATFORMS
  ruby
  x64-mingw32
 DEPENDENCIES
  github-pages
  jekyll-octicons
 BUNDLED WITH
   1.13.3
@@ -0,0 +1,37 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/DisableWarnings.h
 Copyright (C) 2016
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef DISABLE_WARNINGS_H
 #define DISABLE_WARNINGS_H
 //disables an Intel compiler specific warning (in json.hpp)
 #pragma warning disable 488  
 #endif
@@ -0,0 +1,49 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Grid.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //
 //  Grid.h
 //  simd
 //
 //  Created by Peter Boyle on 09/05/2014.
 //  Copyright (c) 2014 University of Edinburgh. All rights reserved.
 //
 #ifndef GRID_H
 #define GRID_H
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
 #include <Grid/qcd/utils/GaugeFix.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
 #endif
@@ -0,0 +1,61 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/GridCore.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //
 //  Grid.h
 //  simd
 //
 //  Created by Peter Boyle on 09/05/2014.
 //  Copyright (c) 2014 University of Edinburgh. All rights reserved.
 //
 #ifndef GRID_BASE_H
 #define GRID_BASE_H
 #include <Grid/GridStd.h>
 #include <Grid/perfmon/Timer.h>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/log/Log.h>
 #include <Grid/allocator/AlignedAllocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/serialisation/Serialisation.h>
 #include <Grid/threads/Threads.h>
 #include <Grid/util/Util.h>
 #include <Grid/util/Sha.h>
 #include <Grid/communicator/Communicator.h> 
 #include <Grid/cartesian/Cartesian.h>    
 #include <Grid/tensors/Tensors.h>      
 #include <Grid/lattice/Lattice.h>      
 #include <Grid/cshift/Cshift.h>       
 #include <Grid/stencil/Stencil.h>      
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/algorithms/Algorithms.h>   
 #endif
@@ -0,0 +1,42 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/GridQCDcore.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_QCD_CORE_H
 #define GRID_QCD_CORE_H
 /////////////////////////
 // Core Grid QCD headers
 /////////////////////////
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 #endif
@@ -0,0 +1,29 @@
 #ifndef GRID_STD_H
 #define GRID_STD_H
 ///////////////////
 // Std C++ dependencies
 ///////////////////
 #include <cassert>
 #include <complex>
 #include <vector>
 #include <string>
 #include <iostream>
 #include <iomanip>
 #include <random>
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
 #include <sys/time.h>
 #include <chrono>
 #include <zlib.h>
 ///////////////////
 // Grid config
 ///////////////////
 #include "Config.h"
 #endif /* GRID_STD_H */
@@ -0,0 +1,14 @@
 #pragma once
 // Force Eigen to use MKL if Grid has been configured with --enable-mkl
 #ifdef USE_MKL
 #define EIGEN_USE_MKL_ALL
 #endif
 #if defined __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 #include <Grid/Eigen/Dense>
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -0,0 +1,63 @@
 extra_sources=
 extra_headers=
 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
  extra_sources+=communicator/Communicator_base.cc
  extra_sources+=communicator/SharedMemoryMPI.cc
  extra_sources+=communicator/SharedMemory.cc
 endif
 if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
  extra_sources+=communicator/Communicator_base.cc
  extra_sources+=communicator/SharedMemoryNone.cc
  extra_sources+=communicator/SharedMemory.cc
 endif
 if BUILD_HDF5
  extra_sources+=serialisation/Hdf5IO.cc 
  extra_headers+=serialisation/Hdf5IO.h
  extra_headers+=serialisation/Hdf5Type.h
 endif
 # Make `all` depend on version-cache so the git version stamp is re-evaluated
 # whenever `all` is built.
 all: version-cache
 # Regenerate the `version-cache` file from the current git state. The whole
 # recipe is a single backslash-continued shell command and does the following:
 #   1. Sets the shell variable `a` to "uncommited changes" when
 #      `git status --porcelain` prints at least one line that does not contain
 #      '??', otherwise to "clean".
 #   2. Writes a `#define GITHASH "..."` line built from `git log -n 1` on HEAD
 #      (format placeholders %H and %d, followed by the value of `a`) to vertmp.
 #   3. If version-cache does not exist, or `diff` reports a difference between
 #      vertmp and version-cache, moves vertmp over version-cache and deletes
 #      Version.h so that the Version.h rule below has to recreate it.
 #   4. Removes vertmp (already gone if it was moved in step 3).
 # NOTE(review): the spelling "uncommited" is part of the emitted GITHASH
 # string; changing it changes the generated Version.h content.
 version-cache:
 	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
 		a="uncommited changes";\
 	else\
 		a="clean";\
 	fi;\
 	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
 	if [ -e version-cache ]; then\
 		d=`diff vertmp version-cache`;\
 		if [ "$${d}" != "" ]; then\
 			mv vertmp version-cache;\
 			rm -f Version.h;\
 		fi;\
 	else\
 		mv vertmp version-cache;\
 		rm -f Version.h;\
 	fi;\
 	rm -f vertmp
 # Version.h is a plain copy of version-cache; it is only rebuilt when missing,
 # i.e. after the version-cache recipe above has deleted it.
 Version.h:
 	cp version-cache Version.h
 # Declared phony so the version-cache recipe runs even though a file of the
 # same name exists.
 .PHONY: version-cache
 #
 # Libraries
 #
 include Make.inc
 include Eigen.inc
 lib_LIBRARIES = libGrid.a
 CCFILES += $(extra_sources)
 HFILES  += $(extra_headers) Config.h Version.h
 libGrid_a_SOURCES              = $(CCFILES)
 libGrid_adir                   = $(includedir)/Grid
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)
@@ -0,0 +1,61 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/Algorithms.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H
 #include <Grid/algorithms/SparseMatrix.h>
 #include <Grid/algorithms/LinearOperator.h>
 #include <Grid/algorithms/Preconditioner.h>
 #include <Grid/algorithms/approx/Zolotarev.h>
 #include <Grid/algorithms/approx/Chebyshev.h>
 #include <Grid/algorithms/approx/Remez.h>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/CoarsenedMatrix.h>
 #include <Grid/algorithms/FFT.h>
 // EigCg
 // Pcg
 // Hdcg
 // GCR
 // etc..
 #endif
@@ -0,0 +1,480 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/CoarsenedMatrix.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 namespace Grid {
  // Describes the coarse-grid stencil as a list of points.
  // Point p hops by displacements[p] along axis directions[p]; the final
  // point has displacement 0 (and direction 0).
  class Geometry {
  public:
    int              npoint;        // number of stencil points: 2*(number of axes)+1
    std::vector<int> directions;    // axis of each stencil point
    std::vector<int> displacements; // +1, -1, or 0 for the final point

    // _d is the number of grid dimensions.  For _d==5 the stencil is built
    // for 4 axes only, and the axes are numbered 1..4 instead of 0..3.
    Geometry(int _d) {
      const bool five_d = (_d==5);
      const int  first  = five_d ? 1 : 0; // label of the first stencil axis
      const int  ndim   = five_d ? 4 : _d; // number of axes that get hops

      npoint = 2*ndim+1;
      directions.resize(npoint);
      displacements.resize(npoint);

      // Each axis contributes two consecutive points: the +1 hop, then the -1 hop.
      for(int mu=0;mu<ndim;mu++){
        const int fwd = 2*mu;
        const int bwd = fwd+1;
        directions[fwd]    = mu+first;
        directions[bwd]    = mu+first;
        displacements[fwd] = +1;
        displacements[bwd] = -1;
      }

      // The final point is the zero-displacement one.
      const int self = 2*ndim;
      directions   [self] = 0;
      displacements[self] = 0;

      // Report the stencil that was built.
      std::cout<<GridLogMessage<<"directions    :";
      for(std::vector<int>::const_iterator it=directions.begin();it!=directions.end();++it) std::cout<< *it << " ";
      std::cout <<std::endl;

      std::cout<<GridLogMessage<<"displacements :";
      for(std::vector<int>::const_iterator it=displacements.begin();it!=displacements.end();++it) std::cout<< *it << " ";
      std::cout<<std::endl;
    }

    /*
      // Earlier form, kept for reference: it stored the dimension as a member,
      // had no 5d special case, and offered GetDelta().
    Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
      for(int d=0;d<dimension;d++){
        directions[2*d  ] = d;
        directions[2*d+1] = d;
        displacements[2*d  ] = +1;
        displacements[2*d+1] = -1;
      }
      directions   [2*dimension]=0;
      displacements[2*dimension]=0;
    }

    std::vector<int> GetDelta(int point) {
      std::vector<int> delta(dimension,0);
      delta[directions[point]] = displacements[point];
      return delta;
    };
    */
  };
  template<class Fobj,class CComplex,int nbasis>
  class Aggregation   {
  public:
    typedef iVector<CComplex,nbasis >             siteVector;
    typedef Lattice<siteVector>                 CoarseVector;
    typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
    typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
    typedef Lattice<Fobj >        FineField;
    GridBase *CoarseGrid;
    GridBase *FineGrid;
    std::vector<Lattice<Fobj> > subspace;
    int checkerboard;
  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
    CoarseGrid(_CoarseGrid),
      FineGrid(_FineGrid),
      subspace(nbasis,_FineGrid),
      checkerboard(_checkerboard)
 	{
 	};
    void Orthogonalise(void){
      CoarseScalar InnerProd(CoarseGrid); 
      std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
      blockOrthogonalise(InnerProd,subspace);
      std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
      blockOrthogonalise(InnerProd,subspace);
      //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
      //      CheckOrthogonal();
    } 
    void CheckOrthogonal(void){
      CoarseVector iProj(CoarseGrid); 
      CoarseVector eProj(CoarseGrid); 
      for(int i=0;i<nbasis;i++){
 	blockProject(iProj,subspace[i],subspace);
 	eProj=zero; 
 	parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
 	  eProj._odata[ss](i)=CComplex(1.0);
 	}
 	eProj=eProj - iProj;
 	std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
      }
      std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
    }
    void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
      blockProject(CoarseVec,FineVec,subspace);
    }
    void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
      FineVec.checkerboard = subspace[0].checkerboard;
      blockPromote(CoarseVec,FineVec,subspace);
    }
    void CreateSubspaceRandom(GridParallelRNG &RNG){
      for(int i=0;i<nbasis;i++){
 	random(RNG,subspace[i]);
 	std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
      }
      Orthogonalise();
    }
    /*
    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
    {
      // Run a Lanczos with sloppy convergence
 	const int Nstop = nn;
 	const int Nk = nn+20;
 	const int Np = nn+20;
 	const int Nm = Nk+Np;
 	const int MaxIt= 10000;
 	RealD resid = 1.0e-3;
 	Chebyshev<FineField> Cheb(0.5,64.0,21);
 	ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
 	//	IRL.lock = 1;
 	FineField noise(FineGrid); gaussian(RNG,noise);
 	FineField tmp(FineGrid); 
 	std::vector<RealD>     eval(Nm);
 	std::vector<FineField> evec(Nm,FineGrid);
 	int Nconv;
 	IRL.calc(eval,evec,
 		 noise,
 		 Nconv);
    	// pull back nn vectors
 	for(int b=0;b<nn;b++){
 	  subspace[b]   = evec[b];
 	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 	  hermop.Op(subspace[b],tmp); 
 	  std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
 	  noise = tmp -  sqrt(eval[b])*subspace[b] ;
 	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 	  noise = tmp +  eval[b]*subspace[b] ;
 	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 	}
 	Orthogonalise();
 	for(int b=0;b<nn;b++){
 	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 	}
    }
    */
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
      RealD scale;
      ConjugateGradient<FineField> CG(1.0e-2,10000);
      FineField noise(FineGrid);
      FineField Mn(FineGrid);
      for(int b=0;b<nn;b++){
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
 	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
 	for(int i=0;i<1;i++){
 	  CG(hermop,noise,subspace[b]);
 	  noise = subspace[b];
 	  scale = std::pow(norm2(noise),-0.5); 
 	  noise=noise*scale;
 	}
 	hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
 	subspace[b]   = noise;
      }
      Orthogonalise();
    }
  };
  // Coarse-grid operator stored as one site-local nbasis x nbasis matrix field
  // per stencil point of `geom`.
  // Fine Object == (per site) type of fine field
  // nbasis      == number of deflation vectors
  template<class Fobj,class CComplex,int nbasis>
  class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
  public:

    typedef iVector<CComplex,nbasis >             siteVector;
    typedef Lattice<siteVector>                 CoarseVector;
    typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;

    typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
    typedef Lattice<Fobj >        FineField;

    ////////////////////
    // Data members
    ////////////////////
    Geometry         geom;    // stencil points (axis + displacement per point)
    GridBase *       _grid;   // the coarse grid; not owned
    CartesianStencil<siteVector,siteVector> Stencil;
    std::vector<CoarseMatrix> A;  // A[p] multiplies the neighbour reached by stencil point p

    ///////////////////////
    // Interface
    ///////////////////////
    GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know

    // out(ss) = sum over stencil points of A[point](ss) * neighbour(in, point, ss).
    // Returns norm2(out).
    RealD M (const CoarseVector &in, CoarseVector &out){

      conformable(_grid,in._grid);
      conformable(in._grid,out._grid);

      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,compressor);

      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
        siteVector nbr;
        int ptype;
        StencilEntry *SE;
        for(int point=0;point<geom.npoint;point++){

          SE=Stencil.GetEntry(ptype,point,ss);

          // The neighbour is read from `in` (permuted first if the entry asks
          // for it) when local, otherwise from the stencil's comms buffer.
          if(SE->_is_local&&SE->_permute) {
            permute(nbr,in._odata[SE->_offset],ptype);
          } else if(SE->_is_local) {
            nbr = in._odata[SE->_offset];
          } else {
            nbr = Stencil.CommBuf()[SE->_offset];
          }
          res = res + A[point]._odata[ss]*nbr;
        }
        vstream(out._odata[ss],res);
      }
      return norm2(out);
    };

    // Applies M itself: no separate adjoint is implemented.
    RealD Mdag (const CoarseVector &in, CoarseVector &out){
      return M(in,out);
    };

    // Defer support for further coarsening for now
    void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
    void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};

    // Initialisers are listed in member declaration order (geom before _grid),
    // which is the order in which they actually run.
    CoarsenedMatrix(GridCartesian &CoarseGrid) 	:
      geom(CoarseGrid._ndimension),
      _grid(&CoarseGrid),
      Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
      A(geom.npoint,&CoarseGrid)
    {
    };

    // Index of the stencil point with zero displacement (the last such point
    // if there were several).  Asserts that one exists.
    int SelfStencilPoint(void) {
      int self_stencil=-1;
      for(int p=0;p<geom.npoint;p++){
        if( geom.displacements[p]==0){
          self_stencil=p;
        }
      }
      assert(self_stencil!=-1);
      return self_stencil;
    }

    // Fills A[] with the matrix elements of linop between the subspace vectors.
    // For every basis vector i and stencil point p, linop.OpDiag / linop.OpDir
    // is applied to the vector.  The result is split into the part kept inside
    // the block and the part on the block boundary in the hop direction, and
    // both parts are projected onto the subspace.  The boundary part becomes
    // column i of A[p]; the in-block part is accumulated into column i of the
    // self (zero-displacement) matrix.
    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
                         Aggregation<Fobj,CComplex,nbasis> & Subspace){

      FineField iblock(FineGrid); // contributions from within this block
      FineField oblock(FineGrid); // contributions from outwith this block

      FineField     phi(FineGrid);
      FineField     tmp(FineGrid); // only used by the disabled test code below
      FineField     zz(FineGrid); zz=zero;
      FineField    Mphi(FineGrid);

      Lattice<iScalar<vInteger> > coor(FineGrid);

      CoarseVector iProj(Grid());
      CoarseVector oProj(Grid());
      CoarseScalar InnerProd(Grid());

      // Orthogonalise the subblocks over the basis
      blockOrthogonalise(InnerProd,Subspace.subspace);

      // Compute the matrix elements of linop between this orthonormal
      // set of vectors.
      for(int p=0;p<geom.npoint;p++){
        A[p]=zero;
      }
      int self_stencil=SelfStencilPoint();

      for(int i=0;i<nbasis;i++){
        phi=Subspace.subspace[i];

        std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;

        for(int p=0;p<geom.npoint;p++){

          int dir   = geom.directions[p];
          int disp  = geom.displacements[p];

          // Fine sites per coarse site along this axis.
          Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);

          LatticeCoordinate(coor,dir);

          if ( disp==0 ){
            linop.OpDiag(phi,Mphi);
          }
          else  {
            linop.OpDir(phi,Mphi,dir,disp);
          }

          ////////////////////////////////////////////////////////////////////////
          // Pick out contributions coming from this cell and neighbour cell
          ////////////////////////////////////////////////////////////////////////
          if ( disp==0 ) {
            iblock = Mphi;
            oblock = zero;
          } else if ( disp==1 ) {
            oblock = where(mod(coor,block)==(block-1),Mphi,zz);
            iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
          } else if ( disp==-1 ) {
            oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
            iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
          } else {
            assert(0);
          }

          Subspace.ProjectToSubspace(iProj,iblock);
          Subspace.ProjectToSubspace(oProj,oblock);
          //	  blockProject(iProj,iblock,Subspace.subspace);
          //	  blockProject(oProj,oblock,Subspace.subspace);
          parallel_for(int ss=0;ss<Grid()->oSites();ss++){
            for(int j=0;j<nbasis;j++){
              if( disp!= 0 ) {
                A[p]._odata[ss](j,i) = oProj._odata[ss](j);
              }
              A[self_stencil]._odata[ss](j,i) =	A[self_stencil]._odata[ss](j,i) + iProj._odata[ss](j);
            }
          }
        }
      }

 #if 0
      ///////////////////////////
      // test code worth preserving in if block
      ///////////////////////////
      std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
      for(int p=0;p<geom.npoint;p++){
	std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
	std::cout<<GridLogMessage<< A[p] << std::endl;
      }
      std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;

      phi=Subspace.subspace[0];
      std::vector<int> bc(FineGrid->_ndimension,0);

      blockPick(Grid(),phi,tmp,bc);      // Pick out a block
      linop.Op(tmp,Mphi);                // Apply big dop
      blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
      std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
      std::cout<<GridLogMessage<< iProj <<std::endl;
      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
      AssertHermitian();
      // ForceDiagonal();
    }

    // Debug aid: zeroes every hopping matrix and replaces the self matrix by
    // (val*adj(val) + 1) times the identity, val being a random scalar field
    // drawn from a fixed seed.
    void ForceDiagonal(void) {

      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
      std::cout<<GridLogMessage<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
      std::cout<<GridLogMessage<<"**************************************************"<<std::endl;

      // Taken from the geometry rather than assuming a 9-point stencil.
      const int self = SelfStencilPoint();
      for(int p=0;p<geom.npoint;p++){
        if ( p!=self ) A[p]=zero;
      }

      GridParallelRNG  RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
      Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);

      Complex one(1.0);

      iMatrix<CComplex,nbasis> ident;  ident=one;

      val = val*adj(val);
      val = val + 1.0;

      A[self] = val*ident;

      //      for(int s=0;s<Grid()->oSites();s++) {
      //	A[self]._odata[s]=val._odata[s];
      //      }
    }

    // Overwrites each +1 hopping matrix with the adjoint of the matching -1
    // hopping matrix shifted by one site along that hop's axis.
    // Geometry stores the +1 and -1 hops of an axis as consecutive points
    // (2d, 2d+1); the shift axis is read from the geometry so that both the
    // 0-based and the 1-based axis numbering it can produce are handled.
    void ForceHermitian(void) {
      const int ndir = (geom.npoint-1)/2;
      for(int d=0;d<ndir;d++){
        const int fwd = 2*d;
        const int bwd = 2*d+1;
        assert(geom.displacements[fwd]==+1 && geom.displacements[bwd]==-1);
        const int dd  = geom.directions[bwd];
        A[fwd] = adj(Cshift(A[bwd],dd,1));
      }
      //      A[self] = 0.5*(A[self] + adj(A[self]));
    }

    // Logs, per axis, norm2 of (A[+hop] - adj(shifted A[-hop])) and norm2 of
    // A[+hop]; then the same pair for the self matrix against its own adjoint.
    // Nothing is modified and nothing is asserted: the values are only printed.
    void AssertHermitian(void) {
      CoarseMatrix AA    (Grid());
      CoarseMatrix AAc   (Grid());
      CoarseMatrix Diff  (Grid());
      const int ndir = (geom.npoint-1)/2;
      for(int d=0;d<ndir;d++){
        const int fwd = 2*d;
        const int bwd = 2*d+1;
        assert(geom.displacements[fwd]==+1 && geom.displacements[bwd]==-1);
        const int dd  = geom.directions[bwd]; // axis of this hop pair, as numbered by the geometry
        AAc = Cshift(A[bwd],dd,1);
        AA  = A[fwd];
        Diff = AA - adj(AAc);

        std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
        std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;

      }
      const int self = SelfStencilPoint();
      Diff = A[self] - adj(A[self]);
      std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
      std::cout<<GridLogMessage<<"Norm local "<< norm2(A[self])<<std::endl;
    }

  };
 }
 #endif
@@ -0,0 +1,306 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/FFT.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 #ifdef HAVE_FFTW
 #ifdef USE_MKL
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>
 #endif
 #endif
 namespace Grid {
  // Primary template, intentionally empty: only the specialisations below
  // (compiled when HAVE_FFTW is defined) provide the FFTW types and wrappers.
  template<class scalar> struct FFTW { };
 #ifdef HAVE_FFTW	
  // ComplexD specialisation: exposes the fftw_* types and routines under the
  // common FFTW<scalar> names.
  template<> struct FFTW<ComplexD> {
    using FFTW_scalar = fftw_complex;
    using FFTW_plan   = fftw_plan;

    // Forwards every argument unchanged to ::fftw_plan_many_dft.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
                                        FFTW_scalar *in, const int *inembed,
                                        int istride, int idist,
                                        FFTW_scalar *out, const int *onembed,
                                        int ostride, int odist,
                                        int sign, unsigned flags)
    {
      FFTW_plan made = ::fftw_plan_many_dft(rank,n,howmany,
                                            in,inembed,istride,idist,
                                            out,onembed,ostride,odist,
                                            sign,flags);
      return made;
    }

    // Forwards to ::fftw_flops; the three counts are written through the pointers.
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas)
    {
      ::fftw_flops(p,add,mul,fmas);
    }

    // Forwards to ::fftw_execute_dft.
    static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out)
    {
      ::fftw_execute_dft(p,in,out);
    }

    // Forwards to ::fftw_destroy_plan.
    static void fftw_destroy_plan(const FFTW_plan p)
    {
      ::fftw_destroy_plan(p);
    }
  };
  // ComplexF specialisation: exposes the fftwf_* types and routines under the
  // common FFTW<scalar> names.
  template<> struct FFTW<ComplexF> {
    using FFTW_scalar = fftwf_complex;
    using FFTW_plan   = fftwf_plan;

    // Forwards every argument unchanged to ::fftwf_plan_many_dft.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
                                        FFTW_scalar *in, const int *inembed,
                                        int istride, int idist,
                                        FFTW_scalar *out, const int *onembed,
                                        int ostride, int odist,
                                        int sign, unsigned flags)
    {
      FFTW_plan made = ::fftwf_plan_many_dft(rank,n,howmany,
                                             in,inembed,istride,idist,
                                             out,onembed,ostride,odist,
                                             sign,flags);
      return made;
    }

    // Forwards to ::fftwf_flops; the three counts are written through the pointers.
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas)
    {
      ::fftwf_flops(p,add,mul,fmas);
    }

    // Forwards to ::fftwf_execute_dft.
    static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out)
    {
      ::fftwf_execute_dft(p,in,out);
    }

    // Forwards to ::fftwf_destroy_plan.
    static void fftw_destroy_plan(const FFTW_plan p)
    {
      ::fftwf_destroy_plan(p);
    }
  };
 #endif
 #ifndef FFTW_FORWARD
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #endif
  // Fourier transform of lattice fields along one, several or all dimensions
  // of a Cartesian grid, one dimension at a time, using FFTW for the 1d
  // transforms.  Accumulates flop and timing counters over all calls.
  class FFT {
  private:

    GridCartesian *vgrid;   // grid the fields live on; not owned
    GridCartesian *sgrid;   // grid with layout 1 in every dimension; owned (new/delete)

    int Nd;
    double flops;           // accumulated flop count
    double flops_call;      // flops of the most recent plan, as reported by fftw_flops
    uint64_t usec;          // accumulated time spent executing plans

    std::vector<int> dimensions;
    std::vector<int> processors;
    std::vector<int> processor_coor;

  public:

    static const int forward=FFTW_FORWARD;
    static const int backward=FFTW_BACKWARD;

    double Flops(void) {return flops;}
    // Accumulated flops divided by accumulated microseconds; 0 before any
    // time has been recorded (avoids a division by zero).
    double MFlops(void) {return (usec==0) ? 0.0 : flops/usec;}
    double USec(void)   {return (double)usec;}

    FFT ( GridCartesian * grid ) :
    vgrid(grid),
    sgrid(nullptr),
    Nd(grid->_ndimension),
    flops(0),
    flops_call(0),
    usec(0),
    dimensions(grid->_fdimensions),
    processors(grid->_processors),
    processor_coor(grid->_processor_coor)
    {
      std::vector<int> layout(Nd,1);
      sgrid = new GridCartesian(dimensions,layout,processors);
    }

    // sgrid is deleted in the destructor, so a member-wise copy would delete
    // it twice; copying is therefore disabled.
    FFT(const FFT &) = delete;
    FFT &operator=(const FFT &) = delete;

    ~FFT ( void)  {
      delete sgrid;
    }

    // Transforms along every dimension d for which mask[d] is non-zero, in
    // increasing d.  mask needs at least Nd entries.  If no entry is set no
    // transform is applied and result becomes a copy of source.
    template<class vobj>
    void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){

      conformable(result._grid,vgrid);
      conformable(source._grid,vgrid);
      assert(mask.size() >= (size_t)Nd);
      Lattice<vobj> tmp(vgrid);
      tmp = source;
      bool transformed = false;
      for(int d=0;d<Nd;d++){
        if( mask[d] ) {
          FFT_dim(result,tmp,d,sign);
          tmp=result;
          transformed = true;
        }
      }
      if ( !transformed ) result = source;
    }

    // Transforms along all Nd dimensions.
    template<class vobj>
    void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
      std::vector<int> mask(Nd,1);
      FFT_dim_mask(result,source,mask,sign);
    }

    // Transforms along the single dimension dim.  sign must be `forward` or
    // `backward`; the backward transform is scaled by 1/G, G being the full
    // extent of the grid in that dimension.  Without HAVE_FFTW this asserts.
    template<class vobj>
    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
      assert(0);
 #else
      conformable(result._grid,vgrid);
      conformable(source._grid,vgrid);
      assert(dim>=0 && dim<Nd);

      int L = vgrid->_ldimensions[dim];
      int G = vgrid->_fdimensions[dim];

      std::vector<int> layout(Nd,1);
      std::vector<int> pencil_gd(vgrid->_fdimensions);

      pencil_gd[dim] = G*processors[dim];

      // Pencil global vol LxLxGxLxL per node
      GridCartesian pencil_g(pencil_gd,layout,processors);

      // Construct pencils
      typedef typename vobj::scalar_object sobj;
      typedef typename sobj::scalar_type   scalar;

      Lattice<sobj> pgbuf(&pencil_g);

      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;

      int Ncomp = sizeof(sobj)/sizeof(scalar);
      int Nlow  = 1;
      for(int d=0;d<dim;d++){
        Nlow*=vgrid->_ldimensions[d];
      }

      int rank = 1;  /* 1d transforms */
      int n[] = {G}; /* 1d transforms of length G */
      int howmany = Ncomp;
      int odist,idist,istride,ostride;
      idist   = odist   = 1;          /* Distance between consecutive FT's */
      istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
      int *inembed = n, *onembed = n;

      scalar div;
      if ( sign == backward ) div = 1.0/G;
      else if ( sign == forward ) div = 1.0;
      else assert(0);

      // One in-place plan, created on the start of the pencil buffer and
      // re-used for every pencil through fftw_execute_dft.
      FFTW_plan p;
      {
        FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
        FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
        p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
                                             in,inembed,
                                             istride,idist,
                                             out,onembed,
                                             ostride, odist,
                                             sign,FFTW_ESTIMATE);
      }

      // Barrel shift and collect global pencil
      result = source;
      int pc = processor_coor[dim];
      // The index is named `hop` so that it does not shadow the plan `p`.
      for(int hop=0;hop<processors[dim];hop++) {
        PARALLEL_REGION
        {
          std::vector<int> cbuf(Nd);
          sobj s;

          PARALLEL_FOR_LOOP_INTERN
          for(int idx=0;idx<sgrid->lSites();idx++) {
            sgrid->LocalIndexToLocalCoor(idx,cbuf);
            peekLocalSite(s,result,cbuf);
            cbuf[dim]+=((pc+hop) % processors[dim])*L;
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
        // No shift is needed after the last slab has been collected.
        if (hop != processors[dim] - 1)
        {
          result = Cshift(result,dim,L);
        }
      }

      // Loop over orthog coords
      int NN=pencil_g.lSites();
      GridStopWatch timer;
      timer.Start();
      PARALLEL_REGION
      {
        std::vector<int> cbuf(Nd);

        PARALLEL_FOR_LOOP_INTERN
        for(int idx=0;idx<NN;idx++) {
          pencil_g.LocalIndexToLocalCoor(idx, cbuf);
          if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
            FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
            FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
            FFTW<scalar>::fftw_execute_dft(p,in,out);
          }
        }
      }
      timer.Stop();

      // performance counting
      // NOTE(review): the per-plan flops are multiplied by NN (all pencil
      // sites) although the plan is executed only on the cbuf[dim]==0 plane
      // -- confirm the intended normalisation.
      double add,mul,fma;
      FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
      flops_call = add+mul+2.0*fma;
      usec += timer.useconds();
      flops+= flops_call*NN;

      // writing out result
      PARALLEL_REGION
      {
        std::vector<int> clbuf(Nd), cgbuf(Nd);
        sobj s;

        PARALLEL_FOR_LOOP_INTERN
        for(int idx=0;idx<sgrid->lSites();idx++) {
          sgrid->LocalIndexToLocalCoor(idx,clbuf);
          cgbuf = clbuf;
          cgbuf[dim] = clbuf[dim]+L*pc;
          peekLocalSite(s,pgbuf,cgbuf);
          pokeLocalSite(s,result,clbuf);
        }
      }
      result = result*div;

      // destroying plan
      FFTW<scalar>::fftw_destroy_plan(p);
 #endif
    }
  };
 }
 #endif
@@ -0,0 +1,481 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/LinearOperator.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_LINEAR_OP_H
 #define  GRID_ALGORITHM_LINEAR_OP_H
 namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////////////
  // LinearOperators Take a something and return a something.
  /////////////////////////////////////////////////////////////////////////////////////////////
  //
  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
  //SBase
  //   i)  F(a x + b y) = aF(x) + b F(y).
  //  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
  //
  // Would be fun to have a test linearity & Herm Conj function!
  /////////////////////////////////////////////////////////////////////////////////////////////
    // Abstract interface for an operator acting on objects of type Field.
    // Every method reads `in` and writes its result into `out`.
    template<class Field> class LinearOperatorBase {
    public:
      // Virtual destructor: concrete operators are used through this
      // interface, so destroying one via a LinearOperatorBase pointer must
      // run the derived destructor.
      virtual ~LinearOperatorBase() = default;

      // Support for coarsening to a multigrid
      // NOTE(review): presumably the site-diagonal term and the term hopping
      // by `disp` in direction `dir` -- confirm against a concrete matrix.
      virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
      virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base

      // The operator and its adjoint.
      virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
      virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base

      // Hermitian form of the operator. n1 and n2 are extra scalar outputs
      // whose exact meaning is chosen by each implementation.
      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
      virtual void HermOp(const Field &in, Field &out)=0;
    };
  /////////////////////////////////////////////////////////////////////////////////////////////
  // By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
  // between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
  // the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
  // replication of code.
  //
  // I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
  // while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
  // with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
  // do it, but I fear it required multiple inheritance and mixed in abstract base classes
  /////////////////////////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////
    // Construct herm op from non-herm matrix
    ////////////////////////////////////////////////////////////////////
    // Presents a (generally non-Hermitian) Matrix through the
    // LinearOperatorBase interface; the Hermitian form is delegated to the
    // matrix's own MdagM().
    template<class Matrix,class Field>
    class MdagMLinearOperator : public LinearOperatorBase<Field> {
      Matrix &_Mat; // wrapped matrix; held by reference, not owned

    public:
      MdagMLinearOperator(Matrix &Mat)
        : _Mat(Mat)
      {}

      // Support for coarsening to a multigrid: straight forwards to the matrix.
      void OpDiag(const Field &in, Field &out)
      {
        _Mat.Mdiag(in, out);
      }

      void OpDir(const Field &in, Field &out, int dir, int disp)
      {
        _Mat.Mdir(in, out, dir, disp);
      }

      // Op applies M, AdjOp applies Mdag.
      void Op(const Field &in, Field &out)
      {
        _Mat.M(in, out);
      }

      void AdjOp(const Field &in, Field &out)
      {
        _Mat.Mdag(in, out);
      }

      // Hermitian operator: forwards to Matrix::MdagM, which also fills n1, n2.
      void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2)
      {
        _Mat.MdagM(in, out, n1, n2);
      }

      // Same as HermOpAndNorm, with the two scalar outputs thrown away.
      void HermOp(const Field &in, Field &out)
      {
        RealD ignored1, ignored2;
        HermOpAndNorm(in, out, ignored1, ignored2);
      }
    };
    ////////////////////////////////////////////////////////////////////
    // Construct herm op and shift it for mgrid smoother
    ////////////////////////////////////////////////////////////////////
    // Hermitian operator MdagM + shift built from a Matrix. Only the Hermitian
    // entry points are usable: OpDiag/OpDir/Op/AdjOp forward to the matrix and
    // then hit assert(0).
    template<class Matrix,class Field>
    class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
      Matrix &_Mat;   // wrapped matrix; held by reference, not owned
      RealD   _shift; // multiple of `in` added to MdagM*in

    public:
      ShiftedMdagMLinearOperator(Matrix &Mat, RealD shift)
        : _Mat(Mat), _shift(shift)
      {}

      // Support for coarsening to a multigrid -- trapped after the forward.
      void OpDiag(const Field &in, Field &out)
      {
        _Mat.Mdiag(in, out);
        assert(0);
      }

      void OpDir(const Field &in, Field &out, int dir, int disp)
      {
        _Mat.Mdir(in, out, dir, disp);
        assert(0);
      }

      void Op(const Field &in, Field &out)
      {
        _Mat.M(in, out);
        assert(0);
      }

      void AdjOp(const Field &in, Field &out)
      {
        _Mat.Mdag(in, out);
        assert(0);
      }

      // out = MdagM*in + shift*in. The n1/n2 values written by MdagM are then
      // replaced: n1 = Re<in|out>, n2 = norm2(out), both for the shifted result.
      void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2)
      {
        _Mat.MdagM(in, out, n1, n2);
        out = out + _shift*in;

        ComplexD overlap;
        overlap = innerProduct(in, out);
        n1 = real(overlap);
        n2 = norm2(out);
      }

      // Same as HermOpAndNorm, with the two scalar outputs thrown away.
      void HermOp(const Field &in, Field &out)
      {
        RealD ignored1, ignored2;
        HermOpAndNorm(in, out, ignored1, ignored2);
      }
    };
    ////////////////////////////////////////////////////////////////////
    // Wrap an already herm matrix
    ////////////////////////////////////////////////////////////////////
    // Wrapper for a Matrix that is treated as already Hermitian: Op, AdjOp and
    // HermOp all apply the same Matrix::M.
    template<class Matrix,class Field>
    class HermitianLinearOperator : public LinearOperatorBase<Field> {
      Matrix &_Mat; // wrapped matrix; held by reference, not owned

    public:
      HermitianLinearOperator(Matrix &Mat)
        : _Mat(Mat)
      {}

      // Support for coarsening to a multigrid
      void OpDiag(const Field &in, Field &out)
      {
        _Mat.Mdiag(in, out);
      }

      void OpDir(const Field &in, Field &out, int dir, int disp)
      {
        _Mat.Mdir(in, out, dir, disp);
      }

      void Op(const Field &in, Field &out)
      {
        _Mat.M(in, out);
      }

      // The adjoint is taken to be M itself.
      void AdjOp(const Field &in, Field &out)
      {
        _Mat.M(in, out);
      }

      // out = M*in, n1 = Re<in|out>, n2 = norm2(out).
      void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2)
      {
        _Mat.M(in, out);
        ComplexD overlap = innerProduct(in, out);
        n1 = real(overlap);
        n2 = norm2(out);
      }

      void HermOp(const Field &in, Field &out)
      {
        _Mat.M(in, out);
      }
    };
    //////////////////////////////////////////////////////////
    // Even Odd Schur decomp operators; there are several
    // ways to introduce the even odd checkerboarding
    //////////////////////////////////////////////////////////
    // Common base of the Schur (even-odd) preconditioned operators. Derived
    // classes supply Mpc and MpcDag; everything else is expressed through them.
    template<class Field>
      class SchurOperatorBase :  public LinearOperatorBase<Field> {
    public:
      // Preconditioned operator and its adjoint; each returns a RealD.
      virtual RealD Mpc   (const Field &in, Field &out) = 0;
      virtual RealD MpcDag(const Field &in, Field &out) = 0;

      // out = MpcDag(Mpc(in)); ni and no receive the values returned by the
      // Mpc and MpcDag calls respectively.
      virtual void MpcDagMpc(const Field &in, Field &out, RealD &ni, RealD &no)
      {
        Field intermediate(in._grid);
        intermediate.checkerboard = in.checkerboard;
        ni = Mpc(in, intermediate);
        no = MpcDag(intermediate, out);
      }

      // Hermitian form: tag `out` with the checkerboard of `in`, then apply
      // MpcDagMpc.
      virtual void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2)
      {
        out.checkerboard = in.checkerboard;
        MpcDagMpc(in, out, n1, n2);
      }

      // Same as HermOpAndNorm, with the two scalar outputs thrown away.
      virtual void HermOp(const Field &in, Field &out)
      {
        RealD ignored1, ignored2;
        HermOpAndNorm(in, out, ignored1, ignored2);
      }

      // Op / AdjOp drop the value returned by Mpc / MpcDag.
      void Op(const Field &in, Field &out)
      {
        Mpc(in, out);
      }

      void AdjOp(const Field &in, Field &out)
      {
        MpcDag(in, out);
      }

      // Support for coarsening to a multigrid
      void OpDiag(const Field &in, Field &out)
      {
        assert(0); // must coarsen the unpreconditioned system
      }

      void OpDir(const Field &in, Field &out, int dir, int disp)
      {
        assert(0);
      }
    };
    // Schur operator built as Mooee - Meooe * MooeeInv * Meooe.
    template<class Matrix,class Field>
      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat; // checkerboarded matrix; held by reference, not owned

    public:
      SchurDiagMooeeOperator(Matrix &Mat)
        : _Mat(Mat)
      {}

      // out = Mooee*in - Meooe*MooeeInv*Meooe*in; returns axpy_norm's result.
      // NOTE(review): assumes axpy_norm(r,a,x,y) forms r = a*x + y and returns
      // its squared norm -- confirm against its definition.
      virtual RealD Mpc(const Field &in, Field &out)
      {
        Field hop(in._grid);
        hop.checkerboard = !in.checkerboard; // scratch starts on the opposite checkerboard
        //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

        _Mat.Meooe(in, hop);
        _Mat.MooeeInv(hop, out);
        _Mat.Meooe(out, hop);
        //std::cout << "cb in " << in.checkerboard << "  cb out " << out.checkerboard << std::endl;

        _Mat.Mooee(in, out);
        return axpy_norm(out, -1.0, hop, out);
      }

      // Same chain as Mpc, with every factor replaced by its Dag variant.
      virtual RealD MpcDag(const Field &in, Field &out)
      {
        Field hop(in._grid);

        _Mat.MeooeDag(in, hop);
        _Mat.MooeeInvDag(hop, out);
        _Mat.MeooeDag(out, hop);

        _Mat.MooeeDag(in, out);
        return axpy_norm(out, -1.0, hop, out);
      }
    };
    // Schur operator of the form 1 - MooeeInv * Meooe * MooeeInv * Meooe.
    template<class Matrix,class Field>
      class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat; // checkerboarded matrix; held by reference, not owned

    public:
      SchurDiagOneOperator(Matrix &Mat)
        : _Mat(Mat)
      {}

      // out = in - MooeeInv*Meooe*MooeeInv*Meooe*in; returns axpy_norm's result.
      // `out` and the scratch field are used alternately as source and target.
      virtual RealD Mpc(const Field &in, Field &out)
      {
        Field scratch(in._grid);

        _Mat.Meooe   (in,      out);
        _Mat.MooeeInv(out,     scratch);
        _Mat.Meooe   (scratch, out);
        _Mat.MooeeInv(out,     scratch);

        return axpy_norm(out, -1.0, scratch, in);
      }

      // out = in - MeooeDag*MooeeInvDag*MeooeDag*MooeeInvDag*in.
      virtual RealD MpcDag(const Field &in, Field &out)
      {
        Field scratch(in._grid);

        _Mat.MooeeInvDag(in,      out);
        _Mat.MeooeDag   (out,     scratch);
        _Mat.MooeeInvDag(scratch, out);
        _Mat.MeooeDag   (out,     scratch);

        return axpy_norm(out, -1.0, scratch, in);
      }
    };
    // Schur operator of the form 1 - Meooe * MooeeInv * Meooe * MooeeInv.
    template<class Matrix,class Field>
      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat; // checkerboarded matrix; held by reference, not owned

    public:
      SchurDiagTwoOperator(Matrix &Mat)
        : _Mat(Mat)
      {}

      // out = in - Meooe*MooeeInv*Meooe*MooeeInv*in; returns axpy_norm's result.
      // `out` and the scratch field are used alternately as source and target.
      virtual RealD Mpc(const Field &in, Field &out)
      {
        Field scratch(in._grid);

        _Mat.MooeeInv(in,      out);
        _Mat.Meooe   (out,     scratch);
        _Mat.MooeeInv(scratch, out);
        _Mat.Meooe   (out,     scratch);

        return axpy_norm(out, -1.0, scratch, in);
      }

      // out = in - MooeeInvDag*MeooeDag*MooeeInvDag*MeooeDag*in.
      virtual RealD MpcDag(const Field &in, Field &out)
      {
        Field scratch(in._grid);

        _Mat.MeooeDag   (in,      out);
        _Mat.MooeeInvDag(out,     scratch);
        _Mat.MeooeDag   (scratch, out);
        _Mat.MooeeInvDag(out,     scratch);

        return axpy_norm(out, -1.0, scratch, in);
      }
    };
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Right-handed variant: another name for SchurDiagTwoOperator.
    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
    // Left-handed variant: another name for SchurDiagOneOperator.
    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    //  Staggered use
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Schur operator for staggered use: out = mass^2 * in - Meooe*Meooe*in.
    // Requires a matrix whose isTrivialEE() is true (asserted on construction)
    // and accumulates simple timing counters that Report() prints.
    template<class Matrix,class Field>
      class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
    protected:
      Matrix &_Mat;  // checkerboarded matrix; held by reference, not owned
      Field tmp;     // scratch field on the matrix's red-black grid
      RealD mass;    // cached _Mat.Mass()
      // Accumulated timers, in the units returned by usecond().
      double tMpc;
      double tIP;
      double tMeo;
      double taxpby_norm;
      uint64_t ncall; // incremented by HermOpAndNorm and HermOp

    public:
      // Print each accumulated timer divided by the call count.
      void Report(void)
      {
        std::cout << GridLogMessage << " HermOpAndNorm.Mpc " << tMpc/ncall        << " usec " << std::endl;
        std::cout << GridLogMessage << " HermOpAndNorm.IP  " << tIP /ncall        << " usec " << std::endl;
        std::cout << GridLogMessage << " Mpc.MeoMoe        " << tMeo/ncall        << " usec " << std::endl;
        std::cout << GridLogMessage << " Mpc.axpby_norm    " << taxpby_norm/ncall << " usec " << std::endl;
      }

      SchurStaggeredOperator(Matrix &Mat)
        : _Mat(Mat),
          tmp(_Mat.RedBlackGrid()),
          tMpc(0),
          tIP(0),
          tMeo(0),
          taxpby_norm(0),
          ncall(0)
      {
        assert( _Mat.isTrivialEE() );
        mass = _Mat.Mass();
      }

      // out = Mpc(in); n2 is Mpc's return value, n1 = Re<in|out>.
      virtual void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2)
      {
        ncall++;

        tMpc -= usecond();
        n2 = Mpc(in, out);
        tMpc += usecond();

        tIP -= usecond();
        ComplexD overlap = innerProduct(in, out);
        tIP += usecond();

        n1 = real(overlap);
      }

      // Same result field as Mpc, but via axpby (no norm is computed).
      virtual void HermOp(const Field &in, Field &out)
      {
        ncall++;

        tMpc -= usecond();
        _Mat.Meooe(in, out);
        _Mat.Meooe(out, tmp);
        tMpc += usecond();

        taxpby_norm -= usecond();
        axpby(out, -1.0, mass*mass, tmp, in);
        taxpby_norm += usecond();
      }

      // Two Meooe applications, then axpby_norm combines the result with `in`
      // using coefficients -1.0 and mass*mass; its return value is returned.
      virtual RealD Mpc(const Field &in, Field &out)
      {
        tMeo -= usecond();
        _Mat.Meooe(in, out);
        _Mat.Meooe(out, tmp);
        tMeo += usecond();

        taxpby_norm -= usecond();
        RealD result = axpby_norm(out, -1.0, mass*mass, tmp, in);
        taxpby_norm += usecond();

        return result;
      }

      // The adjoint is implemented as Mpc itself.
      virtual RealD MpcDag(const Field &in, Field &out)
      {
        return Mpc(in, out);
      }

      virtual void MpcDagMpc(const Field &in, Field &out, RealD &ni, RealD &no)
      {
        assert(0);// Never need with staggered
      }
    };
    // Shorter name for the same class template.
    template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
    /////////////////////////////////////////////////////////////
    // Base classes for functions of operators
    /////////////////////////////////////////////////////////////
    // Interface for a function of a linear operator: given Linop and an input
    // field, write the result of applying the function to `out`.
    template<class Field> class OperatorFunction {
    public:
      // Virtual destructor so implementations can be destroyed safely through
      // an OperatorFunction pointer.
      virtual ~OperatorFunction() = default;
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
    };
    // Interface for a map from one field to another: reads `in`, writes `out`.
    template<class Field> class LinearFunction {
    public:
      // Virtual destructor so implementations can be destroyed safely through
      // a LinearFunction pointer.
      virtual ~LinearFunction() = default;
      virtual void operator() (const Field &in, Field &out) = 0;
    };
    // LinearFunction that copies its input to its output unchanged.
    template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
    public:
      void operator() (const Field &in, Field &out)
      {
        out = in;
      }
    };
    /////////////////////////////////////////////////////////////
    // Base classes for Multishift solvers for operators
    /////////////////////////////////////////////////////////////
    // Interface for a function of a linear operator that produces several
    // output fields from one input (the base for multishift solvers, per the
    // section heading above).
    template<class Field> class OperatorMultiFunction {
    public:
      // Virtual destructor so implementations can be destroyed safely through
      // an OperatorMultiFunction pointer.
      virtual ~OperatorMultiFunction() = default;
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
    };
    // FIXME : To think about
    // Chroma functionality list defining LinearOperator
    /*
     virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
     virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
     virtual const Subset& subset() const = 0;
     virtual unsigned long nFlops() const { return 0; }
     virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
     class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
       const Subset& subset() const {return all;}
     };
    */
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hermitian operator Linear function and operator function
  ////////////////////////////////////////////////////////////////////////////////////////////
    // OperatorFunction that simply applies Linop.HermOp to the input.
    template<class Field>
      class HermOpOperatorFunction : public OperatorFunction<Field> {
    public:
      // Made public (it was private by class default) so the functor can be
      // called on a HermOpOperatorFunction object directly, matching the
      // sibling PlainHermOp / FunctionHermOp wrappers.
      void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
        Linop.HermOp(in,out);
      }
    };
    // LinearFunction view of a linear operator: calling it applies HermOp.
    template<typename Field>
      class PlainHermOp : public LinearFunction<Field> {
    public:
      LinearOperatorBase<Field> &_Linop; // operator being wrapped; not owned

      PlainHermOp(LinearOperatorBase<Field>& linop)
        : _Linop(linop)
      {}

      // out = HermOp(in)
      void operator()(const Field& in, Field& out)
      {
        _Linop.HermOp(in, out);
      }
    };
    // LinearFunction that binds an OperatorFunction to a fixed linear operator:
    // calling it evaluates _poly(_Linop, in, out).
    template<typename Field>
    class FunctionHermOp : public LinearFunction<Field> {
    public:
      OperatorFunction<Field>   & _poly;  // function of the operator; not owned
      LinearOperatorBase<Field> &_Linop;  // operator it is applied to; not owned

      FunctionHermOp(OperatorFunction<Field> & poly, LinearOperatorBase<Field>& linop)
        : _poly(poly),
          _Linop(linop)
      {}

      void operator()(const Field& in, Field& out)
      {
        _poly(_Linop, in, out);
      }
    };
  // Polynomial in a linear operator, in the monomial basis:
  //   out = sum_n Coeffs[n] * HermOp^n (in)
  template<class Field>
  class Polynomial : public OperatorFunction<Field> {
  private:
    std::vector<RealD> Coeffs; // Coeffs[n] multiplies the n-th power of HermOp
  public:
    // Takes the coefficients by const reference (they are only copied), so
    // const vectors and temporaries are accepted as well as lvalues.
    Polynomial(const std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { }

    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
      // Coeffs[0] is read unconditionally below; an empty coefficient list is
      // a usage error rather than an out-of-bounds read.
      assert(!Coeffs.empty());

      Field AtoN(in._grid); // running power HermOp^n (in)
      Field Mtmp(in._grid); // copy of the previous power, used as HermOp input

      AtoN = in;
      out = AtoN*Coeffs[0];

      for(std::vector<RealD>::size_type n=1;n<Coeffs.size();n++){
        Mtmp = AtoN;
        Linop.HermOp(Mtmp,AtoN);
        out=out+AtoN*Coeffs[n];
      }
    }
  };
 }
 #endif
@@ -0,0 +1,46 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/Preconditioner.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PRECONDITIONER_H
 #define GRID_PRECONDITIONER_H
 namespace Grid {
  // Interface for a preconditioner: a LinearFunction taking a source `src`
  // and writing its result to `psi`.
  template<class Field> class Preconditioner :  public LinearFunction<Field> { 
  public:
    // Declared public (it was private by class default) so a preconditioner
    // can be applied through a Preconditioner reference, not only through the
    // LinearFunction base.
    virtual void operator()(const Field &src, Field & psi)=0;
  };
  // Preconditioner that does nothing: the output is a copy of the source.
  template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
  public:
    TrivialPrecon(void) {}

    // psi = src
    void operator()(const Field &src, Field & psi)
    {
      psi = src;
    }
  };
 }
 #endif
@@ -0,0 +1,71 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/SparseMatrix.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H
 namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////////////
  // Interface defining what I expect of a general sparse matrix, such as a Fermion action
  /////////////////////////////////////////////////////////////////////////////////////////////
    // Interface of a general sparse matrix acting on Field objects.
    template<class Field> class SparseMatrixBase {
    public:
      // Virtual destructor so concrete matrices can be destroyed safely
      // through a SparseMatrixBase pointer.
      virtual ~SparseMatrixBase() = default;

      // Grid the matrix is defined on.
      virtual GridBase *Grid(void) =0;

      // Full checkerboard operations: M and Mdag each return a RealD.
      virtual RealD M    (const Field &in, Field &out)=0;
      virtual RealD Mdag (const Field &in, Field &out)=0;

      // Default MdagM: out = Mdag(M(in)), with ni and no set to the values
      // returned by M and Mdag respectively.
      virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
        Field tmp (in._grid);
        ni=M(in,tmp);
        no=Mdag(tmp,out);
      }

      // NOTE(review): presumably the site-diagonal term and the term hopping
      // by `disp` in direction `dir` -- confirm against a concrete matrix.
      virtual  void Mdiag    (const Field &in, Field &out)=0;
      virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
    };
  /////////////////////////////////////////////////////////////////////////////////////////////
  // Interface augmented by a red black sparse matrix, such as a Fermion action
  /////////////////////////////////////////////////////////////////////////////////////////////
    // Extends SparseMatrixBase with the half-checkerboard pieces used by the
    // Schur (even-odd) operators.
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
    public:
      // Grid for fields restricted to one checkerboard.
      virtual GridBase *RedBlackGrid(void)=0;
      // half checkerboard operations
      // NOTE(review): presumably Meooe is the term connecting opposite
      // checkerboards and Mooee the term within one checkerboard; the Inv and
      // Dag suffixes presumably denote inverse and adjoint -- confirm against
      // a concrete implementation.
      virtual  void Meooe    (const Field &in, Field &out)=0;
      virtual  void Mooee    (const Field &in, Field &out)=0;
      virtual  void MooeeInv (const Field &in, Field &out)=0;
      virtual  void MeooeDag    (const Field &in, Field &out)=0;
      virtual  void MooeeDag    (const Field &in, Field &out)=0;
      virtual  void MooeeInvDag (const Field &in, Field &out)=0;
    };
 }
 #endif
@@ -0,0 +1,377 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/Chebyshev.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <clehner@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H
 #include <Grid/algorithms/LinearOperator.h>
 namespace Grid {
 // Serializable parameter bundle for a Chebyshev approximation. The
 // Chebyshev(ChebyParams) constructor passes (alpha, beta, Npoly) to
 // Chebyshev::Init as (lo, hi, order).
 struct ChebyParams : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
 				  RealD, alpha,  
 				  RealD, beta,   
 				  int, Npoly);
 };
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Generic Chebyshev approximations
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Chebyshev-series function of a Hermitian operator:
  //   f(x) ~ 0.5*c[0] + sum_{n=1}^{order-1} c[n] T_n(y),
  //   y = (x - 0.5*(hi+lo)) / (0.5*(hi-lo)).
  // The scalar members (approx, approxD, ...) evaluate the series for a real
  // argument; operator() applies the same series to a field through
  // LinearOperatorBase::HermOp.
  template<class Field>
  class Chebyshev : public OperatorFunction<Field> {
  private:
    std::vector<RealD> Coeffs; // series coefficients c[0..order-1]
    int order;                 // number of coefficients; Init requires >= 2
    RealD hi;                  // upper end of the approximation interval
    RealD lo;                  // lower end of the approximation interval

  public:
    // Write "x f(x)" pairs over [lo,hi). The step starts at 1e-9*(hi-lo) and
    // grows by a factor 1.1 every iteration.
    void csv(std::ostream &out){
      RealD delta = (hi-lo)*1.0e-9;
      for (RealD x=lo; x<hi; x+=delta) {
        delta*=1.1;
        RealD f = approx(x);
        out<< x<<" "<<f<<std::endl;
      }
      return;
    }

    // Convenience for plotting the approximation: 50 equal steps over [lo,hi).
    void   PlotApprox(std::ostream &out) {
      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
      for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
        out <<x<<"\t"<<approx(x)<<std::endl;
      }
    };

    // NOTE: the default constructor leaves order/lo/hi unset; call Init
    // before using the object.
    Chebyshev(){};
    Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
    Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};

    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // CJ: the one we need for Lanczos
    // Sets every coefficient to zero except the highest, which is 1. Exits
    // the program if _order < 2.
    void Init(RealD _lo,RealD _hi,int _order)
    {
      lo=_lo;
      hi=_hi;
      order=_order;

      if(order < 2) exit(-1);
      // vector::assign takes (count, value). The previous call had the two
      // arguments swapped, which emptied the vector and made the write to
      // Coeffs[order-1] out of bounds.
      Coeffs.assign(order, 0.);
      Coeffs[order-1] = 1.;
    };

    // Computes the coefficients of func on [_lo,_hi] by sampling it at the
    // `order` points x_k = 0.5*(cos(pi*(k+0.5)/order)*(hi-lo) + (hi+lo)).
    // Exits the program if _order < 2.
    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
    {
      lo=_lo;
      hi=_hi;
      order=_order;

      if(order < 2) exit(-1);
      Coeffs.resize(order);
      for(int j=0;j<order;j++){
        RealD s=0;
        for(int k=0;k<order;k++){
          RealD y=std::cos(M_PI*(k+0.5)/order);
          RealD x=0.5*(y*(hi-lo)+(hi+lo));
          RealD f=func(x);
          s=s+f*std::cos( j*M_PI*(k+0.5)/order );
        }
        Coeffs[j] = s * 2.0/order;
      }
    };

    // Damp the coefficients Coeffs[1..order-1] by the factors g[m] computed
    // below.
    void JacksonSmooth(void){
      const int M=order;
      RealD alpha = M_PI/(M+2);
      RealD lmax = std::cos(alpha);
      RealD sumUsq =0;

      // The loops below index 0..M inclusive, so M+1 entries are needed; the
      // previous size-M vectors were overrun at index M.
      std::vector<RealD> U(M+1);
      std::vector<RealD> a(M+1);
      std::vector<RealD> g(M+1);
      for(int n=0;n<=M;n++){
        U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
        sumUsq += U[n]*U[n];
      }
      sumUsq = std::sqrt(sumUsq);

      // NOTE(review): a[0] is never assigned and stays 0, as in the original
      // code -- confirm whether a[0] = U[0]/sumUsq was intended.
      for(int i=1;i<=M;i++){
        a[i] = U[i]/sumUsq;
      }
      g[0] = 1.0;
      for(int m=1;m<=M;m++){
        g[m] = 0;
        for(int i=0;i<=M-m;i++){
          g[m]+= a[i]*a[m+i];
        }
      }

      // Coeffs holds `order` entries (indices 0..order-1); the previous loop
      // bound m<=M wrote one element past the end.
      for(int m=1;m<order;m++){
        Coeffs[m]*=g[m];
      }
    }

    // Evaluate the series at x with the recurrence T_{n+1} = 2 y T_n - T_{n-1}.
    RealD approx(RealD x) // Convenience for plotting the approximation
    {
      RealD Tn;
      RealD Tnm;
      RealD Tnp;

      RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

      RealD T0=1;
      RealD T1=y;

      RealD sum;
      sum = 0.5*Coeffs[0]*T0;
      sum+= Coeffs[1]*T1;

      Tn =T1;
      Tnm=T0;
      for(int i=2;i<order;i++){
        Tnp=2*y*Tn-Tnm;
        Tnm=Tn;
        Tn =Tnp;
        sum+= Tn*Coeffs[i];
      }
      return sum;
    };

    // Derivative of the series with respect to x:
    //   sum_n n*c[n]*U_{n-1}(y) / (0.5*(hi-lo)),
    // with U_n generated by U_{n+1} = 2 y U_n - U_{n-1}.
    RealD approxD(RealD x)
    {
      RealD Un;
      RealD Unm;
      RealD Unp;

      RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

      RealD U0=1;
      RealD U1=2*y;

      RealD sum;
      sum = Coeffs[1]*U0;
      // Coeffs[2] exists only when order > 2; with order == 2 the series
      // stops at c[1] and reading Coeffs[2] would be out of bounds.
      if (order > 2) {
        sum+= Coeffs[2]*U1*2.0;
      }

      Un =U1;
      Unm=U0;
      for(int i=2;i<order-1;i++){
        Unp=2*y*Un-Unm;
        Unm=Un;
        Un =Unp;
        sum+= Un*Coeffs[i+1]*(i+1.0);
      }
      return sum/(0.5*(hi-lo));
    };

    // Newton iteration for approx(x) == z starting from x0. Returns x once
    // |(approx(x)-z)/z| < resid, or quiet NaN after maxiter iterations.
    RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
      RealD x = x0;
      RealD eps;

      int i;
      for (i=0;i<maxiter;i++) {
        eps = approx(x) - z;
        if (fabs(eps / z) < resid)
          return x;
        x = x - eps / approxD(x);
      }

      return std::numeric_limits<double>::quiet_NaN();
    }

    // Implement the required interface: apply the series to `in` using
    // Linop.HermOp as the operator.
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

      GridBase *grid=in._grid;

      // std::cout << "Chebyshev(): in._grid="<<in._grid<<std::endl;
      //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;

      Field T0(grid); T0 = in;
      Field T1(grid);
      Field T2(grid);
      Field y(grid);

      Field *Tnm = &T0;
      Field *Tn  = &T1;
      Field *Tnp = &T2;

      // Tn=T1 = (xscale M + mscale)in
      RealD xscale = 2.0/(hi-lo);
      RealD mscale = -(hi+lo)/(hi-lo);
      Linop.HermOp(T0,y);
      T1=y*xscale+in*mscale;

      // sum = .5 c[0] T0 + c[1] T1
      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
      for(int n=2;n<order;n++){

        Linop.HermOp(*Tn,y);

        y=xscale*y+mscale*(*Tn);

        *Tnp=2.0*y-(*Tnm);

        out=out+Coeffs[n]* (*Tnp);

        // Cycle pointers to avoid copies
        Field *swizzle = Tnm;
        Tnm    =Tn;
        Tn     =Tnp;
        Tnp    =swizzle;

      }
    }
  };
  // Chebyshev series in the variable
  //   x = ( 2*(A - mu)^2 - (alpha^2 + beta^2) ) / (alpha^2 - beta^2),
  // with every coefficient zero except the highest, which is 1.
  // NOTE(review): the Chebyshev base is default-constructed, so its own
  // (private) Coeffs/order/lo/hi are never set; only the members declared
  // here are used by the methods below.
  template<class Field>
  class ChebyshevLanczos : public Chebyshev<Field> {
  private:
    std::vector<RealD> Coeffs; // series coefficients c[0..order-1]
    int order;                 // number of coefficients; must be >= 2
    RealD alpha;
    RealD beta;
    RealD mu;

  public:
    // Exits the program if _order < 2, matching Chebyshev::Init: approx() and
    // operator() read Coeffs[1] unconditionally.
    ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
      order(_order),
      alpha(_alpha),
      beta(_beta),
      mu(_mu)
    {
      if(order < 2) exit(-1);
      Coeffs.assign(order, 0.0);
      Coeffs[order-1]=1.0;
    };

    // Write "x f(x)" pairs for x in [-1.2*alpha, 1.2*alpha) in steps of
    // 2*alpha/10000.
    void csv(std::ostream &out){
      for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
        RealD f = approx(x);
        out<< x<<" "<<f<<std::endl;
      }
      return;
    }

    // Evaluate the series for a real argument xx.
    RealD approx(RealD xx) // Convenience for plotting the approximation
    {
      RealD Tn;
      RealD Tnm;
      RealD Tnp;

      // RealD throughout, consistent with AminusMuSq; these were previously
      // declared as Real.
      RealD aa = alpha * alpha;
      RealD bb = beta  *  beta;

      RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);

      RealD y= x;

      RealD T0=1;
      RealD T1=y;

      RealD sum;
      sum = 0.5*Coeffs[0]*T0;
      sum+= Coeffs[1]*T1;

      Tn =T1;
      Tnm=T0;
      for(int i=2;i<order;i++){
        Tnp=2*y*Tn-Tnm;
        Tnm=Tn;
        Tn =Tnp;
        sum+= Tn*Coeffs[i];
      }
      return sum;
    };

    // shift_Multiply in Rudy's code
    // out = (2/(aa-bb)) * (HermOp - mu)^2 in  -  ((aa+bb)/(aa-bb)) * in
    void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
    {
      GridBase *grid=in._grid;
      Field tmp(grid);

      RealD aa= alpha*alpha;
      RealD bb= beta * beta;

      Linop.HermOp(in,out);
      out = out - mu*in;

      Linop.HermOp(out,tmp);
      tmp = tmp - mu * out;

      out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in;
    };

    // Implement the required interface: apply the series to `in`, using
    // AminusMuSq as the argument of the Chebyshev recurrence.
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

      GridBase *grid=in._grid;

      Field T0(grid); T0 = in;
      Field T1(grid);
      Field T2(grid);
      Field  y(grid);

      Field *Tnm = &T0;
      Field *Tn  = &T1;
      Field *Tnp = &T2;

      // Tn=T1 = AminusMuSq applied to in
      AminusMuSq(Linop,T0,T1);

      // sum = .5 c[0] T0 + c[1] T1
      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
      for(int n=2;n<order;n++){

        AminusMuSq(Linop,*Tn,y);

        *Tnp=2.0*y-(*Tnm);

        out=out+Coeffs[n]* (*Tnp);

        // Cycle pointers to avoid copies
        Field *swizzle = Tnm;
        Tnm    =Tn;
        Tn     =Tnp;
        Tnp    =swizzle;

      }
    }
  };
 }
 #endif
@@ -0,0 +1,152 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/approx/Forecast.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef INCLUDED_FORECAST_H
 #define INCLUDED_FORECAST_H
 namespace Grid {
  // Abstract base class.
  // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
  // and returns a forecasted solution to the system D*psi = phi (psi).
  template<class Matrix, class Field>
  class Forecast
  {
    public:
      // Virtual destructor so that destroying an implementation through a
      // Forecast pointer or reference is well defined.
      virtual ~Forecast(){}
      // Return a forecasted solution psi of D*psi = phi, given the matrix Mat,
      // the source phi and a vector of Fields chi.
      virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
  };
  // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
  // used to forecast solutions across poles of the EOFA heatbath.
  //
  // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
  template<class Matrix, class Field>
  class ChronoForecast : public Forecast<Matrix,Field>
  {
    public:
      // Forecast an initial guess chi for (Mdag M) chi = phi from previous solutions.
      //
      // The previous solutions are orthonormalized into a basis v[i]; the projected
      // system  G a = b  with  G[i][j] = <v[i], Mdag M v[j]>  and  b[i] = <v[i], phi>
      // is solved by elimination and chi = sum_i a[i] v[i] is returned.
      //
      //   Mat        : operator providing M(in,out) and Mdag(in,out)
      //   phi        : source vector
      //   prev_solns : previous solutions; an empty vector yields chi = zero and a
      //                single entry is returned unchanged
      //
      // Logs |phi - Mdag M chi| / |phi| before returning.
      Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
      {
        int degree = prev_solns.size();
        Field chi(phi); // forecasted solution

        // Trivial cases
        if(degree == 0){ chi = zero; return chi; }
        else if(degree == 1){ return prev_solns[0]; }

        ComplexD xp;
        Field r(phi); // residual
        Field Mv(phi);
        std::vector<Field> v(prev_solns); // orthonormalized previous solutions
        std::vector<Field> MdagMv(degree,phi);

        // Array to hold the matrix elements
        std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));

        // Solution and source vectors
        std::vector<ComplexD> a(degree);
        std::vector<ComplexD> b(degree);

        // Orthonormalize the vector basis: normalize v[i], then remove its
        // component from every later vector
        for(int i=0; i<degree; i++){
          v[i] *= 1.0/std::sqrt(norm2(v[i]));
          for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
        }

        // Perform sparse matrix multiplication and construct rhs
        for(int i=0; i<degree; i++){
          b[i] = innerProduct(v[i],phi);
          Mat.M(v[i],Mv);
          Mat.Mdag(Mv,MdagMv[i]);
          G[i][i] = innerProduct(v[i],MdagMv[i]);
        }

        // Construct the matrix: fill the upper triangle and mirror it by conjugation
        for(int j=0; j<degree; j++){
        for(int k=j+1; k<degree; k++){
          G[j][k] = innerProduct(v[j],MdagMv[k]);
          G[k][j] = std::conj(G[j][k]);
        }}

        // Gauss-Jordan elimination with partial pivoting
        for(int i=0; i<degree; i++){

          // Perform partial pivoting (candidate rows are compared by their
          // diagonal entries). Only equations are swapped, so a[i] still
          // multiplies v[i] below.
          int k = i;
          for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
          if(k != i){
            xp = b[k];
            b[k] = b[i];
            b[i] = xp;
            for(int j=0; j<degree; j++){
              xp = G[k][j];
              G[k][j] = G[i][j];
              G[i][j] = xp;
            }
          }

          // Convert matrix to upper triangular form
          for(int j=i+1; j<degree; j++){
            xp = G[j][i]/G[i][i];
            b[j] -= xp * b[i];
            for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
          }
        }

        // Back-substitute to solve the equations and accumulate the initial guess
        // together with its residual r = phi - sum_i a[i] MdagMv[i]
        chi = zero;
        r = phi;
        for(int i=degree-1; i>=0; i--){
          a[i] = 0.0;
          for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
          a[i] = (b[i]-a[i])/G[i][i];
          chi += a[i]*v[i];
          r -= a[i]*MdagMv[i];
        }

        RealD error = std::sqrt(norm2(r)/norm2(phi));
        std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;

        return chi;
      };
  };
 }
 #endif
@@ -1,6 +1,5 @@
 MIT License
-Copyright (c) 2012-2016 GitHub, Inc.
+Copyright (c) 2011 Michael Clark
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -9,13 +8,14 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
+The above copyright notice and this permission notice shall be included in
-copies or substantial portions of the Software.
+all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-SOFTWARE.
+THE SOFTWARE.
@@ -0,0 +1,56 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/MultiShiftFunction.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid {
 double MultiShiftFunction::approx(double x)
 {
  double a = norm;
  for(int n=0;n<poles.size();n++){
    a = a + residues[n]/(x+poles[n]);
  }
  return a;
 }
 void MultiShiftFunction::gnuplot(std::ostream &out)
 {
  out<<"f(x) = "<<norm<<"";
  for(int n=0;n<poles.size();n++){
    out<<"+("<<residues[n]<<"/(x+"<<poles[n]<<"))";
  }
  out<<";"<<std::endl;
 }
 // Write comma separated rows "x,sqrt(x),approx(x),sqrt(x)-approx(x)" for
 // geometrically spaced x in [lo,hi): x starts at lo and grows by 5% per row.
 // Nothing is written when lo <= 0.
 void MultiShiftFunction::csv(std::ostream &out)
 {
  // x *= 1.05 only moves towards hi for x > 0: x == 0 stays at 0 and a
  // negative x runs away from hi, so the loop below would never terminate.
  if ( lo <= 0.0 ) return;
  for (double x=lo; x<hi; x*=1.05) {
    double f = approx(x);
    double r = sqrt(x);
    out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
  }
  return;
 }
 }
@@ -0,0 +1,67 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/MultiShiftFunction.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef MULTI_SHIFT_FUNCTION
 #define MULTI_SHIFT_FUNCTION
 namespace Grid {
 class MultiShiftFunction {
 public:
  int order;
  std::vector<RealD> poles;
  std::vector<RealD> residues;
  std::vector<RealD> tolerances;
  RealD norm;
  RealD lo,hi;
  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
  void Init(AlgRemez & remez,double tol,bool inverse) 
  {
    order=remez.getDegree();
    tolerances.resize(remez.getDegree(),tol);
    poles.resize(remez.getDegree());
    residues.resize(remez.getDegree());
    remez.getBounds(lo,hi);
    if ( inverse ) remez.getIPFE (&residues[0],&poles[0],&norm);
    else           remez.getPFE (&residues[0],&poles[0],&norm);
  }
  // Allow deferred initialisation
  MultiShiftFunction(void){};
  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse)
  {
    Init(remez,tol,inverse);
  }
 };
 }
 #endif
@@ -0,0 +1,80 @@
 -----------------------------------------------------------------------------------
 PAB. Took Mike Clark's AlgRemez from GitHub and (modified a little) include.
 This is open source and license and readme and comments are preserved consistent
 with the license. Mike, thank you!
 -----------------------------------------------------------------------------------
 -----------------------------------------------------------------------------------
 AlgRemez
 The archive downloadable here contains an implementation of the Remez
 algorithm which calculates optimal rational (and polynomial)
 approximations to the nth root over a given spectral range.  The Remez
 algorithm, although in principle extremely straightforward to
 program, is quite difficult to get completely correct, e.g., the Maple
 implementation of the algorithm does not always converge to the
 correct answer.
 To use this algorithm you need to install GMP, the GNU Multiple
 Precision Library, and when configuring the install, you must include
 the --enable-mpfr option (see the GMP manual for more details).  You
 also have to edit the Makefile for AlgRemez appropriately for your
 system, namely to point correctly to the location of the GMP library.
 The simple main program included with this archive invokes the
 AlgRemez class to calculate an approximation given by command line
 arguments.  It is invoked by the following
 ./test y z n d lambda_low lambda_high precision,
 where the function to be approximated is f(x) = x^(y/z), with degree
 (n,d) over the spectral range [lambda_low, lambda_high], using
 precision digits of precision in the arithmetic.  So an example would
 be
 ./test 1 2 5 5 0.0004 64 40
 which corresponds to constructing a rational approximation to the
 square root function, with degree (5,5) over the range [0.0004,64]
 with 40 digits of precision used for the arithmetic.  The parameters y
 and z must be positive, the approximation to f(x) = x^(-y/z) is simply
 the inverse of the approximation to f(x) = x^(y/z).  After the
 approximation has been constructed, the roots and poles of the
 rational function are found, and then the partial fraction expansion
 of both the rational function and its inverse are found, the results
 of which are output to a file called "approx.dat".  In addition, the
 error function of the approximation is output to "error.dat", where it
 can be checked that the resultant approximation satisfies Chebychev's
 criterion, namely all error maxima are equal in magnitude, and
 adjacent maxima are opposite in sign.  There are some caveats here
 however, the optimal polynomial approximation has complex roots, and
 the root finding implemented here cannot (yet) handle complex roots.
 In addition, the partial fraction expansion of rational approximations
 is only found for the case n = d, i.e., the degree of numerator
 polynomial equals that of the denominator polynomial.  The convention
 for the partial fraction expansion is that polar shifts are always
 written added to x, not subtracted.
 To do list
 1.  Include an exponential dampening factor in the function to be
 approximated.  This may sound trivial to implement, but for some
 parameters, the algorithm seems to break down.  Also, the roots in the
 rational approximation sometimes become complex, which currently
 breaks the stupidly simple root finding code.
 2. Make the algorithm faster - it's too slow when running on qcdoc.
 3. Add complex root finding.
 4. Add more options for error minimisation - currently the code
 minimises the relative error, should add options for absolute error,
 and other norms.
 There will be a forthcoming publication concerning the results
 generated by this software, but in the meantime, if you use this
 software, please cite it as
 "M.A. Clark and A.D. Kennedy, https://github.com/mikeaclark/AlgRemez, 2005".
 If you have any problems using the software, then please email scientist.mike@gmail.com.
@@ -0,0 +1,760 @@
 /*
  Mike Clark - 25th May 2005
  alg_remez.C
  AlgRemez is an implementation of the Remez algorithm, which in this
  case is used for generating the optimal nth root rational
  approximation.
  Note this class requires the gnu multiprecision (GNU MP) library.
 */
 #include<math.h>
 #include<stdio.h>
 #include<stdlib.h>
 #include<string>
 #include<iostream>
 #include<iomanip>
 #include<cassert>
 #include<Grid/algorithms/approx/Remez.h>
 // Constructor: record the working precision and the approximation interval
 // [lower,upper]. No arrays are allocated and no approximation exists until
 // generateApprox() is called.
 AlgRemez::AlgRemez(double lower, double upper, long precision) 
 {
  // Set the default bigfloat precision before any bigfloat member is assigned
  prec = precision;
  bigfloat::setDefaultPrecision(prec);

  // Nothing allocated, no degree chosen, no roots found yet
  alloc = 0;
  foundRoots = 0;
  n = 0;
  d = 0;

  // Only require the approximation spread to be less than 1 ulp
  tolerance = 1e-15;

  // Interval and its width
  apstrt = lower;
  apend = upper;
  apwidt = apend - apstrt;

  std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
  std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
 }
 // Destructor: release the work arrays, but only if allocate() created them
 AlgRemez::~AlgRemez()
 {
  if (!alloc) return;

  // Exponential summation coefficients
  delete [] a;
  delete [] a_power;

  // Degree-dependent arrays
  delete [] mm;
  delete [] xx;
  delete [] poles;
  delete [] roots;
  delete [] param;
 }
 // (Re)allocate the degree-dependent work arrays for the given numerator and
 // denominator degrees. The exponential coefficient arrays have a fixed size
 // (SUM_MAX) and are created on the first call only.
 void AlgRemez::allocate(int num_degree, int den_degree)
 {
  const int ncoeff = num_degree+den_degree+1;

  if (alloc) {
    // Arrays have previously been allocated: release the degree-dependent ones
    delete [] mm;
    delete [] xx;
    delete [] poles;
    delete [] roots;
    delete [] param;
  } else {
    // First call: the coefficients of the sum in the exponential
    a = new bigfloat[SUM_MAX];
    a_power = new int[SUM_MAX];
  }

  // Note use of new and delete in memory allocation - cannot run on qcdsp
  param = new bigfloat[ncoeff];
  roots = new bigfloat[num_degree];
  poles = new bigfloat[den_degree];
  xx = new bigfloat[ncoeff+2];
  mm = new bigfloat[ncoeff+1];

  alloc = 1;
 }
 // Replace the approximation interval by [lower,upper] and refresh its width
 void AlgRemez::setBounds(double lower, double upper)
 {
  apend  = upper;
  apstrt = lower;
  apwidt = apend - apstrt;
 }
 // Generate the rational approximation x^(pnum/pden) using the same degree
 // for numerator and denominator
 double AlgRemez::generateApprox(int degree, unsigned long pnum, 
                                 unsigned long pden)
 {
  const int num_degree = degree;
  const int den_degree = degree;
  return generateApprox(num_degree, den_degree, pnum, pden);
 }
 // Generate the rational approximation x^(pnum/pden) of degree
 // (num_degree,den_degree) without any exponential summation terms
 double AlgRemez::generateApprox(int num_degree, int den_degree, 
                                 unsigned long pnum, unsigned long pden)
 {
  // Zero-length coefficient list, passed as null arrays
  return generateApprox(num_degree, den_degree, pnum, pden, 0,
                        static_cast<double *>(0), static_cast<int *>(0));
 }
 // Generate the rational approximation x^(pnum/pden)
 double AlgRemez::generateApprox(int num_degree, int den_degree, 
 				unsigned long pnum, unsigned long pden,
 				int a_len, double *a_param, int *a_pow)
 {
  std::cout<<"Degree of the approximation is ("<<num_degree<<","<<den_degree<<")\n";
  std::cout<<"Approximating the function x^("<<pnum<<"/"<<pden<<")\n";
  // Reallocate arrays, since degree has changed
  if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
  assert(a_len<=SUM_MAX);
  step = new bigfloat[num_degree+den_degree+2];
  a_length = a_len;
  for (int j=0; j<a_len; j++) {
    a[j]= a_param[j];
    a_power[j] = a_pow[j];
  }
  power_num = pnum;
  power_den = pden;
  spread = 1.0e37;
  iter = 0;
  n = num_degree;
  d = den_degree;
  neq = n + d + 1;
  initialGuess();
  stpini(step);
  while (spread > tolerance) { //iterate until convergance
    if (iter++%100==0) 
      std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta<<std::endl; 
    equations();
    if (delta < tolerance) {
      std::cout<<"Delta too small, try increasing precision\n";
      assert(0);
    };    
    assert( delta>= tolerance);
    search(step);
  }
  int sign;
  double error = (double)getErr(mm[0],&sign);
  std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
  // Once the approximation has been generated, calculate the roots
  if(!root()) {
    std::cout<<"Root finding failed\n";
  } else {
    foundRoots = 1;
  }
  delete [] step;
  // Return the maximum error in the approximation
  return error;
 }
 // Return the partial fraction expansion of the approximation x^(pnum/pden)
 int AlgRemez::getPFE(double *Res, double *Pole, double *Norm) {
  if (n!=d) {
    std::cout<<"Cannot handle case: Numerator degree neq Denominator degree\n";
    return 0;
  }
  if (!alloc) {
    std::cout<<"Approximation not yet generated\n";
    return 0;
  }
  if (!foundRoots) {
    std::cout<<"Roots not found, so PFE cannot be taken\n";
    return 0;
  }
  bigfloat *r = new bigfloat[n];
  bigfloat *p = new bigfloat[d];
  for (int i=0; i<n; i++) r[i] = roots[i];
  for (int i=0; i<d; i++) p[i] = poles[i];
  // Perform a partial fraction expansion
  pfe(r, p, norm);
  // Convert to double and return
  *Norm = (double)norm;
  for (int i=0; i<n; i++) Res[i] = (double)r[i];
  for (int i=0; i<d; i++) Pole[i] = (double)p[i];
  delete [] r;
  delete [] p;
  // Where the smallest shift is located
  return 0;
 }
 // Return the partial fraction expansion of the approximation x^(-pnum/pden)
 int AlgRemez::getIPFE(double *Res, double *Pole, double *Norm) {
  if (n!=d) {
    std::cout<<"Cannot handle case: Numerator degree neq Denominator degree\n";
    return 0;
  }
  if (!alloc) {
    std::cout<<"Approximation not yet generated\n";
    return 0;
  }
  if (!foundRoots) {
    std::cout<<"Roots not found, so PFE cannot be taken\n";
    return 0;
  }
  bigfloat *r = new bigfloat[d];
  bigfloat *p = new bigfloat[n];
  // Want the inverse function
  for (int i=0; i<n; i++) {
    r[i] = poles[i];
    p[i] = roots[i];
  }
  // Perform a partial fraction expansion
  pfe(r, p, (bigfloat)1l/norm);
  // Convert to double and return
  *Norm = (double)((bigfloat)1l/(norm));
  for (int i=0; i<n; i++) {
    Res[i] = (double)r[i];
    Pole[i] = (double)p[i];
  }
  delete [] r;
  delete [] p;
  // Where the smallest shift is located
  return 0;
 }
 // Initial values of maximal and minimal errors.
 // Fills mm[0..neq] (starting positions of the error extrema) and xx[0..neq]
 // (starting positions of the solution points) from Chebyshev-like cosine
 // nodes that are warped by (exp(r)-1)/(e-1) and scaled onto [apstrt,apend].
 void AlgRemez::initialGuess() {
  // Supply initial guesses for solution points
  long ncheb = neq;			// Degree of Chebyshev error estimate
  bigfloat a, r;  // NB: this local `a` hides the member array of the same name
  // Find ncheb+1 extrema of Chebyshev polynomial
  a = ncheb;
  mm[0] = apstrt;  // first extremum pinned to the lower bound
  for (long i = 1; i < ncheb; i++) {
    r = 0.5 * (1 - cos((M_PI * i)/(double) a));  // cosine node in [0,1]
    //r *= sqrt_bf(r);
    r = (exp((double)r)-1.0)/(exp(1.0)-1.0);     // warp; 0 and 1 stay fixed
    mm[i] = apstrt + r * apwidt;
  }
  mm[ncheb] = apend;  // last extremum pinned to the upper bound
  // Solution points: same construction using the odd multiples (2i+1) of pi/(2*ncheb)
  a = 2.0 * ncheb;
  for (long i = 0; i <= ncheb; i++) {
    r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
    //r *= sqrt_bf(r); // Squeeze to low end of interval
    r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
    xx[i] = apstrt + r * apwidt;
  }
 }
 // Initialise step sizes: delta starts at 0.25 and step[i] is the gap between
 // consecutive solution points (step[0] is measured from the lower bound,
 // step[neq] repeats step[neq-1]). Also sets xx[neq+1] to the upper bound.
 void AlgRemez::stpini(bigfloat *step) {
  delta = 0.25;
  xx[neq+1] = apend;

  step[0] = xx[0] - apstrt;
  int i = 1;
  while (i < neq) {
    step[i] = xx[i] - xx[i-1];
    ++i;
  }
  step[neq] = step[neq-1];
 }
 // Search for error maxima and minima.
 // For each of the neq+1 intervals between consecutive solution points xx[],
 // walk the extremum position mm[i] in steps of step[i] while the error
 // magnitude returned by getErr() does not decrease. Afterwards update the
 // relative spread of the extremal errors (spread), halve delta if the spread
 // did not shrink, and recompute step[] and the solution points xx[].
 void AlgRemez::search(bigfloat *step) {
  bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
  int i, j, meq, emsign, ensign, steps;
  meq = neq + 1;  // number of extrema searched for
  bigfloat *yy = new bigfloat[meq];  // error magnitude at each extremum
  bigfloat eclose = 1.0e30;  // running minimum of the extremal errors
  bigfloat farther = 0l;     // running maximum of the extremal errors
  j = 1;  // (value is not used below)
  xx0 = apstrt;  // left boundary of the current interval
  for (i = 0; i < meq; i++) {
    steps = 0;
    xx1 = xx[i]; // Next zero
    if (i == meq-1) xx1 = apend;
    xm = mm[i];
    ym = getErr(xm,&emsign);
    q = step[i];
    xn = xm + q;
    if (xn < xx0 || xn >= xx1) {	// Cannot skip over adjacent boundaries
      q = -q;
      xn = xm;
      yn = ym;
      ensign = emsign;
    } else {
      yn = getErr(xn,&ensign);
      if (yn < ym) {  // error decreased: reverse direction, restart from xm
 	q = -q;
 	xn = xm;
 	yn = ym;
 	ensign = emsign;
      }
    }
    while(yn >= ym) {		// March until error becomes smaller.
      if (++steps > 10) break;  // at most 10 steps per extremum per call
      ym = yn;
      xm = xn;
      emsign = ensign;
      a = xm + q;
      if (a == xm || a <= xx0 || a >= xx1) break;// Must not skip over the zeros either side.
      xn = a;
      yn = getErr(xn,&ensign);
    }
    mm[i] = xm;			// Position of maximum
    yy[i] = ym;			// Value of maximum
    if (eclose > ym) eclose = ym;
    if (farther < ym) farther = ym;
    xx0 = xx1; // Walk to next zero.
  } // end of search loop
  q = (farther - eclose);	// Decrease step size if error spread increased
  if (eclose != 0.0) q /= eclose; // Relative error spread
  if (q >= spread) delta *= 0.5; // Spread is increasing; decrease step size
  spread = q;
  // New step sizes: relative difference of neighbouring extremal errors
  // (capped at 0.25), times the distance between the extrema, times delta
  for (i = 0; i < neq; i++) {
    q = yy[i+1];
    if (q != 0.0) q = yy[i] / q  - (bigfloat)1l;
    else q = 0.0625;
    if (q > (bigfloat)0.25) q = 0.25;
    q *= mm[i+1] - mm[i];
    step[i] = q * delta;
  }
  step[neq] = step[neq-1];
  for (i = 0; i < neq; i++) {	// Insert new locations for the zeros.
    xm = xx[i] - step[i];
    if (xm <= apstrt) continue;  // would leave the interval: keep the old xx[i]
    if (xm >= apend) continue;
    // Keep the new point strictly between its neighbouring extrema
    if (xm <= mm[i]) xm = (bigfloat)0.5 * (mm[i] + xx[i]);
    if (xm >= mm[i+1]) xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
    xx[i] = xm;
  }
  delete [] yy;
 }
 // Solve the equations.
 // For every solution point x = xx[i], with y = func(x), builds the row
 //   [ 1, x, ..., x^n, -y, -y x, ..., -y x^(d-1) ]   and right hand side  y x^d
 // and solves the neq x neq system for the coefficient vector param with simq().
 // Terminates the program with a failure status if simq() reports an error.
 void AlgRemez::equations(void) {
  bigfloat x, y, z;
  int i, j, ip;
  bigfloat *aa;

  bigfloat *AA = new bigfloat[(neq)*(neq)];
  bigfloat *BB = new bigfloat[neq];

  for (i = 0; i < neq; i++) {	// set up the equations for solution by simq()
    ip = neq * i;		// offset to 1st element of this row of matrix
    x = xx[i];			// the guess for this row
    y = func(x);		// right-hand-side vector

    // Numerator columns: 1, x, ..., x^n
    z = (bigfloat)1l;
    aa = AA+ip;
    for (j = 0; j <= n; j++) {
      *aa++ = z;
      z *= x;
    }

    // Denominator columns: -y, -y x, ..., -y x^(d-1)
    z = (bigfloat)1l;
    for (j = 0; j < d; j++) {
      *aa++ = -y * z;
      z *= x;
    }
    BB[i] = y * z;		// Right hand side vector
  }

  // Solve the simultaneous linear equations.
  if (simq(AA, BB, param, neq)) {
    std::cout<<"simq failed\n";
    delete [] AA;
    delete [] BB;
    // This is an error path: report it through a non-zero exit status
    exit(EXIT_FAILURE);
  }

  delete [] AA;
  delete [] BB;
 }
 // Evaluate the rational form P(x)/Q(x) at x using the coefficients held in
 // the solution vector param: param[0..n] for the numerator and
 // param[n+1..n+d] for the denominator, whose leading coefficient is 1.
 bigfloat AlgRemez::approx(const bigfloat x) {
  // Numerator by Horner's rule, from the highest order coefficient down
  bigfloat numer = param[n];
  for (int k = n-1; k >= 0; k--) {
    numer = x * numer  +  param[k];
  }

  // Denominator by Horner's rule; highest degree coefficient = 1.0
  bigfloat denom = x + param[n+d];
  for (int k = n+d-1; k > n; k--) {
    denom = x * denom  +  param[k];
  }

  return numer/denom;
 }
 // Compute size and sign of the approximation error at x.
 // The error is approx(x) - func(x), divided by func(x) when that is non-zero.
 // Returns its magnitude and stores -1 or +1 in *sign (+1 for a zero error).
 bigfloat AlgRemez::getErr(bigfloat x, int *sign) {
  bigfloat target = func(x);
  bigfloat err = approx(x) - target;
  if (target != 0) err /= target;

  const bool negative = (err < (bigfloat)0.0);
  *sign = negative ? -1 : 1;
  if (negative) err = -err;
  return err;
 }
 // Calculate function required for the approximation:
 //   x^(power_num/power_den),
 // multiplied by exp( sum_j a[j] * x^a_power[j] ) when a_length > 0.
 bigfloat AlgRemez::func(const bigfloat x) {
  bigfloat exponent = (bigfloat)power_num / (bigfloat)power_den;

  // x == 1 is special-cased to return exactly 1 without calling pow_bf
  bigfloat power;
  if (x == (bigfloat)1.0) power = (bigfloat)1.0;
  else power = pow_bf(x,exponent);

  // No exponential terms: plain power
  if (a_length <= 0) return power;

  bigfloat sum = 0l;
  for (int j=0; j<a_length; j++) {
    sum += a[j]*pow_bf(x,a_power[j]);
  }
  return power * exp_bf(sum);
 }
 // Solve the system AX=B by Gaussian elimination with scaled partial pivoting.
 //   A : n x n matrix stored row by row; overwritten with the elimination
 //       multipliers (below the pivots) and the reduced rows
 //   B : right hand side (read only)
 //   X : solution on return; used as scratch for the row scale factors before
 //   n : dimension of the system
 // Rows are never moved; the pivot order is tracked in the index vector IPS.
 // Returns 0 on success, 1 if a row of A is entirely zero, 2 if no non-zero
 // pivot exists in some column, 3 if the last pivot is zero.
 int AlgRemez::simq(bigfloat A[], bigfloat B[], bigfloat X[], int n) {

  int i, j, ij, ip, ipj, ipk, ipn;
  int idxpiv, iback;
  int k, kp, kp1, kpk, kpn;
  int nip, nkp, nm1;
  bigfloat em, q, rownrm, big, size, pivot, sum;
  bigfloat *aa;

  // simq() work vector: one row index per equation. new[] counts elements,
  // not bytes, so no sizeof factor is needed; it is sized by this function's
  // own dimension n.
  int *IPS = new int[n];

  nm1 = n - 1;
  // Initialize IPS and X
  
  ij = 0;
  for (i = 0; i < n; i++) {
    IPS[i] = i;
    rownrm = 0.0;
    for(j = 0; j < n; j++) {
      q = abs_bf(A[ij]);
      if(rownrm < q) rownrm = q;
      ++ij;
    }
    if (rownrm == (bigfloat)0l) {
      std::cout<<"simq rownrm=0\n";
      delete [] IPS;
      return(1);
    }
    X[i] = (bigfloat)1.0 / rownrm;   // scale factor: 1 / (largest |entry| of row i)
  }
  
  // Forward elimination
  for (k = 0; k < nm1; k++) {
    // Choose the row with the largest scaled entry in column k as pivot
    big = 0.0;
    idxpiv = 0;
    
    for (i = k; i < n; i++) {
      ip = IPS[i];
      ipk = n*ip + k;
      size = abs_bf(A[ipk]) * X[ip];
      if (size > big) {
        big = size;
        idxpiv = i;
      }
    }
    
    if (big == (bigfloat)0l) {
      std::cout<<"simq big=0\n";
      delete [] IPS;
      return(2);
    }
    if (idxpiv != k) {
      // Swap the index entries rather than the rows themselves
      j = IPS[k];
      IPS[k] = IPS[idxpiv];
      IPS[idxpiv] = j;
    }
    kp = IPS[k];
    kpk = n*kp + k;
    pivot = A[kpk];
    kp1 = k+1;
    for (i = kp1; i < n; i++) {
      ip = IPS[i];
      ipk = n*ip + k;
      em = -A[ipk] / pivot;
      A[ipk] = -em;          // keep the multiplier for the substitution on B below
      nip = n*ip;
      nkp = n*kp;
      aa = A+nkp+kp1;
      for (j = kp1; j < n; j++) {
        ipj = nip + j;
        A[ipj] = A[ipj] + em * *aa++;
      }
    }
  }
  kpn = n * IPS[n-1] + n - 1;	// last element of IPS[n] th row
  if (A[kpn] == (bigfloat)0l) {
    std::cout<<"simq A[kpn]=0\n";
    delete [] IPS;
    return(3);
  }

  // Forward substitution: apply the stored multipliers to B
  ip = IPS[0];
  X[0] = B[ip];
  for (i = 1; i < n; i++) {
    ip = IPS[i];
    ipj = n * ip;
    sum = 0.0;
    for (j = 0; j < i; j++) {
      sum += A[ipj] * X[j];
      ++ipj;
    }
    X[i] = B[ip] - sum;
  }
  
  // Back substitution
  ipn = n * IPS[n-1] + n - 1;
  X[n-1] = X[n-1] / A[ipn];
  
  for (iback = 1; iback < n; iback++) {
    //i goes (n-1),...,1
    i = nm1 - iback;
    ip = IPS[i];
    nip = n*ip;
    sum = 0.0;
    aa = A+nip+i+1;
    for (j= i + 1; j < n; j++) 
      sum += *aa++ * X[j];
    X[i] = (X[i] - sum) / A[nip+i];
  }
  
  delete [] IPS;
  return(0);
 }
 // Calculate the roots of the approximation.
 // Finds the n numerator roots (into roots[]) and the d denominator roots
 // (into poles[]) one at a time with rtnewt(), dividing each found root out of
 // the working polynomial before looking for the next. Also sets norm to the
 // leading numerator coefficient param[n].
 // Returns 1 on success and 0 as soon as rtnewt() yields 0.0 for a root, which
 // is the value it returns on non-convergence.
 int AlgRemez::root() {

  long i,j;
  bigfloat upper=1, lower=-100000;   // rtnewt starts from the midpoint of these
  bigfloat tol = 1e-20;

  bigfloat *poly = new bigfloat[neq+1];

  // First find the numerator roots
  for (i=0; i<=n; i++) poly[i] = param[i];

  for (i=n-1; i>=0; i--) {
    roots[i] = rtnewt(poly,i+1,lower,upper,tol);
    if (roots[i] == 0.0) {
      std::cout<<"Failure to converge on root "<<i+1<<"/"<<n<<"\n";
      delete [] poly;   // do not leak the working polynomial on failure
      return 0;
    }
    // Deflate: divide the found root out of the working polynomial
    poly[0] = -poly[0]/roots[i];
    for (j=1; j<=i; j++) poly[j] = (poly[j-1] - poly[j])/roots[i];
  }
  
 // Now find the denominator roots
  poly[d] = 1l;   // the denominator's leading coefficient is 1
  for (i=0; i<d; i++) poly[i] = param[n+1+i];

  for (i=d-1; i>=0; i--) {
    poles[i]=rtnewt(poly,i+1,lower,upper,tol);
    if (poles[i] == 0.0) {
      std::cout<<"Failure to converge on pole "<<i+1<<"/"<<d<<"\n";
      delete [] poly;   // do not leak the working polynomial on failure
      return 0;
    }
    // Deflate: divide the found pole out of the working polynomial
    poly[0] = -poly[0]/poles[i];
    for (j=1; j<=i; j++) poly[j] = (poly[j-1] - poly[j])/poles[i];
  }

  norm = param[n];

  delete [] poly;

  return 1;
 }
 // Evaluate the polynomial with coefficients poly[0..size] (poly[k] multiplies
 // x^k) at x, using Horner's rule
 bigfloat AlgRemez::polyEval(bigfloat x, bigfloat *poly, long size) {
  bigfloat value = poly[size];
  long k = size;
  while (k-- > 0) {
    value = value*x + poly[k];
  }
  return value;
 }
 // Evaluate the derivative of the polynomial with coefficients poly[0..size]
 // at x: Horner's rule on the coefficients k*poly[k], k = size..1
 bigfloat AlgRemez::polyDiff(bigfloat x, bigfloat *poly, long size) {
  bigfloat slope = (bigfloat)size*poly[size];
  for (int k=size-1; k>=1; --k) {
    slope = slope*x + (bigfloat)k*poly[k];
  }
  return slope;
 }
 // Newton's method to calculate a root of the degree-i polynomial poly.
 // Starts from the midpoint of [x1,x2] and iterates at most JMAX times,
 // returning as soon as the size of the update drops below xacc.
 // Returns 0.0 if the iteration limit is reached.
 bigfloat AlgRemez::rtnewt(bigfloat *poly, long i, bigfloat x1, 
                           bigfloat x2, bigfloat xacc) {
  bigfloat estimate = (bigfloat)0.5*(x1+x2);

  for (int pass=1; pass<=JMAX; pass++) {
    bigfloat f  = polyEval(estimate, poly, i);
    bigfloat df = polyDiff(estimate, poly, i);
    bigfloat correction = f/df;
    estimate -= correction;
    if (abs_bf(correction) < xacc) return estimate;
  }

  std::cout<<"Maximum number of iterations exceeded in rtnewt\n";
  return 0.0;
 }
 // Evaluate the partial fraction expansion of the rational function
 //   norm * prod_j (x - res[j]) / prod_j (x - poles[j])
 // whose roots are passed in res and whose poles are passed in poles.
 // The result overwrites the input arrays: res receives the residues
 // (already multiplied by norm) and poles receives the negated poles, i.e. the
 // shifts in the "x + shift" convention, sorted from smallest to largest.
 // Note the parameters `poles` and `norm` hide the members of the same name.
 // All loops run to n, so n == d is assumed (callers check this).
 void AlgRemez::pfe(bigfloat *res, bigfloat *poles, bigfloat norm) {
  int i,j,small;
  bigfloat temp;
  bigfloat *numerator = new bigfloat[n];
  bigfloat *denominator = new bigfloat[d];
  // Construct the polynomials explicitly 
  for (i=1; i<n; i++) {
    numerator[i] = 0l;
    denominator[i] = 0l;
  }
  numerator[0]=1l;
  denominator[0]=1l;
  // Multiply in one linear factor (x - res[j]) resp. (x - poles[j]) per pass.
  // Only the n lowest coefficients are stored; the leading coefficient of
  // x^n is 1 for both products and is left implicit.
  for (j=0; j<n; j++) {
    for (i=n-1; i>=0; i--) {
      numerator[i] *= -res[j];
      denominator[i] *= -poles[j];
      if (i>0) {
 	numerator[i] += numerator[i-1];
 	denominator[i] += denominator[i-1];
      }
    }
  }
  // Convert to proper fraction form.
  // Fraction is now in the form 1 + n/d, where O(n)+1=O(d)
  for (i=0; i<n; i++) numerator[i] -= denominator[i];
  // Find the residues of the partial fraction expansion and absorb the
  // coefficients.
  for (i=0; i<n; i++) {
    res[i] = 0l;
    // Horner evaluation of the remainder numerator at poles[i]
    for (j=n-1; j>=0; j--) {
      res[i] = poles[i]*res[i]+numerator[j];
    }
    // Divide by the product of (poles[i] - poles[j]) over all other poles
    for (j=n-1; j>=0; j--) {
      if (i!=j) res[i] /= poles[i]-poles[j];
    }
    res[i] *= norm;
  }  
  // res now holds the residues
  j = 0;
  // Switch to the convention in which the shifts are added to x
  for (i=0; i<n; i++) poles[i] = -poles[i];
  // Move the ordering of the poles from smallest to largest
  // (selection sort; the residues are permuted along with their poles)
  for (j=0; j<n; j++) {
    small = j;
    for (i=j+1; i<n; i++) {
      if (poles[i] < poles[small]) small = i;
    }
    if (small != j) {
      temp = poles[small];
      poles[small] = poles[j];
      poles[j] = temp;
      temp = res[small];
      res[small] = res[j];
      res[j] = temp;
    }
  }
  delete [] numerator;
  delete [] denominator;
 }
 double AlgRemez::evaluateApprox(double x) {
  return (double)approx((bigfloat)x);
 }
 // Evaluate the reciprocal of the rational approximation at x
 double AlgRemez::evaluateInverseApprox(double x) {
  const double value = (double)approx((bigfloat)x);
  return 1.0/value;
 }
 double AlgRemez::evaluateFunc(double x) {
  return (double)func((bigfloat)x);
 }
 // Evaluate the reciprocal of the function being approximated at x
 double AlgRemez::evaluateInverseFunc(double x) {
  const double value = (double)func((bigfloat)x);
  return 1.0/value;
 }
 // Write comma separated rows "x,approx(x),func(x),approx(x)-func(x)" for
 // geometrically spaced x in [apstrt,apend): x starts at the lower bound and
 // grows by 5% per row. Nothing is written when the lower bound is <= 0.
 void AlgRemez::csv(std::ostream & os)
 {
  double lambda_low = apstrt;
  double lambda_high= apend;
  // x *= 1.05 only moves towards lambda_high for x > 0: x == 0 stays at 0 and
  // a negative x runs away from it, so the loop below would never terminate.
  if ( lambda_low <= 0.0 ) return;
  for (double x=lambda_low; x<lambda_high; x*=1.05) {
    double f = evaluateFunc(x);
    double r = evaluateApprox(x);
    os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
  }
  return;
 }
@@ -0,0 +1,184 @@
 /*
  Mike Clark - 25th May 2005
  alg_remez.h
  AlgRemez is an implementation of the Remez algorithm, which in this
  case is used for generating the optimal nth root rational
  approximation.
  Note this class requires the gnu multiprecision (GNU MP) library.
 */
 #ifndef INCLUDED_ALG_REMEZ_H
 #define INCLUDED_ALG_REMEZ_H
 #include <stddef.h>
 #include <Grid/GridStd.h>
 #ifdef HAVE_LIBGMP
 #include "bigfloat.h"
 #else
 #include "bigfloat_double.h"
 #endif
 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
 /*
 *Usage examples
  AlgRemez remez(lambda_low,lambda_high,precision);
  error = remez.generateApprox(n,d,y,z);
  remez.getPFE(res,pole,&norm);
  remez.getIPFE(res,pole,&norm);
  remez.csv(ostream &os);
 */
 // Remez-algorithm driver producing rational approximations to
 // x^(power_num/power_den) over [apstrt, apend], with accessors for the
 // partial fraction expansion of the result.
 // NOTE(review): the class holds raw bigfloat*/int* buffers and declares no
 // copy constructor or assignment operator -- confirm instances are never
 // copied.
 class AlgRemez
 {
 private:
  // NOTE(review): not referenced in the visible part of the implementation;
  // presumably a class-name string for diagnostics -- confirm.
  char *cname;
  // The approximation parameters
  bigfloat *param, *roots, *poles;
  bigfloat norm;
  // The numerator and denominator degree (n=d)
  int n, d;
  // The bounds of the approximation
  bigfloat apstrt, apwidt, apend;
  // the numerator and denominator of the power we are approximating
  unsigned long power_num; 
  unsigned long power_den;
  // Flag to determine whether the arrays have been allocated
  int alloc;
  // Flag to determine whether the roots have been found
  int foundRoots;
  // Variables used to calculate the approximation
  int nd1, iter;
  bigfloat *xx, *mm, *step;
  bigfloat delta, spread, tolerance;
  // The exponential summation coefficients
  bigfloat *a;
  int *a_power;
  int a_length;
  // The number of equations we must solve at each iteration (n+d+1)
  int neq;
  // The precision of the GNU MP library
  long prec;
  // Initial values of maximal and minimal errors
  void initialGuess();
  // Solve the equations
  void equations();
  // Search for error maxima and minima
  void search(bigfloat *step); 
  // Initialise step sizes
  void stpini(bigfloat *step);
  // Calculate the roots of the approximation
  int root();
  // Evaluate the polynomial
  bigfloat polyEval(bigfloat x, bigfloat *poly, long size);
  //complex_bf polyEval(complex_bf x, complex_bf *poly, long size);
  // Evaluate the differential of the polynomial
  bigfloat polyDiff(bigfloat x, bigfloat *poly, long size);
  //complex_bf polyDiff(complex_bf x, complex_bf *poly, long size);
  // Newton's method to calculate roots
  bigfloat rtnewt(bigfloat *poly, long i, bigfloat x1, bigfloat x2, bigfloat xacc);
  //complex_bf rtnewt(complex_bf *poly, long i, bigfloat x1, bigfloat x2, bigfloat xacc);
  // Evaluate the partial fraction expansion of the rational function
  // with res roots and poles poles.  The input arrays are overwritten
  // with the result: on return res holds the residues and poles the
  // (negated, ascending-sorted) poles.
  void pfe(bigfloat *res, bigfloat* poles, bigfloat norm);
  // Calculate function required for the approximation
  bigfloat func(bigfloat x);
  // Compute size and sign of the approximation error at x
  bigfloat getErr(bigfloat x, int *sign);
  // Solve the system AX=B
  int simq(bigfloat *A, bigfloat *B, bigfloat *X, int n);
  // Free memory and reallocate as necessary
  void allocate(int num_degree, int den_degree);
  // Evaluate the rational form P(x)/Q(x) using coefficients from the
  // solution vector param
  bigfloat approx(bigfloat x);
 public:
  // Constructor
  AlgRemez(double lower, double upper, long prec);
  // Destructor
  virtual ~AlgRemez();
  // Return the degree of the approximation; asserts that the numerator
  // and denominator degrees agree.
  int getDegree(void){ 
    assert(n==d);
    return n;
  }
  // Reset the bounds of the approximation
  void setBounds(double lower, double upper);
  // Retrieve the current bounds of the approximation, converted to double
  void getBounds(double &lower, double &upper) { 
    lower=(double)apstrt;
    upper=(double)apend;
  }
  // Generate the rational approximation x^(pnum/pden)
  double generateApprox(int num_degree, int den_degree, 
 			unsigned long power_num, unsigned long power_den, 
 			int a_len, double* a_param, int* a_pow);
  double generateApprox(int num_degree, int den_degree, 
 			unsigned long power_num, unsigned long power_den);
  double generateApprox(int degree, unsigned long power_num, 
 			unsigned long power_den);
  // Return the partial fraction expansion of the approximation x^(pnum/pden)
  int getPFE(double *res, double *pole, double *norm);
  // Return the partial fraction expansion of the approximation x^(-pnum/pden)
  int getIPFE(double *res, double *pole, double *norm);
  // Evaluate the rational form P(x)/Q(x) using coefficients from the
  // solution vector param
  double evaluateApprox(double x);
  // Evaluate the rational form Q(x)/P(x) using coefficients from the
  // solution vector param
  double evaluateInverseApprox(double x);
  // Calculate function required for the approximation
  double evaluateFunc(double x);
  // Calculate inverse function required for the approximation
  double evaluateInverseFunc(double x);
  // Dump csv of function, approx and error
  void csv(std::ostream &os);
 };
 #endif  // Include guard
@@ -0,0 +1,727 @@
 /* -*- Mode: C; comment-column: 22; fill-column: 79; compile-command: "gcc -o zolotarev zolotarev.c -ansi -pedantic -lm -DTEST"; -*- */
 #define VERSION Source Time-stamp: <2015-05-18 16:32:08 neo>
 /* These C routines evaluate the optimal rational approximation to the signum
 * function for epsilon < |x| < 1 using Zolotarev's theorem.
 *
 * To obtain reliable results for high degree approximations (large n) it is
 * necessary to compute using sufficiently high precision arithmetic. To this
 * end the code has been parameterised to work with the preprocessor names
 * INTERNAL_PRECISION and PRECISION set to float, double, or long double as
 * appropriate. INTERNAL_PRECISION is used in computing the Zolotarev
 * coefficients, which are converted to PRECISION before being returned to the
 * caller. Presumably even higher precision could be obtained using GMP or
 * similar package, but bear in mind that rounding errors might also be
 * significant in evaluating the resulting polynomial. The convergence criteria
 * have been written in a precision-independent form. */
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #ifndef INTERNAL_PRECISION
 #define INTERNAL_PRECISION double
 #endif
 #include "Zolotarev.h"
 #define ZOLOTAREV_INTERNAL
 #undef ZOLOTAREV_DATA
 #define ZOLOTAREV_DATA izd
 #undef ZPRECISION
 #define ZPRECISION INTERNAL_PRECISION
 #include "Zolotarev.h"
 #undef ZOLOTAREV_INTERNAL
 /* The ANSI standard appears not to know what pi is */
 #ifndef M_PI
 #define M_PI ((INTERNAL_PRECISION) 3.141592653589793238462643383279502884197\
 169399375105820974944592307816406286208998628034825342117068)
 #endif
 #define ZERO ((INTERNAL_PRECISION) 0)
 #define ONE ((INTERNAL_PRECISION) 1)
 #define TWO ((INTERNAL_PRECISION) 2)
 #define THREE ((INTERNAL_PRECISION) 3)
 #define FOUR ((INTERNAL_PRECISION) 4)
 #define HALF (ONE/TWO)
 /* The following obscenity seems to be the simplest (?) way to coerce the C
 * preprocessor to convert the value of a preprocessor token into a string. */
 #define PP2(x) #x
 #define PP1(a,b,c) a ## b(c)
 #define STRINGIFY(name) PP1(PP,2,name)
 /* Compute the partial fraction expansion coefficients (alpha) from the
 * factored form */
 namespace Grid {
 namespace Approx {
 /* Derive the partial fraction coefficients from the factored form stored in
  * z (overall factor A, numerator zeros a, poles ap).  A freshly allocated
  * array of da = dd + 1 + type coefficients is stored in z -> alpha and its
  * length in z -> da; the last entry is the coefficient of the term linear in
  * x. */
 static void construct_partfrac(izd *z) {
  const int dn = z -> dn;
  const int dd = z -> dd;
  const int type = z -> type;
  const int da = dd + 1 + type;
  const INTERNAL_PRECISION A = z -> A;
  INTERNAL_PRECISION *a = z -> a;
  INTERNAL_PRECISION *ap = z -> ap;
  INTERNAL_PRECISION *coef;
  int pole, idx;
  coef = (INTERNAL_PRECISION*) malloc(da * sizeof(INTERNAL_PRECISION));
  /* Residue at each explicit pole ap[pole] */
  for (pole = 0; pole < dd; pole++) {
    coef[pole] = A;
    for (idx = 0; idx < dd; idx++) {
      coef[pole] *= (idx < dn ? ap[pole] - a[idx] : ONE)
        / (idx == pole ? ONE : ap[pole] - ap[idx]);
    }
  }
  /* Type 1 has an additional implicit pole at zero */
  if (type == 1) {
    coef[dd] = A * (dn > dd ? - a[dd] : ONE);
    for (idx = 0; idx < dd; idx++) {
      coef[dd] *= a[idx] / ap[idx];
      coef[idx] *= (dn > dd ? ap[idx] - a[dd] : ONE) / ap[idx];
    }
  }
  /* Coefficient of the polynomial part */
  coef[da-1] = (dn == da - 1) ? A : ZERO;
  z -> alpha = coef;
  z -> da = da;
 }
 /* Convert factored polynomial into dense polynomial. The input is the overall
 * factor A and the roots a[i], such that p = A product(x - a[i], i = 1..d) */
 /* Expand A * product(x - a[i], i = 0 .. d-1) into dense coefficients.  The
  * returned malloc'ed array has room for d + 2 entries; entry k holds the
  * coefficient of x^k for k = 0 .. d.  The caller owns the array. */
 static INTERNAL_PRECISION *poly_factored_to_dense(INTERNAL_PRECISION A, 
 						  INTERNAL_PRECISION *a,
 						  int d) {
  INTERNAL_PRECISION *dense = (INTERNAL_PRECISION *)
    malloc((d + 2) * sizeof(INTERNAL_PRECISION));
  int root, k;
  dense[0] = A;
  /* Multiply the running polynomial by (x - a[root]), one root at a time */
  for (root = 0; root < d; root++) {
    const INTERNAL_PRECISION r = a[root];
    dense[root+1] = dense[root];
    k = root;
    while (k > 0) {
      dense[k] = dense[k-1] - r*dense[k];
      k--;
    }
    dense[0] *= - r;
  }
  return dense;
 }
 /* Convert a rational function of the form R0(x) = x p(x^2)/q(x^2) (type 0) or
 * R1(x) = p(x^2)/[x q(x^2)] (type 1) into its continued fraction
 * representation. We assume that 0 <= deg(q) - deg(p) <= 1 for type 0 and 0 <=
 * deg(p) - deg(q) <= 1 for type 1. On input p and q are in factored form, and
 * deg(q) = dq, deg(p) = dp.  The output is the continued fraction coefficients
 * beta, where R(x) = beta[db-1] x + 1/(beta[db-2] x + 1/(...)), the innermost
 * coefficient being beta[0] and db the number of coefficients.
 *
 * The method used is as follows. There are four cases to consider:
 *
 * 0.i.  Type 0, deg p = deg q
 *
 * 0.ii. Type 0, deg p = deg q - 1
 *
 * 1.i.  Type 1, deg p = deg q
 *
 * 1.ii. Type 1, deg p = deg q + 1
 *
 * and these are connected by two transformations:
 *
 * A. To obtain a continued fraction expansion of type 1 we use a single-step
 * polynomial division we find beta and r(x) such that p(x) = beta x q(x) +
 * r(x), with deg(r) = deg(q). This implies that p(x^2) = beta x^2 q(x^2) +
 * r(x^2), and thus R1(x) = x beta + r(x^2)/(x q(x^2)) = x beta + 1/R0(x)
 * with R0(x) = x q(x^2)/r(x^2).
 *
 * B. A continued fraction expansion of type 0 is obtained in a similar, but
 * not identical, manner. We use the polynomial division algorithm to compute
 * the quotient beta and the remainder r that satisfy p(x) = beta q(x) + r(x)
 * with deg(r) = deg(q) - 1. We thus have x p(x^2) = x beta q(x^2) + x r(x^2),
 * so R0(x) = x beta + x r(x^2)/q(x^2) = x beta + 1/R1(x) with R1(x) = q(x^2) /
 * (x r(x^2)).
 *
 * Note that the deg(r) must be exactly deg(q) for (A) and deg(q) - 1 for (B)
 * because p and q have disjoint roots all of multiplicity 1. This means that
 * the division algorithm requires only a single polynomial subtraction step.
 *
 * The transformations between the cases form the following finite state
 * automaton:
 *
 * +------+            +------+            +------+            +------+
 * |      |            |      | ---(A)---> |      |            |      |
 * | 0.ii | ---(B)---> | 1.ii |            | 0.i  | <---(A)--- | 1.i  |
 * |      |            |      | <---(B)--- |      |            |      |
 * +------+            +------+            +------+            +------+
 */
 static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *,
 				      INTERNAL_PRECISION *,
 				      INTERNAL_PRECISION *,
 				      INTERNAL_PRECISION *, int, int);
 static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *,
 				      INTERNAL_PRECISION *,
 				      INTERNAL_PRECISION *,
 				      INTERNAL_PRECISION *, int, int);
 /* Fill z -> beta with the continued fraction coefficients of the factored
  * form held in z.  z -> db is set to 2 * dd + 1 + type and z -> beta to a
  * newly allocated array of that length; the dense polynomials and the
  * remainder buffer used on the way are released before returning. */
 static void construct_contfrac(izd *z){
  const int dp = z -> dn;
  const int dq = z -> dd;
  const int type = z -> type;
  INTERNAL_PRECISION *num, *den, *scratch;
  z -> db = 2 * dq + 1 + type;
  z -> beta = (INTERNAL_PRECISION *)
    malloc(z -> db * sizeof(INTERNAL_PRECISION));
  num = poly_factored_to_dense(z -> A, z -> a, dp);
  den = poly_factored_to_dense(ONE, z -> ap, dq);
  scratch = (INTERNAL_PRECISION *) malloc((MAX(dp,dq) + 1) *
 					  sizeof(INTERNAL_PRECISION));
  if (type != 0) {
    (void) contfrac_A(z -> beta, num, den, scratch, dp, dq);
  } else {
    (void) contfrac_B(z -> beta, num, den, scratch, dp, dq);
  }
  free(scratch);
  free(den);
  free(num);
 }
 /* Transformation (A): one division step p(x) = quot x q(x) + r(x) on dense
  * polynomials p (degree dp) and q (degree dq).  The remainder is written to
  * r, the rest of the expansion is produced by contfrac_B on (q, r) with p
  * reused as its remainder buffer, and quot is stored in the slot of beta
  * following the ones filled by that call.  Returns a pointer to the next
  * free slot of beta. */
 static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *beta,
 				      INTERNAL_PRECISION *p,
 				      INTERNAL_PRECISION *q,
 				      INTERNAL_PRECISION *r, int dp, int dq) {
  INTERNAL_PRECISION quot, *rb;
  int j;
  /* p(x) = x beta q(x) + r(x); dp = dq or dp = dq + 1 */
  /* Equal degrees: nothing to divide out, so the coefficient is zero */
  quot = dp == dq ? ZERO : p[dp] / q[dq];
  /* x q(x) has no constant term, so r[0] is p[0] unchanged */
  r[0] = p[0];
  for (j = 1; j <= dp; j++) r[j] = p[j] - quot * q[j-1];
 #ifdef DEBUG
  printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
 	 __FUNCTION__, dp, dq, (float) quot);
  for (j = 0; j <= dq + 1; j++)
    printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
 	   j, (float) (j > dp ? ZERO : p[j]),
 	   j, (float) (j == 0 ? ZERO : q[j-1]),
 	   j, (float) (j == dp ? ZERO : r[j]));
 #endif /* DEBUG */
  /* Recurse first so that deeper coefficients occupy lower indices of beta */
  *(rb = contfrac_B(beta, q, r, p, dq, dq)) = quot;
  return rb + 1;
 }
 /* Transformation (B): one division step p(x) = quot q(x) + r(x) on dense
  * polynomials p (degree dp) and q (degree dq).  The remainder is written to
  * r; while dq > 0 the rest of the expansion is produced by contfrac_A on
  * (q, r) with p reused as its remainder buffer.  When dq == 0 the recursion
  * ends and quot is stored at beta itself.  Returns a pointer to the next
  * free slot of beta. */
 static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *beta,
 				      INTERNAL_PRECISION *p,
 				      INTERNAL_PRECISION *q,
 				      INTERNAL_PRECISION *r, int dp, int dq) {
  INTERNAL_PRECISION quot, *rb;
  int j;
  /* p(x) = beta q(x) + r(x); dp = dq or dp = dq - 1 */
  /* deg p < deg q: nothing to divide out, so the coefficient is zero */
  quot = dp == dq ? p[dp] / q[dq] : ZERO;
  /* Only the dq lowest coefficients of the remainder are stored */
  for (j = 0; j < dq; j++) r[j] = p[j] - quot * q[j];
 #ifdef DEBUG
  printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
 	 __FUNCTION__, dp, dq, (float) quot);
  for (j = 0; j <= dq; j++)
    printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
 	   j, (float) (j > dp ? ZERO : p[j]),
 	   j, (float) q[j],
 	   j, (float) (j == dq ? ZERO : r[j]));
 #endif /* DEBUG */
  /* Recurse first so that deeper coefficients occupy lower indices of beta */
  *(rb = dq > 0 ? contfrac_A(beta, q, r, p, dq, dq-1) : beta) = quot;
  return rb + 1;
 }
 /* The global variable U is used to hold the argument u throughout the AGM
 * recursion. The global variables F and K are set in the innermost
 * instantiation of the recursive function AGM to the values of the elliptic
 * integrals F(u,k) and K(k) respectively. They must be made thread local to
 * make this code thread-safe in a multithreaded environment. */
 static INTERNAL_PRECISION U, F, K;	/* THREAD LOCAL */
 /* Recursive implementation of Gauss' arithmetico-geometric mean, which is the
 * kernel of the method used to compute the Jacobian elliptic functions
 * sn(u,k), cn(u,k), and dn(u,k) with parameter k (where 0 < k < 1), as well
 * as the elliptic integral F(s,k) satisfying F(sn(u,k)) = u and the complete
 * elliptic integral K(k).
 *
 * The algorithm used is a recursive implementation of the Gauss (Landen)
 * transformation.
 *
 * The function returns the value of sn(u,k'), where k' is the dual parameter,
 * and also sets the values of the global variables F and K.  The latter is
 * used to determine the sign of cn(u,k').
 *
 * The algorithm is deemed to have converged when b ceases to increase. This
 * works whatever INTERNAL_PRECISION is specified. */
 /* One level of the AGM recursion on the pair (a, b) with transformed
  * argument s.  At the innermost level the file-scope variables F and K are
  * set and sin(U * a) is returned; outer levels map the inner result back
  * through the inverse transformation.  Relies on the file-scope variable U
  * and on static state, so it is not reentrant. */
 static INTERNAL_PRECISION AGM(INTERNAL_PRECISION a,
 			      INTERNAL_PRECISION b,
 			      INTERNAL_PRECISION s) {
  /* Value of b at the previous recursion level; -ONE means "no previous
   * level", so the first call of a fresh recursion always descends. */
  static INTERNAL_PRECISION pb = -ONE;
  INTERNAL_PRECISION c, d, xi;
  if (b <= pb) {
    /* b has stopped increasing: converged.  Reset pb for the next use. */
    pb = -ONE;
    F = asin(s) / a;		/* Here, a is the AGM */
    K = M_PI / (TWO * a);
    return sin(U * a);
  }
  pb = b;
  c = a - b;
  d = a + b;
  /* When c*c is below working precision the second form would divide by a
   * vanishing c*s, so the alternative expression HALF*s*d/a is used. */
  xi = AGM(HALF*d, sqrt(a*b), ONE + c*c == ONE ?
 	   HALF*s*d/a : (a - sqrt(a*a - s*s*c*d))/(c*s));
  return 2*a*xi / (d + c*xi*xi);
 }
 /* Computes sn(u,k), cn(u,k), dn(u,k), F(u,k), and K(k). It is essentially a
 * wrapper for the routine AGM. The sign of cn(u,k) is defined to be -1 if
 * K(k) < u < 3*K(k) and +1 otherwise, and thus sign is computed by some quite
 * unnecessarily obfuscated bit manipulations. */
 /* Wrapper around AGM: stores sn, cn and dn for argument u and parameter k
  * through the output pointers, together with the values AGM left in the
  * file-scope variables F and K.  cn takes a negative sign when the integer
  * part of |u| / K is 1 or 2 modulo 4. */
 static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 		     INTERNAL_PRECISION* sn, INTERNAL_PRECISION* cn,
 		     INTERNAL_PRECISION* dn, INTERNAL_PRECISION* elF,
 		     INTERNAL_PRECISION* elK) {
  int quadrant, negative;
  U = u;
  *sn = AGM(ONE, sqrt(ONE - k*k), u);
  /* Which quarter period |u| falls in */
  quadrant = ((int) (fabs(u) / K)) % 4;
  /* Gray-code the quadrant: its low bit is set exactly for quadrants 1, 2 */
  negative = (quadrant ^ (quadrant >> 1)) & 1;
  *cn = ((INTERNAL_PRECISION) (negative ? -1 : 1))
    * sqrt(ONE - (*sn) * (*sn));
  *dn = sqrt(ONE - k*k* (*sn) * (*sn));
  *elF = F;
  *elK = K;
 }
 /* Compute the coefficients for the optimal rational approximation R(x) to
 * sgn(x) of degree n over the interval epsilon < |x| < 1 using Zolotarev's
 * formula. 
 *
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */
 /* Build the Zolotarev approximation of degree n on epsilon < |x| < 1.
  *
  * All coefficients are computed in INTERNAL_PRECISION in a temporary izd
  * and then copied, converted to PRECISION, into a newly malloc'ed
  * zolotarev_data which is returned; release it with zolotarev_free().
  *
  * NOTE(review): none of the malloc results are checked. */
 zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
    l, invlambda, xi, xisq, *tv, s, opl;
  int m, czero, ts;
  zolotarev_data *zd;
  izd *d = (izd*) malloc(sizeof(izd));
  d -> type = type;
  d -> epsilon = (INTERNAL_PRECISION) epsilon;
  d -> n = n;
  d -> dd = n / 2;
  d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
  d -> deg_denom = 2 * d -> dd;
  d -> deg_num = 2 * d -> dn + 1;
  d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
 					sizeof(INTERNAL_PRECISION));
  d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
 					 sizeof(INTERNAL_PRECISION));
  ksq = d -> epsilon * d -> epsilon;
  kp = sqrt(ONE - ksq);
  sncndnFK(ZERO, kp, &sn, &cn, &dn, &F, &Kp); /* compute Kp = K(kp) */
  z0 = TWO * Kp / (INTERNAL_PRECISION) n;
  M = ONE;
  A = ONE / d -> epsilon;
  sncndnFK(HALF * z0, kp, &sn, &cn, &dn, &F, &Kj); /* compute xi */
  xi = ONE / dn;
  xisq = xi * xi;
  invlambda = xi;
  /* Zeros a[] come from the arguments z0*(m+1), poles ap[] from z0*(m+1/2);
   * both arrays are filled from the back.  M, A and invlambda accumulate
   * products over the same values. */
  for (m = 0; m < d -> dd; m++) {
    czero = 2 * (m + 1) == n; /* n even and m = dd -1 */
    z = z0 * ((INTERNAL_PRECISION) m + ONE);
    sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
    t = cn / sn;
    c = - t*t;
    /* For czero there is no matching zero to store (dn = dd - 1) */
    if (!czero) (d -> a)[d -> dn - 1 - m] = ksq / c;
    z = z0 * ((INTERNAL_PRECISION) m + HALF);
    sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
    t = cn / sn;
    cp = - t*t;
    (d -> ap)[d -> dd - 1 - m] = ksq / cp;
    M *= (ONE - c) / (ONE - cp);
    A *= (czero ? -ksq : c) * (ONE - cp) / (cp * (ONE - c));
    invlambda *= (ONE - c*xisq) / (ONE - cp*xisq);
  }
  invlambda /= M;
  d -> A = TWO / (ONE + invlambda) * A;
  d -> Delta = (invlambda - ONE) / (invlambda + ONE);
  /* Cayley form coefficients; n of the 1 + n slots are filled below */
  d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
 					    sizeof(INTERNAL_PRECISION));
  l = ONE / invlambda;
  opl = ONE + l;
  /* Only the F output of this call is used, to fix the offset s */
  sncndnFK(sqrt( d -> type == 1
 		   ? (THREE + l) / (FOUR * opl)
 		   : (ONE + THREE*l) / (opl*opl*opl)
 	       ), sqrt(ONE - l*l), &sn, &cn, &dn, &F, &Kj);
  s = M * F;
  for (m = 0; m < d -> n; m++) {
    sncndnFK(s + TWO*Kp*m/n, kp, &sn, &cn, &dn, &F, &Kj);
    d -> gamma[m] = d -> epsilon / dn;
  }
  /* If R(x) is a Zolotarev rational approximation of degree (n,m) with maximum
   * error Delta, then (1 - Delta^2) / R(x) is also an optimal Chebyshev
   * approximation of degree (m,n) */
  if (d -> type == 1) {
    /* Invert: rescale A and exchange the roles of zeros and poles */
    d -> A = (ONE - d -> Delta * d -> Delta) / d -> A;
    tv = d -> a; d -> a = d -> ap; d -> ap = tv;
    ts = d -> dn; d -> dn = d -> dd; d -> dd = ts;
    ts = d -> deg_num; d -> deg_num = d -> deg_denom; d -> deg_denom = ts;
  }
  /* Allocate and fill d -> alpha / d -> da and d -> beta / d -> db */
  construct_partfrac(d);
  construct_contfrac(d);
  /* Converting everything to PRECISION for external use only */
  /* Each internal array is freed as soon as it has been copied */
  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
  zd -> A = (PRECISION) d -> A;
  zd -> Delta = (PRECISION) d -> Delta;
  zd -> epsilon = (PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
  zd -> dd = d -> dd;
  zd -> da = d -> da;
  zd -> db = d -> db;
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;
  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
  free(d -> a);
  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
  free(d -> ap);
  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
  free(d -> alpha);
  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
  free(d -> beta);
  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
  free(d -> gamma);
  free(d);
  return zd;
 }
 /* Release a zolotarev_data returned by zolotarev() or higham(): the five
  * coefficient arrays and then the structure itself.  Passing NULL is a
  * no-op, matching the behaviour of free(). */
 void zolotarev_free(zolotarev_data *zdata)
 {
    if (zdata == NULL) return;
    free(zdata -> a);
    free(zdata -> ap);
    free(zdata -> alpha);
    free(zdata -> beta);
    free(zdata -> gamma);
    free(zdata);
 }
 /* Build a type 0 rational approximation of degree n whose zeros and poles
  * are -tan^2 of equally spaced angles (spacing pi/n); the test driver in
  * this file selects it as "Higham".  The result is returned in the same
  * zolotarev_data layout as zolotarev() and must be released with
  * zolotarev_free().
  *
  * NOTE(review): none of the malloc results are checked. */
 zolotarev_data* higham(PRECISION epsilon, int n) {
  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
  int m, czero;
  zolotarev_data *zd;
  izd *d = (izd*) malloc(sizeof(izd));
  d -> type = 0;
  d -> epsilon = (INTERNAL_PRECISION) epsilon;
  d -> n = n;
  d -> dd = n / 2;
  d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
  d -> deg_denom = 2 * d -> dd;
  d -> deg_num = 2 * d -> dn + 1;
  d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
 					sizeof(INTERNAL_PRECISION));
  d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
 					 sizeof(INTERNAL_PRECISION));
  A = (INTERNAL_PRECISION) n;
  z0 = M_PI / A;
  /* Overall factor: n for even n, 1/n for odd n */
  A = n % 2 == 0 ? A : ONE / A;
  /* M accumulates the value of the approximation at x = epsilon */
  M = d -> epsilon * A;
  epssq = d -> epsilon * d -> epsilon;
  for (m = 0; m < d -> dd; m++) {
    czero = 2 * (m + 1) == n; /* n even and m = dd - 1*/
    /* For czero there is no matching zero to store (dn = dd - 1) */
    if (!czero) {
      z = z0 * ((INTERNAL_PRECISION) m + ONE);
      t = tan(z);
      c = - t*t;
      (d -> a)[d -> dn - 1 - m] = c;
      M *= epssq - c;
    }
    z = z0 * ((INTERNAL_PRECISION) m + HALF);
    t = tan(z);
    cp = - t*t;
    (d -> ap)[d -> dd - 1 - m] = cp;
    M /= epssq - cp;
  }
  /* Cayley form coefficients are all set to one here; n of the 1 + n slots
   * are filled */
  d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
 					    sizeof(INTERNAL_PRECISION));
  for (m = 0; m < d -> n; m++) d -> gamma[m] = ONE;
  d -> A = A;
  d -> Delta = ONE - M;
  /* Allocate and fill d -> alpha / d -> da and d -> beta / d -> db */
  construct_partfrac(d);
  construct_contfrac(d);
  /* Converting everything to PRECISION for external use only */
  /* Each internal array is freed as soon as it has been copied */
  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
  zd -> A = (PRECISION) d -> A;
  zd -> Delta = (PRECISION) d -> Delta;
  zd -> epsilon = (PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
  zd -> dd = d -> dd;
  zd -> da = d -> da;
  zd -> db = d -> db;
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;
  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
  free(d -> a);
  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
  free(d -> ap);
  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
  free(d -> alpha);
  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
  free(d -> beta);
  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
  free(d -> gamma);
  free(d);
  return zd;
 }
 }}
 #ifdef TEST
 #undef ZERO
 #define ZERO ((PRECISION) 0)
 #undef ONE
 #define ONE ((PRECISION) 1)
 #undef TWO
 #define TWO ((PRECISION) 2)
 /* Evaluate the rational approximation R(x) using the factored form */
 /* Evaluate R(x) from the factored form: the overall factor A times x
  * (type 0) or divided by x (type 1), times one ratio per pair of zero and
  * pole.  Where one of the two lists is shorter, the missing factor is taken
  * to be one. */
 static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
  const PRECISION xsq = x*x;
  PRECISION R;
  int m;
  if (rdata -> type != 0) {
    const int npairs = rdata -> deg_num/2;
    R = rdata -> A / x;
    for (m = 0; m < npairs; m++) {
      const PRECISION denom =
        2*(m+1) > rdata -> deg_denom ? ONE : xsq - rdata -> ap[m];
      R *= (xsq - rdata -> a[m]) / denom;
    }
  } else {
    const int npairs = rdata -> deg_denom/2;
    R = rdata -> A * x;
    for (m = 0; m < npairs; m++) {
      const PRECISION numer =
        2*(m+1) > rdata -> deg_num ? ONE : xsq - rdata -> a[m];
      R *= numer / (xsq - rdata -> ap[m]);
    }
  }
  return R;
 }
 /* Evaluate the rational approximation R(x) using the partial fraction form */
 /* Evaluate R(x) from the partial fraction form: the constant alpha[da-1]
  * plus one term per pole ap[m], plus the pole at zero for type 1, all
  * multiplied by x at the end. */
 static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
  const int npoles = rdata -> dd;
  PRECISION sum = rdata -> alpha[rdata -> da - 1];
  int m = 0;
  while (m < npoles) {
    sum += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
    m++;
  }
  if (rdata -> type == 1) {
    sum += rdata -> alpha[npoles] / (x * x);
  }
  return sum * x;
 }    
 /* Evaluate the rational approximation R(x) using continued fraction form. 
 *
 * If x = 0 and type = 1 then the result should be INF, whereas if x = 0 and
 * type = 0 then the result should be 0, but division by zero will occur at
 * intermediate stages of the evaluation. For IEEE implementations with
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */
 /* Evaluate R(x) from the continued fraction form, starting at the innermost
  * coefficient beta[0] and working outwards to beta[db-1]. */
 static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
  const int levels = rdata -> db;
  PRECISION R = rdata -> beta[0] * x;
  int m = 1;
  while (m < levels) {
    R = rdata -> beta[m] * x + ONE / R;
    m++;
  }
  return R;
 }    
 /* Evaluate the rational approximation R(x) using Cayley form */
 /* Evaluate R(x) from the Cayley form R = (1 - T) / (1 + T), where T is the
  * product of (gamma[m] - x) / (gamma[m] + x) over all n coefficients and
  * starts from +1 for type 0 and -1 otherwise. */
 static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
  const int count = rdata -> n;
  PRECISION T;
  int m;
  if (rdata -> type == 0) {
    T = ONE;
  } else {
    T = -ONE;
  }
  for (m = 0; m < count; m++) {
    const PRECISION g = rdata -> gamma[m];
    T *= (g - x) / (g + x);
  }
  return (ONE - T) / (ONE + T);
 }
 /* Test program. Apart from printing out the parameters for R(x) it produces
 * the following data files for plotting (unless NPLOT is defined):
 *
 * zolotarev-fn is a plot of R(x) for |x| < 1.2. This should look like sgn(x).
 *
 * zolotarev-err is a plot of the error |R(x) - sgn(x)| scaled by 1/Delta. This
 * should oscillate deg_num + deg_denom + 2 times between +1 and -1 over the
 * domain epsilon <= |x| <= 1.
 *
 * If ALLPLOTS is defined then zolotarev-partfrac (zolotarev-contfrac) is a
 * plot of the difference between the values of R(x) computed using the
 * factored form and the partial fraction (continued fraction) form, scaled by
 * 1/Delta. It should be zero everywhere. */
 #ifndef NPLOT
 /* Open a plot data file for writing; on failure report the reason on stderr
  * and terminate, so that no later fprintf is handed a NULL stream. */
 static FILE *open_plot_file(const char *filename) {
  FILE *fp = fopen(filename, "w");
  if (fp == NULL) {
    perror(filename);
    exit(EXIT_FAILURE);
  }
  return fp;
 }
 #endif /* NPLOT */
 /* Test driver.  Usage: prog epsilon n [type], with type 0 or 1 selecting the
  * corresponding Zolotarev approximation and type 2 selecting higham().
  * Prints all coefficient sets and, unless NPLOT is defined, writes the plot
  * data files and reports the largest deviation of the partial fraction,
  * continued fraction and Cayley evaluations from the factored form over
  * epsilon < |x| < 1.  Exits with EXIT_FAILURE on bad arguments or if a plot
  * file cannot be opened. */
 int main(int argc, char** argv) {
  int m, n, plotpts = 5000, type = 0;
  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
  zolotarev_data *rdata;
  PRECISION y;
  FILE *plot_function, *plot_error, 
    *plot_partfrac, *plot_contfrac, *plot_cayley;
  if (argc < 3 || argc > 4) {
    fprintf(stderr, "Usage: %s epsilon n [type]\n", *argv);
    exit(EXIT_FAILURE);
  }
  /* Reject arguments that do not parse: otherwise eps or n would be used
   * uninitialised.  First argument is epsilon, second is n, third (optional)
   * is type. */
  if (sscanf(argv[1], "%g", &eps) != 1 ||
      sscanf(argv[2], "%d", &n) != 1 ||
      (argc == 4 && sscanf(argv[3], "%d", &type) != 1)) {
    fprintf(stderr, "Usage: %s epsilon n [type]\n", *argv);
    exit(EXIT_FAILURE);
  }
  if (type < 0 || type > 2) {
    fprintf(stderr, "%s: type must be 0 (Zolotarev R(0) = 0),\n"
 	    "\t\t1 (Zolotarev R(0) = Inf, or 2 (Higham)\n", *argv);
    exit(EXIT_FAILURE);
  }
  rdata = type == 2 
    ? higham((PRECISION) eps, n) 
    : zolotarev((PRECISION) eps, n, type);
  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
 	 "\tPRECISION = " STRINGIFY(PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
 	 (float) rdata -> epsilon, rdata -> n, type,
 	 rdata -> deg_num, rdata -> deg_denom,
 	 rdata -> type == 1 ? "infinite" : "zero",
 	 (float) rdata -> Delta, (float) rdata -> A);
  /* Zeros and poles side by side, then whichever list is one longer */
  for (m = 0; m < MIN(rdata -> dd, rdata -> dn); m++)
    printf("\ta[%2d] = %14.8g\t\ta'[%2d] = %14.8g\n",
 	   m + 1, (float) rdata -> a[m], m + 1, (float) rdata -> ap[m]);
  if (rdata -> dd > rdata -> dn)
    printf("\t\t\t\t\ta'[%2d] = %14.8g\n",
 	   rdata -> dn + 1, (float) rdata -> ap[rdata -> dn]);
  if (rdata -> dd < rdata -> dn)
    printf("\ta[%2d] = %14.8g\n",
 	   rdata -> dd + 1, (float) rdata -> a[rdata -> dd]);
  printf("\n\tPartial fraction coefficients\n");
  printf("\talpha[ 0] = %14.8g\n",
 	 (float) rdata -> alpha[rdata -> da - 1]);
  for (m = 0; m < rdata -> dd; m++)
    printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
 	   m + 1, (float) rdata -> alpha[m], m + 1, (float) rdata -> ap[m]);
  if (rdata -> type == 1)
    printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
 	   rdata -> dd + 1, (float) rdata -> alpha[rdata -> dd],
 	   rdata -> dd + 1, (float) ZERO);
  printf("\n\tContinued fraction coefficients\n");
  for (m = 0; m < rdata -> db; m++)
    printf("\tbeta[%2d] = %14.8g\n", m, (float) rdata -> beta[m]);
  printf("\n\tCayley transform coefficients\n");
  for (m = 0; m < rdata -> n; m++)
    printf("\tgamma[%2d] = %14.8g\n", m, (float) rdata -> gamma[m]);
 #ifndef NPLOT
  plot_function = open_plot_file("zolotarev-fn.dat");
  plot_error = open_plot_file("zolotarev-err.dat");
 #ifdef ALLPLOTS
  plot_partfrac = open_plot_file("zolotarev-partfrac.dat");
  plot_contfrac = open_plot_file("zolotarev-contfrac.dat");
  plot_cayley = open_plot_file("zolotarev-cayley.dat");
 #endif /* ALLPLOTS */
  for (m = 0, maxypferr = maxycferr = maxycaylerr = 0.0; m <= plotpts; m++) {
    x = 2.4 * (float) m / plotpts - 1.2;
    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
      /* skip x = 0 for type 1, as R(0) is singular */
      y = zolotarev_eval((PRECISION) x, rdata);
      fprintf(plot_function, "%g %g\n", x, (float) y);
      fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      /* Only points inside the approximation domain count for the maxima */
      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
 	maxycferr = MAX(maxycferr, fabs(ycferr));
 	maxycaylerr = MAX(maxycaylerr, fabs(ycaylerr));
      }
 #ifdef ALLPLOTS
      fprintf(plot_partfrac, "%g %g\n", x, ypferr);
      fprintf(plot_contfrac, "%g %g\n", x, ycferr);
      fprintf(plot_cayley, "%g %g\n", x, ycaylerr);
 #endif /* ALLPLOTS */
    }
  }
 #ifdef ALLPLOTS
  fclose(plot_cayley);
  fclose(plot_contfrac);
  fclose(plot_partfrac);
 #endif /* ALLPLOTS */
  fclose(plot_error);
  fclose(plot_function);
  printf("\n\tMaximum PF error = %g (relative to Delta)\n", maxypferr);
  printf("\tMaximum CF error = %g (relative to Delta)\n", maxycferr);
  printf("\tMaximum Cayley error = %g (relative to Delta)\n", maxycaylerr);
 #endif /* NPLOT */
  free(rdata -> a);
  free(rdata -> ap);
  free(rdata -> alpha);
  free(rdata -> beta);
  free(rdata -> gamma);
  free(rdata);
  return EXIT_SUCCESS;
 }
 #endif /* TEST */
@@ -0,0 +1,87 @@
 /* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
 #ifdef __cplusplus
 namespace Grid {
 namespace Approx {
 #endif
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
 #ifndef ZOLOTAREV_INTERNAL
 #ifndef PRECISION
 #define PRECISION double
 #endif
 #define ZPRECISION PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif
 /* This struct contains the coefficients which parameterise an optimal rational
 * approximation to the signum function.
 *
 * The parameterisations used are:
 *
 * Factored form for type 0 (R0(0) = 0)
 *
 * R0(x) = A * x * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
 * .. dd-1),
 *
 * where deg_num = 2*dn + 1 and deg_denom = 2*dd.
 *
 * Factored form for type 1 (R1(0) = infinity)
 *
 * R1(x) = (A / x) * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
 * .. dd-1),
 *
 * where deg_num = 2*dn and deg_denom = 2*dd + 1. 
 *
 * Partial fraction form
 *
 * R(x) = alpha[da-1] * x + sum(alpha[j] * x / (x^2 - ap[j]), j = 0 .. da-2)
 *
 * where da = dd + 1 for type 0 and da = dd + 2 with ap[dd] = 0 for type 1.
 *
 * Continued fraction form 
 *
 * R(x) = beta[db-1] * x + 1 / (beta[db-2] * x + 1 / (beta[db-3] * x + ...))
 *
 * with the final coefficient being beta[0], with db = 2 * dd + 1 for type 0
 * and db = 2 * dd + 2 for type 1.
 *
 * Cayley form (Chiu's domain wall formulation)
 *
 * R(x) = (1 - T(x)) / (1 + T(x))
 *
 * where T(x) = prod((x - gamma[j]) / (x + gamma[j]), j = 0 .. n-1)
 */
 /* Coefficient container; instantiated once with ZPRECISION = PRECISION as
  * zolotarev_data and, when ZOLOTAREV_INTERNAL is defined by the including
  * file, a second time under whatever names ZOLOTAREV_DATA and ZPRECISION
  * are then defined to.  All pointer members are heap arrays. */
 typedef struct {
  ZPRECISION *a,      /* zeros of numerator, a[0 .. dn-1] */
    *ap,	      /* poles (zeros of denominator), ap[0 .. dd-1] */
    A,		      /* overall factor */
    *alpha,	      /* coefficients of partial fraction, alpha[0 .. da-1] */
    *beta,	      /* coefficients of continued fraction, beta[0 .. db-1] */
    *gamma,	      /* zeros of numerator of T in Cayley form, gamma[0 .. n-1] */
    Delta,	      /* maximum error, |R(x) - sgn(x)| <= Delta */
    epsilon;	      /* minimum x value, epsilon < |x| < 1 */
  int n,	      /* approximation degree */
    type,	      /* 0: R(0) = 0, 1: R(0) = infinity */
    dn, dd, da, db,   /* number of elements of a, ap, alpha, and beta */
    deg_num,	      /* degree of numerator = deg_denom +/- 1 */
    deg_denom;	      /* degree of denominator */
 } ZOLOTAREV_DATA;
 #ifndef ZOLOTAREV_INTERNAL
 /* zolotarev(epsilon, n, type) returns a pointer to an initialised
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */
 ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
 ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
 #ifdef __cplusplus
 }}
 #endif
@@ -0,0 +1,206 @@
 /*
  Mike Clark - 25th May 2005
  bigfloat.h
  Simple C++ wrapper for multiprecision datatype used by AlgRemez
  algorithm
 */
 #ifndef INCLUDED_BIGFLOAT_H
 #define INCLUDED_BIGFLOAT_H
 #include <gmp.h>
 #include <mpf2mpfr.h>
 #include <mpfr.h>
 /*
  * bigfloat: value-type C++ wrapper around a multiprecision mpf_t.
  *
  * Every constructor initialises the underlying mpf_t and the destructor
  * clears it, so instances can be copied, assigned and returned by value.
  * Arithmetic is provided as friend operators that build a fresh result;
  * pow_bf(bigfloat,bigfloat) and exp_bf call the mpfr_* API directly.
  */
 class bigfloat {
 private:
  mpf_t x;   /* the wrapped multiprecision value */
 public:
  /* Construction: one overload per source type, each initialising x. */
  bigfloat() { mpf_init(x); }
  bigfloat(const bigfloat& y) { mpf_init_set(x, y.x); }
  bigfloat(const unsigned long u) { mpf_init_set_ui(x, u); }
  bigfloat(const long i) { mpf_init_set_si(x, i); }
  bigfloat(const int i) {mpf_init_set_si(x,(long)i);}
  bigfloat(const float d) { mpf_init_set_d(x, (double)d); }
  bigfloat(const double d) { mpf_init_set_d(x, d); }  
  /* str is parsed as a base-10 number. */
  bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); }
  ~bigfloat(void) { mpf_clear(x); }
  /* Conversion back to double (loses the extra precision). */
  operator double (void) const { return (double)mpf_get_d(x); }
  /* Precision is requested in decimal digits and converted to bits with
   * the factor 3.321928094 ~= log2(10). */
  static void setDefaultPrecision(unsigned long dprec) {
    unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
    mpf_set_default_prec(bprec);
  }
  void setPrecision(unsigned long dprec) {
    unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
    mpf_set_prec(x,bprec);
  }
  /* Both getters return whatever mpf_get_prec / mpf_get_default_prec report
   * (a precision in bits, not the decimal digits passed to the setters). */
  unsigned long getPrecision(void) const { return mpf_get_prec(x); }
  unsigned long getDefaultPrecision(void) const { return mpf_get_default_prec(); }
  /* Assignment from each supported source type. */
  bigfloat& operator=(const bigfloat& y) {
    mpf_set(x, y.x); 
    return *this;
  }
  bigfloat& operator=(const unsigned long y) { 
    mpf_set_ui(x, y);
    return *this; 
  }
  bigfloat& operator=(const signed long y) {
    mpf_set_si(x, y); 
    return *this;
  }
  bigfloat& operator=(const float y) {
    mpf_set_d(x, (double)y); 
    return *this;
  }
  bigfloat& operator=(const double y) {
    mpf_set_d(x, y); 
    return *this;
  }
  /* Declared here, defined elsewhere. */
  size_t write(void);
  size_t read(void);
  /* Arithmetic Functions */
  /* Compound assignments are expressed through the binary operators below. */
  bigfloat& operator+=(const bigfloat& y) { return *this = *this + y; }
  bigfloat& operator-=(const bigfloat& y) { return *this = *this - y; }
  bigfloat& operator*=(const bigfloat& y) { return *this = *this * y; }
  bigfloat& operator/=(const bigfloat& y) { return *this = *this / y; }
  friend bigfloat operator+(const bigfloat& x, const bigfloat& y) {
    bigfloat a;
    mpf_add(a.x,x.x,y.x);
    return a;
  }
  friend bigfloat operator+(const bigfloat& x, const unsigned long y) {
    bigfloat a;
    mpf_add_ui(a.x,x.x,y);
    return a;
  }
  friend bigfloat operator-(const bigfloat& x, const bigfloat& y) {
    bigfloat a;
    mpf_sub(a.x,x.x,y.x);
    return a;
  }
  friend bigfloat operator-(const unsigned long x, const bigfloat& y) {
    bigfloat a;
    mpf_ui_sub(a.x,x,y.x);
    return a;
  }
  friend bigfloat operator-(const bigfloat& x, const unsigned long y) {
    bigfloat a;
    mpf_sub_ui(a.x,x.x,y);
    return a;
  }
  friend bigfloat operator-(const bigfloat& x) {
    bigfloat a;
    mpf_neg(a.x,x.x);
    return a;
  }
  friend bigfloat operator*(const bigfloat& x, const bigfloat& y) {
    bigfloat a;
    mpf_mul(a.x,x.x,y.x);
    return a;
  }
  friend bigfloat operator*(const bigfloat& x, const unsigned long y) {
    bigfloat a;
    mpf_mul_ui(a.x,x.x,y);
    return a;
  }
  friend bigfloat operator/(const bigfloat& x, const bigfloat& y){
    bigfloat a;
    mpf_div(a.x,x.x,y.x);
    return a;
  }
  friend bigfloat operator/(const unsigned long x, const bigfloat& y){
    bigfloat a;
    mpf_ui_div(a.x,x,y.x);
    return a;
  }
  friend bigfloat operator/(const bigfloat& x, const unsigned long y){
    bigfloat a;
    mpf_div_ui(a.x,x.x,y);
    return a;
  }
  friend bigfloat sqrt_bf(const bigfloat& x){
    bigfloat a;
    mpf_sqrt(a.x,x.x);
    return a;
  }
  friend bigfloat sqrt_bf(const unsigned long x){
    bigfloat a;
    mpf_sqrt_ui(a.x,x);
    return a;
  }
  friend bigfloat abs_bf(const bigfloat& x){
    bigfloat a;
    mpf_abs(a.x,x.x);
    return a;
  }
  /* Integer power a^power.
   * mpf_pow_ui takes an unsigned exponent, so a negative power must not be
   * passed straight through (it would wrap to a huge positive exponent).
   * Negative powers are evaluated as 1 / a^|power|; a must then be non-zero. */
  friend bigfloat pow_bf(const bigfloat& a, long power) {
    bigfloat b;
    if (power >= 0) {
      mpf_pow_ui(b.x,a.x,(unsigned long)power);
    } else {
      /* 0UL - (unsigned long)power is |power| in modular arithmetic, which
       * stays correct even for the most negative long. */
      mpf_pow_ui(b.x,a.x,0UL - (unsigned long)power);
      mpf_ui_div(b.x,1UL,b.x);
    }
    return b;
  }
  /* General power a^power, rounded to nearest. */
  friend bigfloat pow_bf(const bigfloat& a, bigfloat &power) {
    bigfloat b;
    mpfr_pow(b.x,a.x,power.x,GMP_RNDN);
    return b;
  }
  /* Exponential e^a, rounded to nearest. */
  friend bigfloat exp_bf(const bigfloat& a) {
    bigfloat b;
    mpfr_exp(b.x,a.x,GMP_RNDN);
    return b;
  }
  /* Comparison Functions */
  /* Both return 1 when the relation holds and 0 otherwise. */
  friend int operator>(const bigfloat& x, const bigfloat& y) {
    return (mpf_cmp(x.x,y.x) > 0) ? 1 : 0;
  }
  friend int operator<(const bigfloat& x, const bigfloat& y) {
    return (mpf_cmp(x.x,y.x) < 0) ? 1 : 0;
  }
  /* Declared here, defined elsewhere. */
  friend int sgn(const bigfloat&);
 };
 #endif
@@ -0,0 +1,189 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/bigfloat_double.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <math.h>
 #include <string>
 typedef double mfloat; 
 /*
  * bigfloat (double-precision variant).
  *
  * Offers the same member and friend interface as the GMP-backed bigfloat
  * class but stores a plain double, so all precision controls are no-ops.
  * A default-constructed value is zero, which is also what mpf_init gives
  * in the GMP-backed version.
  */
 class bigfloat {
 private:
  mfloat x;   /* the wrapped value */
 public:
  /* Construction: one overload per source type. */
  bigfloat() : x(0) { }                 /* zero, never indeterminate */
  bigfloat(const bigfloat& y) : x(y.x) { }
  bigfloat(const unsigned long u) : x(u) { }
  bigfloat(const long i) : x(i) { }
  bigfloat(const int i) : x(i) { }
  bigfloat(const float d) : x(d) { }
  bigfloat(const double d) : x(d) { }
  /* Parses str with std::stod, which throws if no conversion is possible. */
  bigfloat(const char *str) : x(std::stod(std::string(str))) { }
  ~bigfloat(void) { }
  operator double (void) const { return (double)x; }
  /* Precision requests are accepted and ignored; the getters report 64. */
  static void setDefaultPrecision(unsigned long dprec) {
  }
  void setPrecision(unsigned long dprec) {
  }
  unsigned long getPrecision(void) const { return 64; }
  unsigned long getDefaultPrecision(void) const { return 64; }
  /* Assignment from each supported source type. */
  bigfloat& operator=(const bigfloat& y)     { x=y.x;    return *this;  }
  bigfloat& operator=(const unsigned long y) { x=y; return *this; }
  bigfloat& operator=(const signed long y)   { x=y; return *this; }
  bigfloat& operator=(const float y)    { x=y; return *this; }
  bigfloat& operator=(const double y)   { x=y; return *this; }
  /* Declared here, defined elsewhere. */
  size_t write(void);
  size_t read(void);
  /* Arithmetic Functions */
  /* Compound assignments are expressed through the binary operators below. */
  bigfloat& operator+=(const bigfloat& y) { return *this = *this + y; }
  bigfloat& operator-=(const bigfloat& y) { return *this = *this - y; }
  bigfloat& operator*=(const bigfloat& y) { return *this = *this * y; }
  bigfloat& operator/=(const bigfloat& y) { return *this = *this / y; }
  friend bigfloat operator+(const bigfloat& x, const bigfloat& y) { 
    bigfloat a;
    a.x=x.x+y.x;
    return a;
  }
  friend bigfloat operator+(const bigfloat& x, const unsigned long y) {
    bigfloat a;
    a.x=x.x+y;
    return a;
  }
  friend bigfloat operator-(const bigfloat& x, const bigfloat& y) {
    bigfloat a;
    a.x=x.x-y.x;
    return a;
  }
  friend bigfloat operator-(const unsigned long x, const bigfloat& y) {
    bigfloat bx(x);
    return bx-y;
  }
  friend bigfloat operator-(const bigfloat& x, const unsigned long y) {
    bigfloat by(y);
    return x-by;
  }
  friend bigfloat operator-(const bigfloat& x) {
    bigfloat a;
    a.x=-x.x;
    return a;
  }
  friend bigfloat operator*(const bigfloat& x, const bigfloat& y) {
    bigfloat a;
    a.x=x.x*y.x;
    return a;
  }
  friend bigfloat operator*(const bigfloat& x, const unsigned long y) {
    bigfloat a;
    a.x=x.x*y;
    return a;
  }
  friend bigfloat operator/(const bigfloat& x, const bigfloat& y){
    bigfloat a;
    a.x=x.x/y.x;
    return a;
  }
  friend bigfloat operator/(const unsigned long x, const bigfloat& y){
    bigfloat bx(x);
    return bx/y;
  }
  friend bigfloat operator/(const bigfloat& x, const unsigned long y){
    bigfloat by(y);
    return x/by;
  }
  friend bigfloat sqrt_bf(const bigfloat& x){
    bigfloat a;
    a.x= sqrt(x.x);
    return a;
  }
  friend bigfloat sqrt_bf(const unsigned long x){
    bigfloat a(x);
    return sqrt_bf(a);
  }
  friend bigfloat abs_bf(const bigfloat& x){
    bigfloat a;
    a.x=fabs(x.x);
    return a;
  }
  /* Integer power a^power. */
  friend bigfloat pow_bf(const bigfloat& a, long power) {
    bigfloat b;
    b.x=pow(a.x,power);
    return b;
  }
  /* General power a^power. */
  friend bigfloat pow_bf(const bigfloat& a, bigfloat &power) {
    bigfloat b;
    b.x=pow(a.x,power.x);
    return b;
  }
  friend bigfloat exp_bf(const bigfloat& a) {
    bigfloat b;
    b.x=exp(a.x);
    return b;
  }
  /* Comparison Functions */
  /* Both return 1 when the relation holds and 0 otherwise. */
  friend int operator>(const bigfloat& x, const bigfloat& y) {
    return x.x>y.x;
  }
  friend int operator<(const bigfloat& x, const bigfloat& y) {
    return x.x<y.x;
  }
  /* Returns 1 for x >= 0 and 0 for x < 0.
   * NOTE(review): this is a non-negativity flag, not a -1/0/+1 sign; confirm
   * that callers expect this convention. */
  friend int sgn(const bigfloat& x) {
    if ( x.x>=0 )  return 1;   
    else return 0;
  }
  /* Miscellaneous Functions */
  //  friend bigfloat& random(void);
 };
@@ -0,0 +1,397 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/AdefGeneric.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
 #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
  /*
   * Compared to Tang-2009:  P=Pleft. P^T = PRight Q=MssInv. 
   * Script A = SolverMatrix 
   * Script P = Preconditioner
   *
   * Deflation methods considered
   *      -- Solve P A x = P b        [ like Luscher ]
   * DEF-1        M P A x = M P b     [i.e. left precon]
   * DEF-2        P^T M A x = P^T M b
   * ADEF-1       Preconditioner = M P + Q      [ Q + M + M A Q]
   * ADEF-2       Preconditioner = P^T M + Q
   * BNN          Preconditioner = P^T M P + Q
   * BNN2         Preconditioner = M P + P^TM +Q - M P A M 
   * 
   * Implement ADEF-2
   *
   * Vstart = P^Tx + Qb
   * M1 = P^TM + Q
   * M2=M3=1
   * Vout = x
   */
 // abstract base
 // Two-level flexible preconditioned conjugate gradient, templated on the
 // fine-grid vector type (Field) and the coarse-grid vector type (CoarseField).
 // operator() runs the iteration; the virtual hooks M1, M2, M3, Vstart and Vout
 // are the places where the deflation variants differ.
 //
 // NOTE(review): as written this class does not compile. The individual
 // problems are flagged with NOTE(review) comments at the lines concerned:
 // malformed member declarations, a constructor that initialises undeclared
 // members, a duplicated operator() header, "return k" inside a void function,
 // a garbled VstartDef2 parameter list, calls to M1/M3 that match no overload
 // declared here, and a missing ';' after the closing brace of the class.
 template<class Field, class CoarseField>
 class TwoLevelFlexiblePcg : public LinearFunction<Field>
 {
 public:
  int verbose;            // set to 0 by the constructor
  RealD   Tolerance;      // iteration stops once rn <= norm2(src)*Tolerance^2
  Integer MaxIterations;  // upper bound of the main loop (inclusive)
  const int mmax = 5;     // length of the p / mmp / pAp history buffers
  GridBase *grid;         // taken from src._grid at the start of operator()
  GridBase *coarsegrid;   // not assigned anywhere in this class
  // NOTE(review): the next declaration has no terminating ';' and the one after
  // it ends with ',' instead of ';'.
  LinearOperatorBase<Field>   *_Linop
  OperatorFunction<Field>     *_Smoother,
  LinearFunction<CoarseField> *_CoarseSolver;
  // Need something that knows how to get from coarse to fine and back again,
  // more than most operator functions do.
  //
  // NOTE(review): the initialiser list below names _PreconditionerLinop and
  // _Preconditioner, which are not declared in this class, and reads PrecLinop
  // and Preconditioner, which are not parameters. SmootherLinop, Smoother and
  // CoarseLinop are accepted but never stored, and CoarseLinop is taken by value.
  TwoLevelFlexiblePcg(RealD tol,
 		     Integer maxit,
 		     LinearOperatorBase<Field> *Linop,
 		     LinearOperatorBase<Field> *SmootherLinop,
 		     OperatorFunction<Field>   *Smoother,
 		     OperatorFunction<CoarseField>  CoarseLinop
 		     ) : 
      Tolerance(tol), 
      MaxIterations(maxit),
      _Linop(Linop),
      _PreconditionerLinop(PrecLinop),
      _Preconditioner(Preconditioner)
  { 
    verbose=0;
  };
  // The Pcg routine is common to all, but the various matrices differ from derived 
  // implementation to derived implementation
  //
  // NOTE(review): the function header appears twice in a row, which leaves one
  // '{' unmatched for the rest of the class.
  void operator() (const Field &src, Field &psi){
  void operator() (const Field &src, Field &psi){
    psi.checkerboard = src.checkerboard;
    grid             = src._grid;
    RealD f;
    RealD rtzp,rtz,a,d,b;
    RealD rptzp;
    RealD tn;
    RealD guess = norm2(psi);
    RealD ssq   = norm2(src);
    RealD rsq   = ssq*Tolerance*Tolerance;   // stopping threshold for rn
    /////////////////////////////
    // Set up history vectors
    /////////////////////////////
    std::vector<Field> p  (mmax,grid);   // search directions, indexed modulo mmax
    std::vector<Field> mmp(mmax,grid);   // matching operator-applied vectors
    std::vector<RealD> pAp(mmax);        // value returned by M3 for each slot
    Field x  (grid); x = psi;
    Field z  (grid);
    Field tmp(grid);
    Field r  (grid);
    Field mu (grid);
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
    //////////////////////////
    // NOTE(review): x was set to psi above and is overwritten with src here, so
    // the incoming guess is discarded before Vstart is called.
    x=src;
    Vstart(x,src);
    // r0 = b -A x0
    // NOTE(review): HermOp is called unqualified and is handed the whole mmp
    // vector, while the following line reads mmp[0].
    HermOp(x,mmp); // Shouldn't this be something else?
    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
    //////////////////////////////////
    // Compute z = M1 x
    //////////////////////////////////
    // NOTE(review): M1 is declared below with two parameters; this call passes
    // five, and neither mp nor SmootherMirs is declared in this class.
    M1(r,z,tmp,mp,SmootherMirs);
    rtzp =real(innerProduct(r,z));
    ///////////////////////////////////////
    // Solve for Mss mu = P A z and set p = z-mu
    // Def2: p = 1 - Q Az = Pright z 
    // Other algos M2 is trivial
    ///////////////////////////////////////
    M2(z,p[0]);
    for (int k=0;k<=MaxIterations;k++){
      int peri_k  = k % mmax;        // history slot of the current direction
      int peri_kp = (k+1) % mmax;    // history slot of the next direction
      rtz=rtzp;
      // NOTE(review): M3 is declared below with two parameters; four are passed.
      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
      a = rtz/d;
      // Memorise this
      pAp[peri_k] = d;
      axpy(x,a,p[peri_k],x);
      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
      // Compute z = M x
      // NOTE(review): four arguments here, five in the call before the loop.
      M1(r,z,tmp,mp);
      rtzp =real(innerProduct(r,z));
      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
      p[peri_kp]=p[peri_k];
      // Standard search direction  p -> z + b p    ; b = 
      // NOTE(review): b is computed but not used afterwards in this function,
      // and mu (the output of M2) is not used either.
      b = (rtzp)/rtz;
      int northog;
      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
      // Orthogonalise the new direction against up to northog stored directions.
      for(int back=0; back < northog; back++){
 	int peri_back = (k-back)%mmax;
 	RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
 	RealD beta = -pbApk/pAp[peri_back];
 	axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
      }
      RealD rrn=sqrt(rn/ssq);
      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
      // Stopping condition
      if ( rn <= rsq ) { 
 	HermOp(x,mmp); // Shouldn't this be something else?
 	axpy(tmp,-1.0,src,mmp[0]);
 	RealD psinorm = sqrt(norm2(x));
 	RealD srcnorm = sqrt(norm2(src));
 	RealD tmpnorm = sqrt(norm2(tmp));
 	RealD true_residual = tmpnorm/srcnorm;
 	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
 	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
 	// NOTE(review): the function is declared void but returns k, and the
 	// solution x is never copied into psi before returning.
 	return k;
      }
    }
    // Non-convergence
    assert(0);
  }
 public:
  // Empty hook; the base implementation does nothing.
  virtual void M(Field & in,Field & out,Field & tmp) {
  }
  // Preconditioner application: out = Min + Q (in - A Min), where Min is the
  // smoother output for in (see the formula below).
  // NOTE(review): PcgM, HermOp, ProjectToSubspace, ApplyInverse,
  // PromoteFromSubspace, PleftProj and PleftMss_proj are not declared in this
  // class; presumably they are meant to come from elsewhere -- confirm.
  virtual void M1(Field & in, Field & out) {// the smoother
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    Field tmp(grid);
    Field Min(grid);
    PcgM(in,Min); // Smoother call
    HermOp(Min,out);
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min
    ProjectToSubspace(tmp,PleftProj);     
    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
    axpy(out,1.0,Min,tmp); // Min+tmp
  }
  // Identity by default: copies in to out.
  virtual void M2(const Field & in, Field & out) {
    out=in;
    // Must override for Def2 only
    //  case PcgDef2:
    //    Pright(in,out);
    //    break;
  }
  // Applies the operator to p (result in mmp) through HermOpAndNorm and
  // returns its last output argument, dd.
  virtual RealD M3(const Field & p, Field & mmp){
    double d,dd;
    HermOpAndNorm(p,mmp,d,dd);
    return dd;
    // Must override for Def1 only
    //  case PcgDef1:
    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
    //    Pleft(mp,mmp);
    //    d=real(linop_d->inner(p,mmp));
  }
  // Deflated starting guess: x <- x + Q (src - A x), as derived below.
  // NOTE(review): the parameter list is garbled ("xconst"); a ", " appears to
  // be missing between the two parameters.
  virtual void VstartDef2(Field & xconst Field & src){
    //case PcgDef2:
    //case PcgAdef2: 
    //case PcgAdef2f:
    //case PcgV11f:
    ///////////////////////////////////
    // Choose x_0 such that 
    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
    //                               = [1 - Ass_inv A] Guess + Assinv src
    //                               = P^T guess + Assinv src 
    //                               = Vstart  [Tang notation]
    // This gives:
    // W^T (src - A x_0) = src_s - A guess_s - r_s
    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
    //                   = 0 
    ///////////////////////////////////
    Field r(grid);
    Field mmp(grid);
    HermOp(x,mmp);
    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
    ProjectToSubspace(r,PleftProj);     
    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
    PromoteFromSubspace(PleftMss_proj,mmp);  
    x=x+mmp;
  }
  // Default starting-guess hook: leaves x untouched.
  virtual void Vstart(Field & x,const Field & src){
    return;
  }
  /////////////////////////////////////////////////////////////////////
  // Only Def1 has non-trivial Vout. Override in Def1
  /////////////////////////////////////////////////////////////////////
  // Default: copies in to out. Not called from operator() above.
  virtual void   Vout  (Field & in, Field & out,Field & src){
    out = in;
    //case PcgDef1:
    //    //Qb + PT x
    //    ProjectToSubspace(src,PleftProj);     
    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
    //    PromoteFromSubspace(PleftMss_proj,tmp);  
    //    
    //    Pright(in,out);
    //    
    //    linop_d->axpy(out,tmp,out,1.0);
    //    break;
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Pright and Pleft are common to all implementations
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Right projector, following the block form sketched below.
  virtual void Pright(Field & in,Field & out){
    // P_R  = [ 1              0 ] 
    //        [ -Mss^-1 Msb    0 ] 
    Field in_sbar(grid);
    ProjectToSubspace(in,PleftProj);     
    PromoteFromSubspace(PleftProj,out);  
    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 
    HermOp(in_sbar,out);
    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
    PromoteFromSubspace(PleftMss_proj,out);     // 
    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
  }
  // Left projector, following the block form sketched below.
  virtual void Pleft (Field & in,Field & out){
    // P_L  = [ 1  -Mbs Mss^-1] 
    //        [ 0   0         ] 
    Field in_sbar(grid);
    Field    tmp2(grid);
    Field    Mtmp(grid);
    ProjectToSubspace(in,PleftProj);     
    PromoteFromSubspace(PleftProj,out);  
    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s
    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
    PromoteFromSubspace(PleftMss_proj,out);
    HermOp(out,Mtmp);
    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
    PromoteFromSubspace(PleftProj,tmp2);
    axpy(out,-1.0,tmp2,Mtmp);
    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
  }
 // NOTE(review): a class definition must be followed by ';'.
 }
 // Placeholder for the ADEF-2 variant: every hook is declared with an empty body.
 //
 // NOTE(review): this stub cannot compile as written:
 //  - TwoLevelFlexiblePcg takes two template parameters, but only one is given;
 //  - M3 returns RealD yet has no return statement;
 //  - the class definition is not followed by ';'.
 // Apart from M, the parameter lists differ from the base-class virtuals
 // (M1, M2, M3, Vstart), so they would declare new functions rather than
 // override the base ones.
 template<class Field>
 class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
 public:
  virtual void M(Field & in,Field & out,Field & tmp){
  } 
  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
  }
  virtual void M2(Field & in, Field & out){
  }
  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
  }
  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
  }
 }
 /*
 template<class Field>
 class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
 public:
  virtual void M(Field & in,Field & out,Field & tmp); 
  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
  virtual void M2(Field & in, Field & out);
  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
 }
 template<class Field>
 class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
 public:
  virtual void M(Field & in,Field & out,Field & tmp); 
  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
  virtual void M2(Field & in, Field & out);
  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
 }
 template<class Field>
 class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
 public:
  virtual void M(Field & in,Field & out,Field & tmp); 
  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
  virtual void M2(Field & in, Field & out);
  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
 }
 template<class Field>
 class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
 public:
  virtual void M(Field & in,Field & out,Field & tmp); 
  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
  virtual void M2(Field & in, Field & out);
  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
 }
 */
 #endif
@@ -0,0 +1,606 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
 Copyright (C) 2017
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
 #define GRID_BLOCK_CONJUGATE_GRADIENT_H
 namespace Grid {
 // Selects which solver BlockConjugateGradient::operator() dispatches to.
 enum BlockCGtype {
   BlockCG,     // handled by BlockCGsolve
   BlockCGrQ,   // handled by BlockCGrQsolve
   CGmultiRHS   // handled by CGmultiRHSsolve
 };
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
 template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;
  int blockDim ;           // grid dimension used as the block direction (the solvers copy it to Orthog)
  int Nblock;              // block count; the solvers set it from the grid's _fdimensions[blockDim]
  BlockCGtype CGtype;      // which solver operator() dispatches to
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;         // convergence threshold; BlockCGrQsolve compares against Tolerance*Tolerance
  Integer MaxIterations;   // upper bound on the iteration count
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  // Stores the solver selection and stopping parameters.
  //   cgtype          which algorithm operator() runs
  //   _Orthog         grid dimension used as the block direction
  //   tol             convergence tolerance
  //   maxit           maximum number of iterations
  //   err_on_no_conv  assert on non-convergence when true (the default)
  // The initialisers are listed in member declaration order, which is the
  // order they are executed in anyway, and the two members that are only
  // filled in by a solve (Nblock, IterationsToComplete) start at zero
  // rather than indeterminate.
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
    : blockDim(_Orthog),
      Nblock(0),
      CGtype(cgtype),
      ErrorOnNoConverge(err_on_no_conv),
      Tolerance(tol),
      MaxIterations(maxit),
      IterationsToComplete(0)
  {}
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation R = Q C along the block dimension.
 //   m_rr  (out) block inner-product matrix R^dag R, symmetrised
 //   C     (out) adjoint of the Cholesky factor L of m_rr
 //   Cinv  (out) inverse of C
 //   Q     (out) R C^{-1}
 //   R     (in)  the block vector to factorise
 void ThinQRfact (Eigen::MatrixXcd &m_rr,
 		 Eigen::MatrixXcd &C,
 		 Eigen::MatrixXcd &Cinv,
 		 Field & Q,
 		 const Field & R)
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  //Dimensions
  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
  //
  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
  //
  //   Q  C = R => Q = R C^{-1}
  //
  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
  //
  // Set C = L^{dag}, and then Q^dag Q = ident 
  //
  // Checks:
  // Cdag C = Rdag R ; passes.
  // QdagQ  = 1      ; passes
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
  // Force manifest hermiticity to remove rounding-related asymmetry.
  // The right-hand side reads m_rr through adjoint(), i.e. element (j,i) while
  // element (i,j) is being written, so it must be evaluated into a temporary
  // first; assigning the lazy expression directly would mix already-updated
  // and original entries.
  m_rr = (0.5*(m_rr+m_rr.adjoint())).eval();
 #if 0
  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
  auto  D_ldlt = m_rr.ldlt().vectorD(); 
  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
 #endif
  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Q = R C^{-1}
  //
  // Q_j  = R_i Cinv(i,j) 
  //
  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceMulMatrix(Q,Cinv,R,Orthog);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Call one of several implementations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Solve entry point: forwards (Linop, Src, Psi) to the solver selected by
 // CGtype and asserts if CGtype holds none of the known values.
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  switch (CGtype) {
  case BlockCGrQ:
    BlockCGrQsolve(Linop,Src,Psi);
    break;
  case BlockCG:
    BlockCGsolve(Linop,Src,Psi);
    break;
  case CGmultiRHS:
    CGmultiRHSsolve(Linop,Src,Psi);
    break;
  default:
    assert(0);
    break;
  }
 }
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQ implementation:
 //--------------------------
 // X is guess/Solution
 // B is RHS
 // Solve A X_i = B_i    ;        i refers to Nblock index
 ////////////////////////////////////////////////////////////////////////////
 // Block CG in the "rQ" formulation (algorithm summarised in the comment
 // block below).
 //   Linop  operator applied through Linop.HermOp
 //   B      right-hand sides, one per slice of the block dimension
 //   X      initial guess on entry, updated in place with the solution
 // On convergence prints the residuals and a timing breakdown and stores the
 // iteration count in IterationsToComplete. If the loop runs out, asserts when
 // ErrorOnNoConverge is set; otherwise IterationsToComplete is left at
 // MaxIterations+1 (the value of k on loop exit).
 void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = B._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  X.checkerboard = B.checkerboard;
  conformable(X, B);
  // Work fields are copy-constructed from B; each is overwritten before it is read.
  Field tmp(B);
  Field Q(B);
  Field D(B);
  Field Z(B);
  Field AD(B);
  // Small Nblock x Nblock matrices of the block recurrences.
  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  std::vector<RealD> ssq(Nblock);    // per-block sliceNorm of B; normalises the residuals below
  sliceNorm(ssq,B,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the source nor the initial guess may contain NaN.
  sliceNorm(residuals,B,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,X,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  /************************************************************************
   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
   ************************************************************************
   * Dimensions:
   *
   *   X,B==(Nferm x Nblock)
   *   A==(Nferm x Nferm)
   *  
   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
   * 
   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
   * for k: 
   *   Z  = AD
   *   M  = [D^dag Z]^{-1}
   *   X  = X + D MC
   *   QS = Q - ZM
   *   D  = Q + D S^dag
   *   C  = S C
   */
  ///////////////////////////////////////
  // Initial block: initial search dir is guess
  ///////////////////////////////////////
  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
  Linop.HermOp(X, AD);
  tmp = B - AD;  
  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;
  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch QRTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;   // declared outside the loop because it is read after the loop ends
  for (k = 1; k <= MaxIterations; k++) {
    //3. Z  = AD
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
    sliceMaddTimer.Start();
    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
    sliceMaddTimer.Stop();
    //6. QS = Q - ZM
    sliceMaddTimer.Start();
    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
    sliceMaddTimer.Stop();
    QRTimer.Start();
    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
    QRTimer.Stop();
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
    sliceMaddTimer.Start();
    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
    sliceMaddTimer.Stop();
    //8. C  = S C
    m_C = m_S*m_C;
    /*********************
     * convergence monitor
     *********************
     */
    // The diagonal of C^dag C is used as the per-block residual measure.
    m_rr = m_C.adjoint() * m_C;
    RealD max_resid=0;
    RealD rrsum=0;
    RealD rr;
    for(int b=0;b<Nblock;b++) {
      rrsum+=real(m_rr(b,b));
      rr = real(m_rr(b,b))/ssq[b];   // relative to the source norm of block b
      if ( rr > max_resid ) max_resid = rr;
    }
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
    // Converged once the worst block passes; max_resid is compared with the
    // squared tolerance and printed through std::sqrt below.
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
 		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // Recompute the residual explicitly from the final X for reporting.
      Linop.HermOp(X, AD);
      AD = AD-B;
      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient; original O'Leary algorithm. Dimension zero should be the block direction.
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient in the original O'Leary form.
 //
 // All Nblock slices along dimension blockDim are iterated together, starting
 // from the guess passed in Psi; Psi is updated in place. The loop stops once
 // the largest per-slice ratio (diagonal of R^dag R) / (source slice norm)
 // is below Tolerance*Tolerance, or after MaxIterations sweeps.
 // IterationsToComplete is filled in on both exits.
 void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = Src._grid->_fdimensions[Orthog];

  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

  Psi.checkerboard = Src.checkerboard;
  conformable(Psi, Src);

  // Work fields, all copy-constructed from the source
  Field Dir(Src);   // search directions
  Field ADir(Src);  // operator applied to a field; doubles as scratch for the next Dir
  Field Res(Src);   // residual

  // Nblock x Nblock matrices coupling the slices
  Eigen::MatrixXcd dirAdir     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd dirAdir_inv = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd resres      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd resres_inv  = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd alphaMat    = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd betaMat     = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  // Per-slice source norms (and their sum) set the scale of the convergence test
  std::vector<RealD> residuals(Nblock);
  std::vector<RealD> srcScale(Nblock);
  sliceNorm(srcScale,Src,Orthog);
  RealD srcScaleSum=0;
  for(int b=0;b<Nblock;b++) srcScaleSum+=srcScale[b];

  // Refuse to start from a source or guess that already contains NaNs
  sliceNorm(residuals,Src,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,Psi,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  // Apply the operator to the initial guess
  Linop.HermOp(Psi, ADir);

  /************************************************************************
   * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
   ************************************************************************
   * O'Leary : R = B - A X
   * O'Leary : P = M R ; preconditioner M = 1
   * O'Leary : alpha = PAP^{-1} RMR
   * O'Leary : beta  = RMR^{-1}_old RMR_new
   * O'Leary : X=X+Palpha
   * O'Leary : R_new=R_old-AP alpha
   * O'Leary : P=MR_new+P beta
   */
  Res = Src - ADir;  
  Dir = Res;
  sliceInnerProductMatrix(resres,Res,Res,Orthog);

  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();

  int k;
  for (k = 1; k <= MaxIterations; k++) {

    // Progress report based on the residual matrix entering this sweep
    RealD resSum=0;
    for(int b=0;b<Nblock;b++) resSum+=real(resres(b,b));
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<resSum<<" ssq_sum "<< srcScaleSum
              <<" / "<<std::sqrt(resSum/srcScaleSum) <<std::endl;

    MatrixTimer.Start();
    Linop.HermOp(Dir, ADir);
    MatrixTimer.Stop();

    // alpha = (P^dag A P)^{-1} (R^dag R)
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(dirAdir,Dir,ADir,Orthog);
    sliceInnerTimer.Stop();
    dirAdir_inv = dirAdir.inverse();
    alphaMat    = dirAdir_inv * resres ;

    // Solution and residual updates
    sliceMaddTimer.Start();
    sliceMaddMatrix(Psi,alphaMat, Dir,Psi,Orthog);     // add alpha *  P to psi
    sliceMaddMatrix(Res,alphaMat,ADir,Res,Orthog,-1.0);// sub alpha * AP to resid
    sliceMaddTimer.Stop();

    // beta = (R^dag R)_old^{-1} (R^dag R)_new; invert before resres is overwritten
    resres_inv = resres.inverse();
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(resres,Res,Res,Orthog);
    sliceInnerTimer.Stop();
    betaMat = resres_inv *resres;

    // New search direction is assembled in ADir, then copied back into Dir
    sliceMaddTimer.Start();
    sliceMaddMatrix(ADir,betaMat,Dir,Res,Orthog);
    sliceMaddTimer.Stop();
    Dir= ADir;

    /*********************
     * convergence monitor
     *********************
     */
    RealD worst=0;
    for(int b=0;b<Nblock;b++){
      RealD ratio = real(resres(b,b))/srcScale[b];
      worst = ( ratio > worst ) ? ratio : worst;
    }

    // Not there yet: go round again
    if ( !(worst < Tolerance*Tolerance) ) continue;

    SolverTimer.Stop();

    std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
    for(int b=0;b<Nblock;b++){
      std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
                << std::sqrt(real(resres(b,b))/srcScale[b])<<std::endl;
    }
    std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(worst)<<std::endl;

    // Recompute the residual from scratch for the report
    Linop.HermOp(Psi, ADir);
    ADir = ADir-Src;
    std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(ADir)/norm2(Src)) <<std::endl;

    std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
    std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;

    IterationsToComplete = k;
    return;
  }

  std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
 // Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
 // Conjugate gradient run independently on every slice along blockDim.
 //
 // Each of the Nblock slices gets its own scalar alpha and beta, so the slices
 // do not mix; they only share the operator application. Psi holds the guess on
 // entry and is updated in place. The loop stops when the largest per-slice
 // ratio of residual norm to source norm falls below Tolerance*Tolerance, or
 // after MaxIterations sweeps. IterationsToComplete is filled in on both exits.
 void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  int Orthog = blockDim; // First dimension is block dim
  Nblock = Src._grid->_fdimensions[Orthog];

  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

  Psi.checkerboard = Src.checkerboard;
  conformable(Psi, Src);

  // Work fields, all copy-constructed from the source
  Field Dir(Src);   // search directions
  Field ADir(Src);  // operator applied to a field
  Field Res(Src);   // residual

  // One coefficient per slice
  std::vector<ComplexD> dirAdir(Nblock);
  std::vector<RealD> resNorm (Nblock);
  std::vector<RealD> resNormInv(Nblock);
  std::vector<RealD> alphaVec(Nblock);
  std::vector<RealD> betaVec(Nblock);

  // Per-slice source norms (and their sum) set the scale of the convergence test
  std::vector<RealD> residuals(Nblock);
  std::vector<RealD> srcScale(Nblock);
  sliceNorm(srcScale,Src,Orthog);
  RealD srcScaleSum=0;
  for(int b=0;b<Nblock;b++) srcScaleSum+=srcScale[b];

  // Refuse to start from a source or guess that already contains NaNs
  sliceNorm(residuals,Src,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,Psi,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  // Initial residual from the guess; first search direction is that residual
  Linop.HermOp(Psi, ADir);
  Res = Src - ADir;  
  Dir = Res;
  sliceNorm(resNorm,Res,Orthog);

  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch sliceNormTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();

  int k;
  for (k = 1; k <= MaxIterations; k++) {

    // Progress report based on the residual norms entering this sweep
    RealD resSum=0;
    for(int b=0;b<Nblock;b++) resSum+=real(resNorm[b]);
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<resSum<<" ssq_sum "<< srcScaleSum
              <<" / "<<std::sqrt(resSum/srcScaleSum) <<std::endl;

    MatrixTimer.Start();
    Linop.HermOp(Dir, ADir);
    MatrixTimer.Stop();

    // alpha[b] = rr[b] / Re(p^dag A p)[b]
    sliceInnerTimer.Start();
    sliceInnerProductVector(dirAdir,Dir,ADir,Orthog);
    sliceInnerTimer.Stop();
    for(int b=0;b<Nblock;b++){
      alphaVec[b] = resNorm[b]/real(dirAdir[b]);
    }

    // Solution and residual updates
    sliceMaddTimer.Start();
    sliceMaddVector(Psi,alphaVec, Dir,Psi,Orthog);     // add alpha *  P to psi
    sliceMaddVector(Res,alphaVec,ADir,Res,Orthog,-1.0);// sub alpha * AP to resid
    sliceMaddTimer.Stop();

    // beta[b] = rr_new[b] / rr_old[b]; take the reciprocal before resNorm is overwritten
    for(int b=0;b<Nblock;b++){
      resNormInv[b] = 1.0/resNorm[b];
    }
    sliceNormTimer.Start();
    sliceNorm(resNorm,Res,Orthog);
    sliceNormTimer.Stop();
    for(int b=0;b<Nblock;b++){
      betaVec[b] = resNormInv[b] *resNorm[b];
    }

    // New search direction, written over the old one
    sliceMaddTimer.Start();
    sliceMaddVector(Dir,betaVec,Dir,Res,Orthog);
    sliceMaddTimer.Stop();

    /*********************
     * convergence monitor
     *********************
     */
    RealD worst=0;
    for(int b=0;b<Nblock;b++){
      RealD ratio = resNorm[b]/srcScale[b];
      worst = ( ratio > worst ) ? ratio : worst;
    }

    // Not there yet: go round again
    if ( !(worst < Tolerance*Tolerance) ) continue;

    SolverTimer.Stop();

    std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
    for(int b=0;b<Nblock;b++){
      std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(resNorm[b]/srcScale[b])<<std::endl;
    }
    std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(worst)<<std::endl;

    // Recompute the residual from scratch for the report
    Linop.HermOp(Psi, ADir);
    ADir = ADir-Src;
    std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(ADir)/norm2(Src)) <<std::endl;

    std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
    std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;

    IterationsToComplete = k;
    return;
  }

  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 };
 }
 #endif
@@ -0,0 +1,177 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/ConjugateGradient.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_H
 #define GRID_CONJUGATE_GRADIENT_H
 namespace Grid {
 /////////////////////////////////////////////////////////////
 // Base classes for iterative processes based on operators
 // single input vec, single output vec.
 /////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////
 // Conjugate gradient for a single right-hand side.
 //
 // operator() iterates on Linop.HermOp starting from the guess held in psi and
 // overwrites psi with the improved solution. The solve is declared converged
 // when the recursively updated residual norm2(r) is <= Tolerance^2 * norm2(src).
 //
 //   Tolerance            relative residual target
 //   MaxIterations        upper bound on the number of CG iterations
 //   ErrorOnNoConverge    assert when the solve fails to converge (default true)
 //   IterationsToComplete iterations used by the most recent solve; 0 when the
 //                        guess already met the target, MaxIterations+1 when
 //                        the iteration limit was exhausted
 /////////////////////////////////////////////////////////////
 template <class Field>
 class ConjugateGradient : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion

  // Initialisers are listed in declaration order (the order in which members
  // are actually constructed).
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv),
        Tolerance(tol),
        MaxIterations(maxit),
        IterationsToComplete(0){};

  // Solve for psi given src; psi carries the initial guess on entry.
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);

    RealD cp, c, a, d, b, ssq, qq;

    Field p(src);
    Field mmp(src);
    Field r(src);

    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);

    Linop.HermOpAndNorm(psi, mmp, d, b);

    r = src - mmp;
    p = r;

    a = norm2(p);
    cp = a;
    ssq = norm2(src);

    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:  cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:     p " << a << std::endl;

    RealD rsq = Tolerance * Tolerance * ssq;

    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      // No iterations were needed; record that so callers reading
      // IterationsToComplete never see a stale or indeterminate value.
      IterationsToComplete = 0;
      return;
    }

    std::cout << GridLogIterative << std::setprecision(8)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

    GridStopWatch LinalgTimer;
    GridStopWatch InnerTimer;
    GridStopWatch AxpyNormTimer;
    GridStopWatch LinearCombTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;

    SolverTimer.Start();
    int k;
    // Honour the configured iteration limit exactly.
    for (k = 1; k <= MaxIterations; k++) {
      c = cp;

      MatrixTimer.Start();
      Linop.HermOp(p, mmp);
      MatrixTimer.Stop();

      LinalgTimer.Start();

      InnerTimer.Start();
      ComplexD dc  = innerProduct(p,mmp);
      InnerTimer.Stop();
      d = dc.real();
      a = c / d;

      // r <- r - a * mmp, returning the new residual norm
      AxpyNormTimer.Start();
      cp = axpy_norm(r, -a, mmp, r);
      AxpyNormTimer.Stop();
      b = cp / c;

      // Fused update: psi uses the old p, so it must be written before p is.
      LinearCombTimer.Start();
      parallel_for(int ss=0;ss<src._grid->oSites();ss++){
        vstream(psi[ss], a      *  p[ss] + psi[ss]);
        vstream(p  [ss], b      *  p[ss] + r[ss]);
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();

      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;

      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();

        // Recompute the residual from the final solution for the report
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;

        RealD srcnorm = sqrt(norm2(src));
        RealD resnorm = sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;

        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
        std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
        std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;

        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
        std::cout << GridLogPerformance << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

        // Guard against a recursive residual that has drifted far from the true one
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

        IterationsToComplete = k;

        return;
      }
    }
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
              << std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
 };
 }
 #endif
@@ -0,0 +1,154 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 namespace Grid {
  // Mixed precision restarted defect correction CG.
  //
  // Each outer iteration computes the double precision residual, solves for a
  // correction with a single precision ConjugateGradient on Linop_f, and adds
  // that correction to the double precision solution. Once the residual is
  // below OuterLoopNormMult * Tolerance^2 * norm2(src) (or MaxOuterIterations
  // restarts have been used) a final double precision CG on Linop_d finishes
  // the solve. sol_d carries the initial guess on entry.
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:                                                
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance

    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;

    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;

    // Initialisers are listed in declaration order (the order in which members
    // are actually constructed); the iteration counters start at zero.
    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.),
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      TotalInnerIterations(0), TotalOuterIterations(0), TotalFinalStepIterations(0),
      guesser(NULL){ };

    // Install a guess generator for the inner single precision solves.
    // Only the address is stored, so g must outlive this object's use of it.
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
    }

    // Solve for sol_d given src_d_in.
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
      TotalInnerIterations = 0;

      GridStopWatch TotalTimer;
      TotalTimer.Start();

      int cb = src_d_in.checkerboard;
      sol_d.checkerboard = cb;

      RealD src_norm = norm2(src_d_in);
      RealD stop = src_norm * Tolerance*Tolerance;

      GridBase* DoublePrecGrid = src_d_in._grid;
      FieldD tmp_d(DoublePrecGrid);
      tmp_d.checkerboard = cb;

      FieldD tmp2_d(DoublePrecGrid);
      tmp2_d.checkerboard = cb;

      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation

      RealD inner_tol = InnerTolerance;

      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;

      FieldF sol_f(SinglePrecGrid);
      sol_f.checkerboard = cb;

      // The inner solver is allowed to give up quietly; the outer loop restarts it
      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
      CG_f.ErrorOnNoConverge = false;

      GridStopWatch InnerCGtimer;

      GridStopWatch PrecChangeTimer;

      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count

      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d, tmp_d);
        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector

        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

        if(norm < OuterLoopNormMult * stop){
          std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
          break;
        }
        // Loosen the inner tolerance until the inner target is no tighter than
        // the overall one
        while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

        PrecChangeTimer.Start();
        precisionChange(src_f, src_d);
        PrecChangeTimer.Stop();

        zeroit(sol_f);

        //Optionally improve inner solver guess (eg using known eigenvectors)
        if(guesser != NULL)
          (*guesser)(src_f, sol_f);

        //Inner CG
        CG_f.Tolerance = inner_tol;
        // Reset the counter so a solve that returns without iterating adds
        // zero here rather than a stale or indeterminate value.
        CG_f.IterationsToComplete = 0;
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f, sol_f);
        InnerCGtimer.Stop();
        TotalInnerIterations += CG_f.IterationsToComplete;

        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f);
        PrecChangeTimer.Stop();

        axpy(sol_d, 1.0, tmp_d, sol_d);
      }

      //Final trial CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;

      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
      // The outer loop may already have reached the target, in which case the
      // final solve can return without iterating; start the counter at zero.
      CG_d.IterationsToComplete = 0;
      CG_d(Linop_d, src_d_in, sol_d);
      TotalFinalStepIterations = CG_d.IterationsToComplete;

      TotalTimer.Stop();
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
 }
 #endif
@@ -0,0 +1,322 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
 #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Base classes for iterative processes based on operators
    // single input vec, single output vec.
    /////////////////////////////////////////////////////////////
  // Multi-shift conjugate gradient.
  //
  // For every shift s in "shifts" this solves
  //     ( Linop.HermOp + shifts.poles[s] ) psi[s] = src
  // with a single Krylov sequence built for shift 0, which must have the
  // smallest pole. The initial guess is wired to zero. Each shift has its own
  // target shifts.tolerances[s]; a shift stops being updated once it has met
  // its target. The Tolerance member is not read anywhere in this class.
  //
  // IterationsToComplete is the iteration on which all shifts had converged,
  // or MaxIterations+1 if the iteration limit was exhausted first.
  template<class Field> 
    class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
                                        public OperatorFunction<Field>
    {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
    int verbose;
    MultiShiftFunction shifts;

    // The shift description is copied into the solver.
    ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
        MaxIterations(maxit),
        IterationsToComplete(0),
        verbose(1),
        shifts(_shifts)
    { 
    }

 // Single-output form: solves all shifts into temporaries and returns their
 // weighted sum in psi.
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
 {
  GridBase *grid = src._grid;
  int nshift = shifts.order;
  std::vector<Field> results(nshift,grid);
  (*this)(Linop,src,results,psi);
 }

 // Solves all shifts into results, then forms
 //   psi = shifts.norm * src + sum_i shifts.residues[i] * results[i]
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
 {
  int nshift = shifts.order;

  (*this)(Linop,src,results);

  psi = shifts.norm*src;
  for(int i=0;i<nshift;i++){
    psi = psi + shifts.residues[i]*results[i];
  }

  return;
 }

 // Core solver: psi must already hold one field per shift; psi[s] receives
 // the solution for shift s.
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
 {
  GridBase *grid = src._grid;

  ////////////////////////////////////////////////////////////////////////
  // Convenience references to the info stored in "MultiShiftFunction"
  ////////////////////////////////////////////////////////////////////////
  int nshift = shifts.order;

  std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
  std::vector<RealD> &mresidual(shifts.tolerances);
  std::vector<RealD> alpha(nshift,1.0);
  std::vector<Field>   ps(nshift,grid);// Search directions

  assert(psi.size()==nshift);
  assert(mass.size()==nshift);
  assert(mresidual.size()==nshift);
  // mass[0] is read unconditionally below, so there must be at least one shift
  assert(nshift > 0);

  // dynamic sized arrays on stack; 2d is a pain with vector
  RealD  bs[nshift];
  RealD  rsq[nshift];
  RealD  z[nshift][2];   // two most recent values of the per-shift recurrence, toggled by iz
  int     converged[nshift];

  const int       primary =0;

  //Primary shift fields CG iteration
  RealD a,b,c,d;
  RealD cp,bp,qq; //prev

  // Matrix mult fields
  Field r(grid);
  Field p(grid);
  Field tmp(grid);
  Field mmp(grid);

  // Check lightest mass
  for(int s=0;s<nshift;s++){
    assert( mass[s]>= mass[primary] );
    converged[s]=0;
  }

  // Wire guess to zero
  // Residuals "r" are src
  // First search direction "p" is also src
  cp = norm2(src);
  for(int s=0;s<nshift;s++){
    rsq[s] = cp * mresidual[s] * mresidual[s];
    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
             <<" target resid "<<rsq[s]<<std::endl;
    ps[s] = src;
  }
  // r and p for primary
  r=src;
  p=src;

  //MdagM+m[0]
  Linop.HermOpAndNorm(p,mmp,d,qq);
  axpy(mmp,mass[0],p,mmp);
  RealD rn = norm2(p);
  d += rn*mass[0];

  // have verified that inner product of 
  // p and mmp is equal to d after this since
  // the d computation is tricky
  //  qq = real(innerProduct(p,mmp));
  //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;

  b = -cp /d;

  // Set up the various shift variables
  int       iz=0;
  z[0][1-iz] = 1.0;
  z[0][iz]   = 1.0;
  bs[0]      = b;
  for(int s=1;s<nshift;s++){
    z[s][1-iz] = 1.0;
    z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
    bs[s]      = b*z[s][iz]; 
  }

  // r += b[0] A.p[0]
  // c= norm(r)
  c=axpy_norm(r,b,mmp,r);

  // First solution update: psi[s] = -bs[s]*alpha[s] * src
  for(int s=0;s<nshift;s++) {
    axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
  }

  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
  GridStopWatch AXPYTimer;
  GridStopWatch ShiftTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();

  // Iteration loop
  int k;

  for (k=1;k<=MaxIterations;k++){

    a = c /cp;
    AXPYTimer.Start();
    axpy(p,a,p,r);
    AXPYTimer.Stop();

    // Note to self - direction ps is iterated separately
    // for each shift. Does not appear to have any scope
    // for avoiding linear algebra in "single" case.
    // 
    // However SAME r is used. Could load "r" and update
    // ALL ps[s]. 2/3 Bandwidth saving
    // New Kernel: Load r, vector of coeffs, vector of pointers ps
    AXPYTimer.Start();
    for(int s=0;s<nshift;s++){
      if ( ! converged[s] ) { 
        if (s==0){
          axpy(ps[s],a,ps[s],r);
        } else{
          RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
          axpby(ps[s],z[s][iz],as,r,ps[s]);
        }
      }
    }
    AXPYTimer.Stop();

    cp=c;
    MatrixTimer.Start();  
    //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
    // The below is faster on KNL
    Linop.HermOp(p,mmp); 
    d=real(innerProduct(p,mmp));

    MatrixTimer.Stop();  

    AXPYTimer.Start();
    axpy(mmp,mass[0],p,mmp);
    AXPYTimer.Stop();
    RealD rn = norm2(p);
    d += rn*mass[0];

    bp=b;
    b=-cp/d;

    AXPYTimer.Start();
    c=axpy_norm(r,b,mmp,r);
    AXPYTimer.Stop();

    // Toggle the recurrence history
    bs[0] = b;
    iz = 1-iz;
    ShiftTimer.Start();
    for(int s=1;s<nshift;s++){
      if((!converged[s])){
        RealD z0 = z[s][1-iz];
        RealD z1 = z[s][iz];
        z[s][iz] = z0*z1*bp
          / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
        bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
      }
    }
    ShiftTimer.Stop();

    for(int s=0;s<nshift;s++){
      int ss = s;
      // Scope for optimisation here in case of "single".
      // Could load psi[0] and pull all ps[s] in.
      //      if ( single ) ss=primary;
      // Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
      // Pipelined CG gain:
      //
      // New Kernel: Load r, vector of coeffs, vector of pointers ps
      // New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
      // If can predict the coefficient bs then we can fuse these and avoid write reread cycle
      //  on ps[s].
      // Before:  3 x npole  + 3 x npole
      // After :  2 x npole (ps[s])        => 3x speed up of multishift CG.

      if( (!converged[s]) ) { 
        axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
      }
    }

    // Convergence checks: a shift is done once c * z^2 drops below its target.
    // Shifts that converged on an earlier iteration are skipped.
    int all_converged = 1;
    for(int s=0;s<nshift;s++){

      if ( (!converged[s]) ){

        RealD css  = c * z[s][iz]* z[s][iz];

        if(css<rsq[s]){
          std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
          converged[s]=1;
        } else {
          all_converged=0;
        }

      }
    }

    if ( all_converged ){

      SolverTimer.Stop();

      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;

      // Check answers: r = (HermOp + mass[s]) psi[s] - alpha[s] * src
      for(int s=0; s < nshift; s++) { 
        Linop.HermOpAndNorm(psi[s],mmp,d,qq);
        axpy(tmp,mass[s],psi[s],mmp);
        axpy(r,-alpha[s],src,tmp);
        RealD rn = norm2(r);
        RealD cn = norm2(src);
        std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;

      IterationsToComplete = k;

      return;
    }

  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
  // Record the iteration count on failure as well, so callers never read a
  // value left over from an earlier solve.
  IterationsToComplete = k;
 //  assert(0);
 }

  };
 }
 #endif
@@ -0,0 +1,256 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
 #define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
 namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////
  // Mixed precision conjugate gradient with "reliable updates".
  //
  // The inner iteration runs on FieldF using Linop_f (or the optional fallback
  // operator). Whenever the iterated residual drops below
  // Delta * (largest residual seen since the previous update) the accumulated
  // FieldF solution is folded into the FieldD solution and the residual is
  // recomputed with Linop_d. On convergence an optional pure FieldD CG "cleanup"
  // solve is run.
  //
  // The enable_if guards restrict FieldD to getPrecision == 2 and FieldF to
  // getPrecision == 1.
  /////////////////////////////////////////////////////////////////////////////////////
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
  public:
    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
    // Defaults true.
    RealD Tolerance;
    Integer MaxIterations;
    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
    Integer ReliableUpdatesPerformed; //Number of reliable updates done by the last solve
    bool DoFinalCleanup; //Final DP cleanup, defaults to true
    Integer IterationsToCleanup; //Final DP cleanup step iterations
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;
    GridBase* SinglePrecGrid;
    RealD Delta; //reliable update parameter
    //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
    LinearOperatorBase<FieldF> *Linop_fallback;
    RealD fallback_transition_tol;

    // Every member is initialised, in declaration order, so that the counters hold
    // well defined values (zero) even before the first solve or after an early return.
    ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv),
        Tolerance(tol),
        MaxIterations(maxit),
        IterationsToComplete(0),
        ReliableUpdatesPerformed(0),
        DoFinalCleanup(true),
        IterationsToCleanup(0),
        Linop_f(_Linop_f),
        Linop_d(_Linop_d),
        SinglePrecGrid(_sp_grid),
        Delta(_delta),
        Linop_fallback(NULL),
        fallback_transition_tol(0.0)
    {};

    // Register an alternative FieldF operator; the solve switches to it (once) as soon as
    // the iterated squared residual cp falls below _fallback_transition_tol.
    void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
      Linop_fallback = &_Linop_fallback;
      fallback_transition_tol = _fallback_transition_tol;      
    }

    // Solve Linop_d psi = src, using psi on entry as the initial guess.
    // On return IterationsToComplete / ReliableUpdatesPerformed / IterationsToCleanup
    // describe this solve (all zero when the initial guess already satisfies the target).
    void operator()(const FieldD &src, FieldD &psi) {
      LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
      bool using_fallback = false;

      psi.checkerboard = src.checkerboard;
      conformable(psi, src);

      // Reset the per-solve statistics so no exit path reports stale or uninitialised values.
      IterationsToComplete     = 0;
      ReliableUpdatesPerformed = 0;
      IterationsToCleanup      = 0;

      RealD cp, c, a, d, b, ssq, qq, b_pred;

      FieldD p(src);
      FieldD mmp(src);
      FieldD r(src);

      // Initial residual computation & set up
      RealD guess = norm2(psi);
      assert(std::isnan(guess) == 0);

      Linop_d.HermOpAndNorm(psi, mmp, d, b);

      r = src - mmp;
      p = r;

      a = norm2(p);
      cp = a;
      ssq = norm2(src);

      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;

      // Target for the squared residual
      RealD rsq = Tolerance * Tolerance * ssq;

      // Check if guess is really REALLY good :)
      if (cp <= rsq) {
        std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
        return;
      }

      //Single prec initialization
      FieldF r_f(SinglePrecGrid);
      r_f.checkerboard = r.checkerboard;
      precisionChange(r_f, r);

      FieldF psi_f(r_f);
      psi_f = zero;        // FieldF correction accumulated since the last reliable update

      FieldF p_f(r_f);     // search direction starts out equal to the residual
      FieldF mmp_f(r_f);

      RealD MaxResidSinceLastRelUp = cp; //initial residual    

      std::cout << GridLogIterative << std::setprecision(4)
                << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

      GridStopWatch LinalgTimer;
      GridStopWatch MatrixTimer;
      GridStopWatch SolverTimer;

      SolverTimer.Start();
      int k = 0;   // iteration counter
      int l = 0;   // reliable update counter

      for (k = 1; k <= MaxIterations; k++) {
        c = cp;

        MatrixTimer.Start();
        Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
        MatrixTimer.Stop();

        LinalgTimer.Start();

        a = c / d;
        b_pred = a * (a * qq - d) / c;

        cp = axpy_norm(r_f, -a, mmp_f, r_f);
        b = cp / c;

        // Fuse these loops ; should be really easy
        psi_f = a * p_f + psi_f;
        //p_f = p_f * b + r_f;

        LinalgTimer.Stop();

        std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
                  << " residual " << cp << " target " << rsq << std::endl;
        std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
        std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;

        if(cp > MaxResidSinceLastRelUp){
          std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
          MaxResidSinceLastRelUp = cp;
        }

        // Stopping condition
        if (cp <= rsq) {
          //Although not written in the paper, I assume that I have to add on the final solution
          precisionChange(mmp, psi_f);
          psi = psi + mmp;

          SolverTimer.Stop();

          // Recompute the true residual with the FieldD operator
          Linop_d.HermOpAndNorm(psi, mmp, d, qq);
          p = mmp - src;

          RealD srcnorm = sqrt(norm2(src));
          RealD resnorm = sqrt(norm2(p));
          RealD true_residual = resnorm / srcnorm;

          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
          std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
          std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
          std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;

          std::cout << GridLogMessage << "Time breakdown "<<std::endl;
          std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
          std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
          std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;

          IterationsToComplete = k;	
          ReliableUpdatesPerformed = l;

          if(DoFinalCleanup){
            //Do a final CG to cleanup
            std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
            ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
            CG.ErrorOnNoConverge = ErrorOnNoConverge;
            CG(Linop_d,src,psi);
            IterationsToCleanup = CG.IterationsToComplete;
          }
          else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
          return;
        }
        else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
                    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
          // Fold the FieldF correction into the FieldD solution ...
          precisionChange(mmp, psi_f);
          psi = psi + mmp;

          // ... and replace the iterated residual by the true FieldD one.
          Linop_d.HermOpAndNorm(psi, mmp, d, qq);
          r = src - mmp;

          psi_f = zero;
          precisionChange(r_f, r);
          cp = norm2(r);
          MaxResidSinceLastRelUp = cp;

          b = cp/c;

          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;

          l = l+1;
        }

        p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence

        if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
          Linop_f_use = Linop_fallback;
          using_fallback = true;
        }
      }
      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
                << std::endl;

      // NOTE(review): on this path the FieldF correction accumulated in psi_f since the
      // last reliable update is not added to psi - confirm whether that is intended.
      if (ErrorOnNoConverge) assert(0);
      IterationsToComplete = k;
      ReliableUpdatesPerformed = l;      
    }    
  };
 };
 #endif
@@ -0,0 +1,111 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateResidual.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_RESIDUAL_H
 #define GRID_CONJUGATE_RESIDUAL_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Base classes for iterative processes based on operators
    // single input vec, single output vec.
    /////////////////////////////////////////////////////////////
  /////////////////////////////////////////////////////////////
  // Conjugate residual solver.
  // Solves Linop psi = src starting from psi = 0 and stops once the
  // iterated squared residual drops below Tolerance^2 * |src|^2.
  // Asserts if no convergence is reached inside the iteration loop.
  /////////////////////////////////////////////////////////////
  template<class Field> 
    class ConjugateResidual : public OperatorFunction<Field> {
  public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
    int verbose;   // non-zero: log the residual on every iteration

    ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
      verbose=0;
    };

    // Linop : operator, applied through HermOpAndNorm / HermOp
    // src   : right hand side
    // psi   : output; any content on entry is discarded (the solve starts from zero)
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){

      RealD a, b;
      RealD cp, ssq,rsq;

      RealD rAr, rAAr, rArp;
      RealD pAp, pAAp;

      GridBase *grid = src._grid;
      psi=zero;

      ssq=norm2(src);

      // A vanishing source is solved exactly by psi = 0. Without this guard the first
      // step computes a = 0/0, the residual test can never pass and the solver would
      // fall through to the assert below.
      if (ssq == 0.0) {
        std::cout<<GridLogMessage<<"ConjugateResidual: source is zero, returning zero solution"<<std::endl;
        return;
      }

      Field r(grid),  p(grid), Ap(grid), Ar(grid);

      r=src;
      p=src;

      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
      Linop.HermOpAndNorm(r,Ar,rAr,rAAr);

      cp =norm2(r);
      rsq=Tolerance*Tolerance*ssq;

      if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;

      // NOTE(review): the bound is k<MaxIterations, i.e. at most MaxIterations-1 steps are taken.
      for(int k=1;k<MaxIterations;k++){

        a = rAr/pAAp;                     // step length

        axpy(psi,a,p,psi);                // psi += a p

        cp = axpy_norm(r,-a,Ap,r);        // r -= a Ap ; cp = |r|^2

        rArp=rAr;

        Linop.HermOpAndNorm(r,Ar,rAr,rAAr);

        b   =rAr/rArp;

        axpy(p,b,p,r);                    // p  = b p  + r
        pAAp=axpy_norm(Ap,b,Ap,Ar);       // Ap = b Ap + Ar ; pAAp = |Ap|^2

        if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;

        if(cp<rsq) {
          // Recompute the residual from scratch to report the true value
          Linop.HermOp(psi,Ap);
          axpy(r,-1.0,src,Ap);
          RealD true_resid = norm2(r)/ssq;
          std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
                   << " computed residual "<<sqrt(cp/ssq)
                   << " true residual "<<sqrt(true_resid)
                   << " target "       <<Tolerance <<std::endl;
          return;
        }

      }

      std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
      assert(0);
    }
  };
 }
 #endif
@@ -0,0 +1,104 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Deflation.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DEFLATION_H
 #define GRID_DEFLATION_H
 namespace Grid { 
 // Guesser that ignores the source and always proposes the zero field.
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = zero;
   }
 };
 // Guesser that proposes the source itself as the initial guess.
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = src;
   }
 };
 ////////////////////////////////
 // Fine grid deflation
 ////////////////////////////////
 // Builds the guess  sum_i evec[i] * <evec[i]|src> / eval[i]  from a set of
 // (eigenvector, eigenvalue) pairs held by reference.
 template<class Field>
 class DeflatedGuesser: public LinearFunction<Field> {
 private:
   const std::vector<Field> &evec;
   const std::vector<RealD> &eval;

 public:
   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
     : evec(_evec), eval(_eval)
   {}

   virtual void operator()(const Field &src,Field &guess)
   {
     guess = zero;
     assert(evec.size()==eval.size());
     auto nvec = evec.size();
     for (int n=0; n<nvec; n++) {
       const Field &mode = evec[n];
       // projection of the source on this mode, weighted by the inverse eigenvalue
       auto weight = TensorRemove(innerProduct(mode,src)) / eval[n];
       axpy(guess,weight,mode,guess);
     }
     // the result lives on the same checkerboard as the source
     guess.checkerboard = src.checkerboard;
   }
 };
 // Deflated guess built on a coarse grid:
 //   1. project the fine source onto the coarse space spanned by `subspace`,
 //   2. accumulate  sum_i evec_coarse[i] * <evec_coarse[i]|src_coarse> / eval_coarse[i],
 //   3. promote the coarse result back to the fine grid.
 // All three containers are held by reference and must outlive the guesser.
 template<class FineField, class CoarseField>
 class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
 private:
   const std::vector<FineField>   &subspace;
   const std::vector<CoarseField> &evec_coarse;
   const std::vector<RealD>       &eval_coarse;
 public:
   LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
                                 const std::vector<CoarseField> &_evec_coarse,
                                 const std::vector<RealD>       &_eval_coarse)
     : subspace(_subspace), 
       evec_coarse(_evec_coarse), 
       eval_coarse(_eval_coarse)  
   {
   }

   void operator()(const FineField &src,FineField &guess) { 
     // eval_coarse[i] is read for every coarse eigenvector, so it must be at least as long.
     assert(eval_coarse.size() >= evec_coarse.size());

     int N = (int)evec_coarse.size();

     // No eigenvectors: the deflated guess is zero. This also avoids reading
     // evec_coarse[0] on an empty vector to obtain the coarse grid.
     if (N == 0) {
       guess = zero;
       guess.checkerboard = src.checkerboard;
       return;
     }

     CoarseField src_coarse(evec_coarse[0]._grid);
     CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
     blockProject(src_coarse,src,subspace);    
     for (int i=0;i<N;i++) {
       const CoarseField & tmp = evec_coarse[i];
       axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
     }
     blockPromote(guess_coarse,guess,subspace);
     guess.checkerboard = src.checkerboard;
   };
 };
 }
 #endif
@@ -0,0 +1,842 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 Author: Christoph Lehner <clehner@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_BIRL_H
 #define GRID_BIRL_H
 #include <string.h> //memset
 //#include <zlib.h>
 #include <sys/stat.h>
 namespace Grid { 
  ////////////////////////////////////////////////////////
  // Move following 100 LOC to lattice/Lattice_basis.h
  ////////////////////////////////////////////////////////
 // Removes from w its components along basis[0] .. basis[k-1], one vector after the
 // other; each inner product is taken with the already updated w. w is not normalised.
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
   int n = 0;
   while ( n < k ) {
     auto overlap = innerProduct(basis[n],w);
     w = w - overlap*basis[n];
     ++n;
   }
 }
 // In-place rotation of the basis:
 //   basis[j] <- sum_{k in [k0,k1)} Qt(j,k) * basis[k]     for j in [j0,j1)
 // performed site by site. All new values of a site are first gathered in a scratch
 // buffer, so the sums always read the un-rotated vectors even where the j and k
 // ranges overlap. Nm is the scratch buffer length (indexed by j).
 template<class Field>
 void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
 {
   typedef typename Field::vector_object vobj;
   GridBase* grid = basis[0]._grid;

   parallel_region
   {
     std::vector < vobj , commAllocator<vobj> > rotated(Nm); // Thread private

     parallel_for_internal(int site=0;site < grid->oSites();site++){
       // gather the rotated values for this site
       for(int row=j0; row<j1; ++row){
         rotated[row]=0.;
         for(int col=k0; col<k1; ++col){
           rotated[row] +=Qt(row,col) * basis[col]._odata[site];
         }
       }
       // only now overwrite the basis
       for(int row=j0; row<j1; ++row){
         basis[row]._odata[site] = rotated[row];
       }
     }
   }
 }
 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0]._grid;
  result.checkerboard = basis[0].checkerboard;
  parallel_for(int ss=0;ss < grid->oSites();ss++){
    vobj B = zero;
    for(int k=k0; k<k1; ++k){
      B +=Qt(j,k) * basis[k]._odata[ss];
    }
    result._odata[ss] = B;
  }
 }
 template<class Field>
 void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
 {
  int vlen = idx.size();
  assert(vlen>=1);
  assert(vlen<=sort_vals.size());
  assert(vlen<=_v.size());
  for (size_t i=0;i<vlen;i++) {
    if (idx[i] != i) {
      //////////////////////////////////////
      // idx[i] is a table of desired sources giving a permutation.
      // Swap v[i] with v[idx[i]].
      // Find  j>i for which _vnew[j] = _vold[i],
      // track the move idx[j] => idx[i]
      // track the move idx[i] => i
      //////////////////////////////////////
      size_t j;
      for (j=i;j<idx.size();j++)
 	if (idx[j]==i)
 	  break;
      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
      std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
      std::swap(sort_vals[i],sort_vals[idx[i]]);
      idx[j] = idx[i];
      idx[i] = i;
    }
  }
 }
 // Returns the index permutation that orders sort_vals by increasing absolute value.
 // sort_vals itself is left untouched.
 inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
 {
   // start from the identity permutation 0,1,2,...
   std::vector<int> order(sort_vals.size());
   std::iota(order.begin(), order.end(), 0);

   // compare two indices through the magnitude of the values they point at
   auto smallerMagnitude = [&sort_vals](int lhs, int rhs) {
     return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
   };
   std::sort(order.begin(), order.end(), smallerMagnitude);

   return order;
 }
 // Sorts _v together with sort_vals by increasing |sort_vals|
 // (decreasing when reverse is set), moving the fields in place.
 template<class Field>
 void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
 {
   std::vector<int> order = basisSortGetIndex(sort_vals);

   if (reverse) {
     std::reverse(order.begin(), order.end());
   }

   basisReorderInPlace(_v,sort_vals,order);
 }
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
 // Interface used by ImplicitlyRestartedLanczos to judge eigenpair candidates.
 //   TestConvergence : called from the restart loop on a subset of the candidates
 //   ReconstructEval : called in the final, unconditional convergence pass
 // Arguments of both calls:
 //   j             index of the candidate
 //   resid         residual threshold (the eresid given to the Lanczos)
 //   evec          candidate eigenvector
 //   eval          candidate eigenvalue, passed by reference so an implementation may refine it
 //   evalMaxApprox estimate of the largest eigenvalue, available for normalisation
 // Callers treat a non-zero return value as "converged".
 template<class Field> class ImplicitlyRestartedLanczosTester 
 {
 public:
   // Polymorphic base: a virtual destructor makes destruction through a base pointer well defined.
   virtual ~ImplicitlyRestartedLanczosTester() {}
   virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
   virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
 };
 // Selects the diagonalisation method used by ImplicitlyRestartedLanczos.
 // calc() reports the choice in its log as "DSTEGR", "QR" or "Eigen";
 // IRLdiagonaliseWithEigen is the default of the constructors.
 enum IRLdiagonalisation { 
   IRLdiagonaliseWithDSTEGR,
   IRLdiagonaliseWithQR,
   IRLdiagonaliseWithEigen
 };
 // Default tester: judges a candidate by applying _HermOp to it.
 // The eigenvalue is replaced by the Rayleigh quotient <B|H|B>/<B|B> and the
 // candidate counts as converged when
 //     |H B - eval B|^2 / evalMaxApprox^2  <  eresid^2 .
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:

   LinearFunction<Field>       &_HermOp;
   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };

   // Same computation as TestConvergence.
   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
   {
     return TestConvergence(j,resid,B,eval,evalMaxApprox);
   }

   // j             : index, used for logging only
   // eresid        : residual threshold
   // B             : candidate eigenvector (not modified here)
   // eval          : in  - current estimate (logged in brackets)
   //                 out - Rayleigh quotient of B under _HermOp
   // evalMaxApprox : normalisation of the residual
   // returns 1 when converged, 0 otherwise
   int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
   {
     Field v(B);
     RealD eval_poly = eval;

     // Apply operator
     _HermOp(B,v);

     // (the former unused "vv0 = norm2(v)" is gone: it cost a lattice reduction per call)
     RealD vnum = real(innerProduct(B,v)); // HermOp.
     RealD vden = norm2(B);
     eval   = vnum/vden;

     // residual vector  H B - eval B
     v -= eval*B;

     RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

     // NOTE(review): this changes the precision of std::cout for all later output as well.
     std::cout.precision(13);
     std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
              <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
              <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
              <<std::endl;

     int conv=0;
     if( (vv<eresid*eresid) ) conv = 1;

     return conv;
   }
 };
 template<class Field> 
 class ImplicitlyRestartedLanczos {
 private:
  const RealD small = 1.0e-8;
  int MaxIter;
  int MinRestart; // Minimum number of restarts; only check for convergence after
  int Nstop;   // Number of evecs checked for convergence
  int Nk;      // Number of converged sought
  //  int Np;      // Np -- Number of spare vecs in krylov space //  == Nm - Nk
  int Nm;      // Nm -- total number of vectors
  IRLdiagonalisation diagonalisation;
  int orth_period;
  RealD OrthoTime;
  RealD eresid, betastp;
  ////////////////////////////////
  // Embedded objects
  ////////////////////////////////
  LinearFunction<Field>       &_PolyOp;
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosTester<Field> &_Tester;
  // Default tester provided (we need a ref to something in default case)
  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
  /////////////////////////
  // Constructor
  /////////////////////////
 public:       
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
  // Too many options  & knobs. 
  // Eliminate:
  //   orth_period
  //   betastp
  //   MinRestart
  //
  // Do we really need orth_period
  // What is the theoretical basis & guarantees of betastp ?
  // Nstop=Nk viable?
  // MinRestart avoidable with new convergence test?
  // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
  // HermOp could be eliminated if we dropped the Power method for max eval.
  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
  //////////////////////////////////////////////////////////////////
 // Constructor taking a user supplied convergence tester.
 // Members are initialised in their declaration order, and OrthoTime starts at zero so
 // that orthogonalize() accumulates into a defined value even when used before calc().
 ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
                            LinearFunction<Field> & HermOp,
                            ImplicitlyRestartedLanczosTester<Field> & Tester,
                            int _Nstop, // number of evecs checked for convergence
                            int _Nk, // number of converged vectors sought
                            int _Nm, // total number of vectors
                            RealD _eresid, // residual threshold handed to the tester
                            int _MaxIter, // Max iterations
                            RealD _betastp=0.0, // if beta(k) < betastp: converged
                            int _MinRestart=1, int _orth_period = 1,
                            IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
   MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
   Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
   diagonalisation(_diagonalisation), orth_period(_orth_period),
   OrthoTime(0.),
   eresid(_eresid),      betastp(_betastp),
   _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester),
   SimpleTester(HermOp)  { };
    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
 			       LinearFunction<Field> & HermOp,
 			       int _Nstop, // sought vecs
 			       int _Nk, // sought vecs
 			       int _Nm, // spare vecs
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _MaxIter, // Max iterations
 			       RealD _betastp=0.0, // if beta(k) < betastp: converged
 			       int _MinRestart=1, int _orth_period = 1,
 			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp),  _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
    eresid(_eresid),      betastp(_betastp),
    MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
    orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
  ////////////////////////////////
  // Helpers
  ////////////////////////////////
  // Scales v to unit norm in place and returns the norm it had before scaling.
  template<typename T>  static RealD normalise(T& v) 
  {
    RealD length = norm2(v);      // squared norm
    length = sqrt(length);
    const RealD scale = 1.0/length;
    v = v * scale;
    return length;
  }
  // Orthogonalises w against evec[0] .. evec[k-1], then normalises it.
  // The time spent (in seconds) is accumulated in OrthoTime.
  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
  {
    OrthoTime = OrthoTime - usecond()/1e6;   // subtract the start time ...

    basisOrthogonalize(evec,w,k);
    normalise(w);

    OrthoTime = OrthoTime + usecond()/1e6;   // ... add the stop time
  }
 /* Rudy Arthur's thesis pp.137
 ------------------------
 Require: M > K, P = M - K
 Compute the factorization  A V_M = V_M H_M + f_M e_M^dagger
 repeat
   Q = I
   for i = 1,...,P do
     Q_i R_i = H_M - theta_i I
     Q       = Q Q_i
     H_M     = Q_i^dagger H_M Q_i
   end for
   beta_K  = H_M(K+1,K)
   sigma_K = Q(M,K)
   r   = v_{K+1} beta_K + r sigma_K
   V_K = V_M(1:M) Q(1:M,1:K)
   H_K = H_M(1:K,1:K)
   -> A V_K = V_K H_K + f_K e_K^dagger
   Extend to an M = K + P step factorization  A V_M = V_M H_M + f_M e_M^dagger
 until convergence
 */
  void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
  {
    GridBase *grid = src._grid;
    assert(grid == evec[0]._grid);
    GridLogIRL.TimingMode(1);
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
    }
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    assert(Nm <= evec.size() && Nm <= eval.size());
    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
    RealD evalMaxApprox = 0.0;
    {
      auto src_n = src;
      auto tmp = src;
      const int _MAX_ITER_IRL_MEVAPP_ = 50;
      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
 	normalise(src_n);
 	_HermOp(src_n,tmp);
 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
 	RealD na = vnum/vden;
 	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
 	src_n = tmp;
      }
    }
    std::vector<RealD> lme(Nm);  
    std::vector<RealD> lme2(Nm);
    std::vector<RealD> eval2(Nm);
    std::vector<RealD> eval2_copy(Nm);
    Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
    Field f(grid);
    Field v(grid);
    int k1 = 1;
    int k2 = Nk;
    RealD beta_k;
    Nconv = 0;
    // Set initial vector
    evec[0] = src;
    normalise(evec[0]);
    // Initial Nk steps
    OrthoTime=0.;
    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
    //////////////////////////////////
    // Restarting loop begins
    //////////////////////////////////
    int iter;
    for(iter = 0; iter<MaxIter; ++iter){
      OrthoTime=0.;
      std::cout<< GridLogMessage <<" **********************"<< std::endl;
      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
      std::cout<< GridLogMessage <<" **********************"<< std::endl;
      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
      f *= lme[Nm-1];
      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
      //////////////////////////////////
      // getting eigenvalues
      //////////////////////////////////
      for(int k=0; k<Nm; ++k){
 	eval2[k] = eval[k+k1-1];
 	lme2[k] = lme[k+k1-1];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
      //////////////////////////////////
      // sorting
      //////////////////////////////////
      eval2_copy = eval2;
      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
      const int chunk=8;
      for(int io=0; io<k2;io+=chunk){
 	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
 	for(int ii=0;ii<chunk;ii++){
 	  if ( (io+ii)<k2 )
 	    std::cout<< " "<< std::setw(12)<< eval2[io+ii];
 	}
 	std::cout << std::endl;
      }
      //////////////////////////////////
      // Implicitly shifted QR transformations
      //////////////////////////////////
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      for(int ip=k2; ip<Nm; ++ip){ 
 	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
      }
      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
      assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
      std::cout<<GridLogIRL <<"basisRotated  by Qt"<<std::endl;
      ////////////////////////////////////////////////////
      // Compressed vector f and beta(k2)
      ////////////////////////////////////////////////////
      f *= Qt(k2-1,Nm-1);
      f += lme[k2-1] * evec[k2];
      beta_k = norm2(f);
      beta_k = sqrt(beta_k);
      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
      RealD betar = 1.0/beta_k;
      evec[k2] = betar * f;
      lme[k2-1] = beta_k;
      ////////////////////////////////////////////////////
      // Convergence test
      ////////////////////////////////////////////////////
      for(int k=0; k<Nm; ++k){    
 	eval2[k] = eval[k];
 	lme2[k] = lme[k];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
      Nconv = 0;
      if (iter >= MinRestart) {
 	std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
 	Field B(grid); B.checkerboard = evec[0].checkerboard;
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
 	int allconv =1;
 	for(int jj = 1; jj<=Nstop; jj*=2){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
 	    allconv=0;
 	  }
 	}
 	// Do evec[0] for good measure
 	{ 
 	  int j=0;
 	  RealD e = eval2_copy[0]; 
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
 	}
 	if ( allconv ) Nconv = Nstop;
 	// test if we converged, if so, terminate
 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
 	//	if( Nconv>=Nstop || beta_k < betastp){
 	if( Nconv>=Nstop){
 	  goto converged;
 	}
      } else {
 	std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
      } // end of iter loop
    }
    std::cout<<GridLogError<<"\n NOT converged.\n";
    abort();
  converged:
    {
      Field B(grid); B.checkerboard = evec[0].checkerboard;
      basisRotate(evec,Qt,0,Nk,0,Nk,Nm);	    
      std::cout << GridLogIRL << " Rotated basis"<<std::endl;
      Nconv=0;
      //////////////////////////////////////////////////////////////////////
      // Full final convergence test; unconditionally applied
      //////////////////////////////////////////////////////////////////////
      for(int j = 0; j<=Nk; j++){
 	B=evec[j];
 	if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
 	  Nconv++;
 	}
      }
      if ( Nconv < Nstop )
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
      eval=eval2;
      //Keep only converged
      eval.resize(Nconv);// Nstop?
      evec.resize(Nconv,grid);// Nstop?
      basisSortInPlace(evec,eval,reverse);
    }
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL << " -- Iterations  = "<< iter   << "\n";
    std::cout << GridLogIRL << " -- beta(k)     = "<< beta_k << "\n";
    std::cout << GridLogIRL << " -- Nconv       = "<< Nconv  << "\n";
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
  }
 private:
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
 2. For k = 1,2,...,m Do:
 3. wk:=Avk−βkv_{k−1}      
 4. αk:=(wk,vk)       // 
 5. wk:=wk−αkvk       // wk orthog vk 
 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 7. vk+1 := wk/βk+1
 8. EndDo
 */
  void step(std::vector<RealD>& lmd,
 	    std::vector<RealD>& lme, 
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
    const RealD tiny = 1.0e-20;
    assert( k< Nm );
    GridStopWatch gsw_op,gsw_o;
    Field& evec_k = evec[k];
    _PolyOp(evec_k,w);    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
    if(k>0) w -= lme[k-1] * evec[k-1];
    ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
    RealD     alph = real(zalph);
    w = w - alph * evec_k;// 5. wk:=wk−αkvk
    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
    // 7. vk+1 := wk/βk+1
    lmd[k] = alph;
    lme[k] = beta;
    if (k>0 && k % orth_period == 0) {
      orthogonalize(w,evec,k); // orthonormalise
      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
    }
    if(k < Nm-1) evec[k+1] = w;
    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
  }
  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
 			 int Nk, int Nm,  
 			 Eigen::MatrixXd & Qt, // Nm x Nm
 			 GridBase *grid)
  {
    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
    for (int i = 0; i < Nk; i++) {
      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
    }
    for (int i = 0; i < Nk; i++) {
      for (int j = 0; j < Nk; j++) {
 	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
      }
    }
  }
  ///////////////////////////////////////////////////////////////////////////
  // File could end here if settle on Eigen ??? !!!
  ///////////////////////////////////////////////////////////////////////////
  // One implicitly shifted QR sweep on a symmetric tridiagonal matrix,
  // implemented as a chain of Givens rotations.
  //
  //   lmd   : diagonal entries, updated in place.
  //   lme   : off-diagonal entries, updated in place.
  //   Nk    : number of columns of Qt that each rotation is applied to.
  //   Nm    : not referenced in the body.
  //   Qt    : accumulates the rotations; rows k and k+1 are mixed at each step.
  //   Dsh   : the shift.
  //   kmin  : 1-based first row of the active block (k = kmin-1 is used as index).
  //   kmax  : the sweep touches indices up to kmax-1.
  //
  // Reads lmd[kmin] and lme[kmin], so callers must keep kmin within bounds.
  void QR_decomp(std::vector<RealD>& lmd,   // Nm 
 		 std::vector<RealD>& lme,   // Nm 
 		 int Nk, int Nm,            // Nk, Nm
 		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
 		 RealD Dsh, int kmin, int kmax)
  {
    int k = kmin-1;
    RealD x;   // the "bulge": the out-of-band element chased down the matrix
    // First rotation is built from the shifted leading column (lmd[k]-Dsh, lme[k])
    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
    RealD c = ( lmd[k] -Dsh) *Fden;
    RealD s = -lme[k] *Fden;
    RealD tmpa1 = lmd[k];
    RealD tmpa2 = lmd[k+1];
    RealD tmpb  = lme[k];
    // Similarity transform of the 2x2 block (k,k+1) by the rotation (c,s)
    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
    x        =-s*lme[k+1];
    lme[k+1] = c*lme[k+1];
    // Apply the same rotation to rows k, k+1 of Qt (columns 0..Nk-1)
    for(int i=0; i<Nk; ++i){
      RealD Qtmp1 = Qt(k,i);
      RealD Qtmp2 = Qt(k+1,i);
      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
    }
    // Givens transformations
    // Each rotation is chosen from (lme[k-1], x) so that it annihilates the bulge x.
    for(int k = kmin; k < kmax-1; ++k){
      RealD Fden = 1.0/hypot(x,lme[k-1]);
      RealD c = lme[k-1]*Fden;
      RealD s = - x*Fden;
      RealD tmpa1 = lmd[k];
      RealD tmpa2 = lmd[k+1];
      RealD tmpb  = lme[k];
      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
      lme[k-1] = c*lme[k-1] -s*x;
      // On the last rotation there is no row below, hence no new bulge
      if(k != kmax-2){
 	x = -s*lme[k+1];
 	lme[k+1] = c*lme[k+1];
      }
      for(int i=0; i<Nk; ++i){
 	RealD Qtmp1 = Qt(k,i);
 	RealD Qtmp2 = Qt(k+1,i);
 	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
 	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
      }
    }
  }
  // Diagonalise the tridiagonal matrix (lmd, lme) with the backend selected
  // by the `diagonalisation` member.
  //
  // Qt is reset to the Nm x Nm identity before the backend runs, so any
  // previous contents are discarded. An unrecognised selector trips assert(0).
  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme,
                   int Nk, int Nm,
                   Eigen::MatrixXd & Qt,
                   GridBase *grid)
  {
    Qt = Eigen::MatrixXd::Identity(Nm,Nm);

    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
      return;
    }
    if ( diagonalisation == IRLdiagonaliseWithQR ) {
      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
      return;
    }
    if ( diagonalisation == IRLdiagonaliseWithEigen ) {
      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
      return;
    }

    // No backend matched
    assert(0);
  }
 #ifdef USE_LAPACK
 void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                   double *vl, double *vu, int *il, int *iu, double *abstol,
                   int *m, double *w, double *z, int *ldz, int *isuppz,
                   double *work, int *lwork, int *iwork, int *liwork,
                   int *info);
 #endif
 // Diagonalise the leading Nk x Nk symmetric tridiagonal matrix with LAPACK
 // dstegr, splitting the eigenpair index range across processors.
 //
 //   lmd  : in  - diagonal entries [0,Nk);  out - eigenvalues in the reverse
 //          of LAPACK's (increasing) order.
 //   lme  : in  - off-diagonal entries [0,Nk-1); not modified.
 //   Qt   : out - row (Nk-1-i) receives eigenvector i.
 //   Nm   : unused here; kept for a uniform diagonaliser signature.
 //   grid : supplies _Nprocessors/_processor for the work split and
 //          GlobalSumVector for gathering the results.
 //
 // Each processor asks for the index window [il,iu], moves its results to
 // their global positions (zeroing the rest), and a global sum assembles
 // the full spectrum on every processor.
 //
 // Without USE_LAPACK this routine trips assert(0).
 void diagonalize_lapack(std::vector<RealD>& lmd,
                         std::vector<RealD>& lme,
                         int Nk, int Nm,
                         Eigen::MatrixXd& Qt,
                         GridBase *grid)
 {
 #ifdef USE_LAPACK
  int NN = Nk;

  // Heap storage instead of variable-length arrays: VLAs are not standard
  // C++ and an NN x NN matrix of doubles can easily overflow the stack.
  std::vector<double> evals_tmp(NN, 0.0);
  std::vector<double> evec_tmp((size_t)NN*NN, 0.0); // eigenvector i occupies [i*NN, (i+1)*NN)
  std::vector<double> DD(NN, 0.0);
  std::vector<double> EE(NN, 0.0);
  for (int i = 0; i < NN; i++)   DD[i] = lmd[i];
  for (int i = 0; i+1 < NN; i++) EE[i] = lme[i];

  int evals_found;
  int lwork  = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
  int liwork =  3+NN*10 ;
  std::vector<int>    iwork(liwork);
  std::vector<double> work(lwork);
  std::vector<int>    isuppz(2*NN);
  char jobz  = 'V'; // calculate evals & evecs
  char range = 'I'; // only the il-th through iu-th eigenvalues
  int info = 0;

  // Split the index range [1,NN] into contiguous windows, one per processor
  int total = grid->_Nprocessors;
  int node  = grid->_processor;
  int interval = (NN/total)+1;
  double vl = 0.0, vu = 0.0;
  int il = interval*node+1 , iu = interval*(node+1);
  if (iu > NN)  iu=NN;
  double tol = 0.0;

  if ( il <= NN){
    LAPACK_dstegr(&jobz, &range, &NN,
                  DD.data(), EE.data(),
                  &vl, &vu, &il, &iu, // vl,vu are ignored for range 'I'; all four are ignored for 'A'
                  &tol, // tolerance
                  &evals_found, evals_tmp.data(), evec_tmp.data(), &NN,
                  isuppz.data(),
                  work.data(), &lwork, iwork.data(), &liwork,
                  &info);
    // A non-zero info means the outputs are not valid eigenpairs
    if ( info != 0 ) {
      std::cout << GridLogError << "LAPACK dstegr failed with info = "<<info<<"\n";
    }
    assert(info == 0);

    // LAPACK packs this window's results at the front; shift them to their
    // global slots, walking backwards so overlapping ranges are not clobbered.
    const int shift = il-1;
    for (int i = iu-1; i>= il-1; i--){
      evals_tmp[i] = evals_tmp[i - shift];
      if (il>1) evals_tmp[i - shift]=0.;
      for (int j = 0; j< NN; j++){
        evec_tmp[(size_t)i*NN + j] = evec_tmp[(size_t)(i - shift)*NN + j];
        if (il>1) evec_tmp[(size_t)(i - shift)*NN + j]=0.;
      }
    }
  }

  // Every processor contributes its window; all others hold zeros there
  grid->GlobalSumVector(evals_tmp.data(),NN);
  grid->GlobalSumVector(evec_tmp.data(),NN*NN);

  // Safer to sort instead of just reversing it,
  // but the document of the routine says evals are sorted in increasing order.
  // qr gives evals in decreasing order.
  for(int i=0;i<NN;i++){
    lmd [NN-1-i]=evals_tmp[i];
    for(int j=0;j<NN;j++){
      Qt((NN-1-i),j)=evec_tmp[(size_t)i*NN + j];
    }
  }
 #else
  assert(0);
 #endif
 }
 void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
 		    int Nk, int Nm,   
 		    Eigen::MatrixXd & Qt,
 		    GridBase *grid)
 {
  int QRiter = 100*Nm;
  int kmin = 1;
  int kmax = Nk;
  // (this should be more sophisticated)
  for(int iter=0; iter<QRiter; ++iter){
    // determination of 2x2 leading submatrix
    RealD dsub = lmd[kmax-1]-lmd[kmax-2];
    RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
    RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
    // (Dsh: shift)
    // transformation
    QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
    // Convergence criterion (redef of kmin and kamx)
    for(int j=kmax-1; j>= kmin; --j){
      RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
      if(fabs(lme[j-1])+dds > dds){
 	kmax = j+1;
 	goto continued;
      }
    }
    QRiter = iter;
    return;
  continued:
    for(int j=0; j<kmax-1; ++j){
      RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
      if(fabs(lme[j])+dds > dds){
 	kmin = j+1;
 	break;
      }
    }
  }
  std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
  abort();
 }
 };
 }
 #endif
@@ -0,0 +1,406 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
    Copyright (C) 2015
 Author: Christoph Lehner <clehner@bnl.gov>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
 namespace Grid { 
 // Serializable parameter set for a single Lanczos run.
 // The members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS as
 // (type, name) pairs; the per-field notes are the inline comments below.
 // NOTE(review): the names line up with the calcFine/calcCoarse arguments
 // of LocalCoherenceLanczos (MinRes, MaxIt, betastp, ...) -- presumably that
 // is where they are consumed; confirm against the callers.
 struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
 				  ChebyParams, Cheby,/*Chebyshev*/
 				  int, Nstop,    /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
 				  int, Nk,       /*Vecs in Lanczos seek converge*/
 				  int, Nm,       /*Total vecs in Lanczos include restart*/
 				  RealD, resid,  /*residual*/
 				  int, MaxIt,    /*iteration limit -- presumably max restarts; TODO confirm*/
 				  RealD, betastp,  /* ? */
 				  int, MinRes);    // Must restart
 };
 // Serializable top-level parameter set for a local-coherence Lanczos job.
 // Holds one LanczosParams for the fine level and one for the coarse level,
 // a Chebyshev smoother description, and a set of run switches.
 // NOTE(review): the meaning of the individual switches (doFine, doFineRead,
 // ...) and of config/omega/mass/M5 is decided by the driver program, which
 // is not visible here; document them there or confirm before relying on
 // the names alone.
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
 				  bool, saveEvecs,
 				  bool, doFine,
 				  bool, doFineRead,
 				  bool, doCoarse,
 	       			  bool, doCoarseRead,
 				  LanczosParams, FineParams,
 				  LanczosParams, CoarseParams,
 				  ChebyParams,   Smoother,
 				  RealD        , coarse_relax_tol,
 				  std::vector<int>, blockSize,
 				  std::string, config,
 				  std::vector < std::complex<double>  >, omega,
 				  RealD, mass,
 				  RealD, M5);
 };
 // Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
 // Coarse-space operator: promote a coarse vector to the fine grid through
 // `subspace`, apply the fine operator's HermOp there, project back.
 // Holds references only; the operator and subspace must outlive this object.
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;

  LinearOperatorBase<FineField> &_Linop;
  std::vector<FineField>        &subspace;

  // The subspace must already be populated when the operator is built.
  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace)
    : _Linop(linop), subspace(_subspace)
  {
    assert(subspace.size() >0);
  }

  // out = P^dag HermOp P in, with P the block promotion defined by `subspace`.
  void operator()(const CoarseField& in, CoarseField& out) {
    // Grid and checkerboard of the temporaries are taken from the first basis vector
    GridBase *fine_grid = subspace[0]._grid;
    int       cb        = subspace[0].checkerboard;

    FineField fine_in (fine_grid);
    FineField fine_out(fine_grid);
    fine_in.checkerboard  = cb;
    fine_out.checkerboard = cb;

    blockPromote(in,fine_in,subspace);
    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;

    _Linop.HermOp(fine_in,fine_out);
    std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;

    blockProject(out,fine_out,subspace);
    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
  }
 };
 // Coarse-space operator: promote a coarse vector to the fine grid through
 // `subspace`, apply the operator function `poly` of the fine operator
 // there, and project the result back to the coarse grid.
 // Holds references only; poly, linop and subspace must outlive this object.
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;
  OperatorFunction<FineField>   & _poly;
  LinearOperatorBase<FineField> &_Linop;
  std::vector<FineField>        &subspace;
  // The subspace may still be empty at construction time; it is only
  // required to be populated when operator() is invoked.
  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
                          LinearOperatorBase<FineField>& linop,
                          std::vector<FineField> & _subspace) :
    _poly(poly),
    _Linop(linop),
    subspace(_subspace)
  {  };
  // out = P^dag poly(Linop) P in, with P the block promotion defined by `subspace`.
  void operator()(const CoarseField& in, CoarseField& out) {
    // subspace[0] supplies the grid and checkerboard below; an empty
    // subspace would be an out-of-bounds read.
    assert(subspace.size() >0);
    GridBase *FineGrid = subspace[0]._grid;
    int   checkerboard = subspace[0].checkerboard;
    FineField fin (FineGrid); fin.checkerboard =checkerboard;
    FineField fout(FineGrid);fout.checkerboard =checkerboard;
    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
  }
 };
 // Convergence tester for a coarse-grid Lanczos.
 //
 //  - TestConvergence : cheap test carried out entirely on the coarse grid
 //                      using the coarse operator `_Poly`.
 //  - ReconstructEval : promotes the coarse vector to the fine grid, applies
 //                      the smoother, and measures the residual of the fine
 //                      operator's HermOp on the smoothed vector.
 //
 // Both return 1 when the squared residual, normalised by evalMaxApprox^2,
 // is below eresid^2, and 0 otherwise; both overwrite `eval` with the
 // Rayleigh quotient they computed. Holds references only.
 template<class Fobj,class CComplex,int nbasis>
 class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
 {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;
  LinearFunction<CoarseField> & _Poly;
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
  RealD                          _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
  // Initialisers are listed in member declaration order, which is the order
  // in which C++ actually runs them.
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
                                           OperatorFunction<FineField>   &smoother,
                                           LinearOperatorBase<FineField> &Linop,
                                           std::vector<FineField>        &subspace,
                                           RealD coarse_relax_tol=5.0e3)
    : _Poly(Poly), _smoother(smoother), _Linop(Linop),
      _coarse_relax_tol(coarse_relax_tol), _subspace(subspace)
  {    };
  // Coarse-grid test of vector B.
  //   j             : index, used for logging only.
  //   eresid        : residual tolerance (compared squared).
  //   eval          : in - value shown in the log in parentheses;
  //                   out - Rayleigh quotient (B, Poly B)/(B,B).
  //   evalMaxApprox : normalisation of the residual.
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval;
    // Apply operator
    _Poly(B,v);
    RealD vnum = real(innerProduct(B,v)); // HermOp.
    RealD vden = norm2(B);
    eval   = vnum/vden;
    v -= eval*B;
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
             <<std::endl;
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
  // Fine-grid test of coarse vector B: promote, smooth, then measure the
  // residual of HermOp on the smoothed fine vector.
  // For j > nbasis the tolerance is loosened by _coarse_relax_tol.
  // NOTE(review): the relaxation starts at j > nbasis, whereas
  // LocalCoherenceLanczos::testCoarse switches at k >= nbasis -- confirm
  // which boundary is intended.
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    GridBase *FineGrid = _subspace[0]._grid;
    int checkerboard   = _subspace[0].checkerboard;
    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;
    blockPromote(B,fv,_subspace);
    _smoother(_Linop,fv,fB);
    RealD eval_poly = eval;
    _Linop.HermOp(fB,fv);
    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
    RealD vden = norm2(fB);
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
             <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
 };
 ////////////////////////////////////////////
 // Make serializable Lanczos params
 ////////////////////////////////////////////
 // Two-level (fine + coarse) Lanczos driver.
 //
 // calcFine  runs an implicitly restarted Lanczos on the fine operator and
 //           keeps the first nbasis eigenpairs as the block subspace.
 // calcCoarse runs a second Lanczos on the operator projected into that
 //           subspace, with a smoothed convergence tester.
 // testFine / testCoarse re-check stored eigenpairs and assert on failure.
 //
 // Storage for eigenvalues/eigenvectors is either owned by the object
 // (first constructor) or supplied by the caller (second constructor); in
 // both cases the code works through the reference members.
 template<class Fobj,class CComplex,int nbasis>
 class LocalCoherenceLanczos 
 {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<Fobj>                       FineField;
 protected:
  GridBase *_CoarseGrid;
  GridBase *_FineGrid;
  int _checkerboard;
  LinearOperatorBase<FineField>                 & _FineOp;
  // Working views: bound either to the private members below or to
  // caller-supplied storage.
  std::vector<RealD>                              &evals_fine;
  std::vector<RealD>                              &evals_coarse; 
  std::vector<FineField>                          &subspace;
  std::vector<CoarseField>                        &evec_coarse;
 private:
  // Internal storage, used only by the first constructor.
  std::vector<RealD>                              _evals_fine;
  std::vector<RealD>                              _evals_coarse; 
  std::vector<FineField>                          _subspace;
  std::vector<CoarseField>                        _evec_coarse;
 public:
  // Constructor using internally owned storage.
  LocalCoherenceLanczos(GridBase *FineGrid,
                        GridBase *CoarseGrid,
                        LinearOperatorBase<FineField> &FineOp,
                        int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _FineOp(FineOp),
    _checkerboard(checkerboard),
    evals_fine  (_evals_fine),
    evals_coarse(_evals_coarse),
    subspace    (_subspace),
    evec_coarse(_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  //////////////////////////////////////////////////////////////////////////
  // Alternate constructor, external storage for use by Hadrons module
  //////////////////////////////////////////////////////////////////////////
  LocalCoherenceLanczos(GridBase *FineGrid,
                        GridBase *CoarseGrid,
                        LinearOperatorBase<FineField> &FineOp,
                        int checkerboard,
                        std::vector<FineField>   &ext_subspace,
                        std::vector<CoarseField> &ext_coarse,
                        std::vector<RealD>       &ext_eval_fine,
                        std::vector<RealD>       &ext_eval_coarse
                        ) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _FineOp(FineOp),
    _checkerboard(checkerboard),
    evals_fine  (ext_eval_fine), 
    evals_coarse(ext_eval_coarse),
    subspace    (ext_subspace),
    evec_coarse (ext_coarse)
  {
    // Note: this empties the caller's eigenvalue vectors.
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  // Block-orthogonalise the subspace; two passes are made.
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
  };
  // Scale v to unit 2-norm; returns the norm v had on entry.
  template<typename T>  static RealD normalise(T& v) 
  {
    RealD nn = norm2(v);
    nn = ::sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }
  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
    subspace.resize(Nk,_FineGrid);
    subspace[0]=1.0;
    subspace[0].checkerboard=_checkerboard;
    normalise(subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
      subspace[k].checkerboard=_checkerboard;
      Op(subspace[k-1],subspace[k]);
      normalise(subspace[k]);
    }
  }
  */
  // Re-test every stored fine eigenpair against tolerance `resid`.
  // Requires exactly nbasis fine eigenpairs. ReconstructEval also refreshes
  // evals_fine[k], so it is called outside assert() to keep that side
  // effect when NDEBUG is defined.
  void testFine(RealD resid) 
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
      int ok = SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0);
      assert(ok==1);
      (void)ok; // silence unused-variable warnings in NDEBUG builds
    }
  }
  // Re-test every stored coarse eigenpair with the smoothed tester.
  // Vectors with index >= nbasis are tested against resid*relax.
  // NOTE(review): the tester is also constructed with `relax` and applies
  // it again internally for j > nbasis, so those vectors are effectively
  // tested against resid*relax*relax -- confirm this is intended.
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
    for(int k=0;k<evec_coarse.size();k++){
      // Called outside assert(): ReconstructEval updates evals_coarse[k] and
      // must still run when NDEBUG is defined.
      const RealD tol = ( k < nbasis ) ? resid : resid*relax;
      int ok = ChebySmoothTester.ReconstructEval(k,tol,evec_coarse[k],evals_coarse[k],1.0);
      assert(ok==1);
      (void)ok; // silence unused-variable warnings in NDEBUG builds
    }
  }
  // Fine-grid Lanczos. On return evals_fine and subspace hold exactly nbasis
  // entries. Requires nbasis <= Nm, Nstop >= nbasis and at least nbasis
  // converged vectors.
  // NOTE(review): MaxIt is declared RealD here although LanczosParams stores
  // it as int; left unchanged to preserve the signature.
  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
                RealD MaxIt, RealD betastp, int MinRes)
  {
    assert(nbasis<=Nm);
    Chebyshev<FineField>      Cheby(cheby_parms);
    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
    PlainHermOp<FineField>    Op(_FineOp);
    evals_fine.resize(Nm);
    subspace.resize(Nm,_FineGrid);
    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
    int Nconv;
    IRL.calc(evals_fine,subspace,src,Nconv,false);
    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  // Coarse-grid Lanczos on the operator projected into `subspace`.
  // On return evals_coarse and evec_coarse hold exactly Nstop entries;
  // requires at least Nstop converged vectors.
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
                  int Nstop, int Nk, int Nm,RealD resid, 
                  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
    assert(Nconv>=Nstop);
    evals_coarse.resize(Nstop);
    evec_coarse.resize (Nstop,_CoarseGrid);
    for (int i=0;i<Nstop;i++){
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
 };
 }
 #endif
@@ -0,0 +1,60 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/NormalEquations.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_NORMAL_EQUATIONS_H
 #define GRID_NORMAL_EQUATIONS_H
 namespace Grid {
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form an NE solver calling a Herm solver
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Normal-equations wrapper: to solve M out = in, form rhs = Mdag in and
  // hand rhs to a Hermitian solver, which is expected to solve
  // Mdag M out = rhs. Holds references only.
  //
  // NOTE(review): _HermitianSolver is an OperatorFunction<Field>, and the
  // other OperatorFunction subclasses in this code base are invoked with
  // three arguments (operator, in, out). The two-argument call below looks
  // inconsistent with that and may not compile once this template is
  // instantiated -- confirm against the OperatorFunction declaration.
  template<class Field> class NormalEquations : public OperatorFunction<Field>{
  private:
    SparseMatrixBase<Field> & _Matrix;
    OperatorFunction<Field> & _HermitianSolver;
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations trick
    /////////////////////////////////////////////////////
    NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
      : _Matrix(Matrix),
        _HermitianSolver(HermitianSolver)
    {}

    void operator() (const Field &in, Field &out){
      // Right-hand side of the normal equations, on the same grid as `in`
      Field rhs(in._grid);
      _Matrix.Mdag(in,rhs);
      // Mdag M out = Mdag in
      _HermitianSolver(rhs,out);
    }
  };
 }
 #endif
@@ -0,0 +1,119 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/PrecConjugateResidual.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
 #define GRID_PREC_CONJUGATE_RESIDUAL_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Base classes for iterative processes based on operators
    // single input vec, single output vec.
    /////////////////////////////////////////////////////////////
  // Preconditioned conjugate-residual solver.
  //
  // operator()(Linop, src, psi) starts from psi = 0 and iterates at most
  // MaxIterations times. It returns as soon as the tracked squared residual
  // `cp` drops below Tolerance^2 * |src|^2; before returning it recomputes
  // the true residual from HermOp(psi) - src for the log. If the loop ends
  // without convergence it logs a message and trips assert(0).
  //
  // NOTE(review): axpy(ret,a,x,y) / axpy_norm(ret,a,x,y) are presumably
  // "ret = a*x + y" (the latter also returning |ret|^2) -- their
  // definitions are outside this block; confirm.
  template<class Field> 
    class PrecConjugateResidual : public OperatorFunction<Field> {
  public:                                                
    RealD   Tolerance;        // relative tolerance; compared squared against |r|^2/|src|^2
    Integer MaxIterations;    // upper bound on loop iterations
    int verbose;              // non-zero: log the residual every iteration
    LinearFunction<Field> &Preconditioner;   // held by reference; must outlive the solver
    PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec)
    { 
      verbose=1;
    };
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
      RealD a, b, c, d;
      RealD cp, ssq,rsq;
      RealD rAr, rAAr, rArp;
      RealD pAp, pAAp;
      GridBase *grid = src._grid;
      Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid);
      // Zero initial guess, so the initial residual is src itself
      psi=zero;
      r  = src;
      // Initial search direction is the preconditioned residual
      Preconditioner(r,p);
      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
      Ar=Ap;
      rAr=pAp;
      rAAr=pAAp;
      cp =norm2(r);
      ssq=norm2(src);
      // Stopping target on the squared residual
      rsq=Tolerance*Tolerance*ssq;
      if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
      for(int k=0;k<MaxIterations;k++){
 	Preconditioner(Ap,z);
 	RealD rq= real(innerProduct(Ap,z)); 
 	// Step length along p
 	a = rAr/rq;
   	axpy(psi,a,p,psi);
   cp = axpy_norm(r,-a,z,r);
 	rArp=rAr;
 	Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
 	// Direction update coefficient from the ratio of new to old rAr
 	b   =rAr/rArp;
 	axpy(p,b,p,r);
 	pAAp=axpy_norm(Ap,b,Ap,Ar);
 	if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	if(cp<rsq) {
 	  // Converged: recompute the true residual from scratch (reuses Ap and r as scratch)
 	  Linop.HermOp(psi,Ap);
 	  axpy(r,-1.0,src,Ap);
 	  RealD true_resid = norm2(r)/ssq;
 	  std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "<<sqrt(true_resid)
 	           << " target "       <<Tolerance <<std::endl;
 	  return;
 	}
      }
      // Iteration budget exhausted without meeting the target
      std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
      assert(0);
    }
  };
 }
 #endif
@@ -0,0 +1,230 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PREC_GCR_H
 #define GRID_PREC_GCR_H
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 //VPGCR Abe and Zhang, 2005.
 //INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
 //Computing and Information Volume 2, Number 2, Pages 147-161
 //NB. Likely not original reference since they are focussing on a preconditioner variant.
 //    but VPGCR was nicely written up in their paper
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 namespace Grid {
  // Restarted, variably preconditioned GCR ("VPGCR", see the reference above).
  // operator() zeroes psi and repeatedly calls GCRnStep, each call running up to
  // nstep inner steps that orthogonalise against at most mmax stored directions.
  // Convergence is |r|^2 < Tolerance^2 * |src|^2; failing to converge within
  // MaxIterations restarts triggers assert(0).
  template<class Field>
    class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
  public:                                                
    RealD   Tolerance;      // relative residual target
    Integer MaxIterations;  // maximum number of GCRnStep restarts
    int verbose;            // set to 1 by the constructor; not read anywhere in this class
    int mmax;               // number of stored (p,q) direction pairs; must be > 0
    int nstep;              // inner steps per GCRnStep call
    int steps;              // running count of inner steps across all restarts of one solve
    GridStopWatch PrecTimer;   // accumulates time inside Preconditioner
    GridStopWatch MatTimer;    // accumulates time inside Linop applications in GCRnStep
    GridStopWatch LinalgTimer; // reset and reported, but never started in this class
    LinearFunction<Field> &Preconditioner;
    // Members are initialised in declaration order so the list matches what
    // the compiler actually does (avoids -Wreorder and misleading reading order).
   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
      Tolerance(tol), 
      MaxIterations(maxit),
      verbose(1),
      mmax(_mmax),
      nstep(_nstep),
      steps(0),
      Preconditioner(Prec)
    { 
    };
    // Solve for psi given src.  psi is overwritten (zero initial guess).
    // Logs the residual after every restart and the timer totals on convergence.
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
      psi=zero;
      RealD cp, ssq,rsq;
      ssq=norm2(src);
      // Absolute stopping threshold on |r|^2.
      rsq=Tolerance*Tolerance*ssq;
      Field r(src._grid);
        PrecTimer.Reset();
         MatTimer.Reset();
      LinalgTimer.Reset();
      GridStopWatch SolverTimer;
      SolverTimer.Start();
      steps=0;
      for(int k=0;k<MaxIterations;k++){
 	// One restart cycle; psi is refined in place and |r|^2 is returned.
 	cp=GCRnStep(Linop,src,psi,rsq);
 	std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 	if(cp<rsq) {
 	  SolverTimer.Stop();
 	  // Recompute the residual from psi for the "true residual" report.
 	  Linop.HermOp(psi,r);
 	  axpy(r,-1.0,src,r);
 	  RealD tr = norm2(r);
 	  std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "    <<sqrt(tr/ssq)
 	           << " target "           <<Tolerance <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
      assert(0);
    }
    // Run one restart cycle of at most nstep inner steps starting from the
    // current psi.  Returns the recursively updated |r|^2; returns early as
    // soon as it drops below rsq or when the last inner step has been taken.
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
      // The history buffers are indexed modulo mmax and p[0]/q[0] are written
      // unconditionally, so an empty history is a usage error.
      assert(mmax>0);
      RealD cp;
      RealD a, b;
      RealD zAz, zAAz;
      RealD rq;
      GridBase *grid = src._grid;
      Field r(grid);
      Field z(grid);
      Field tmp(grid);
      Field ttmp(grid);
      Field Az(grid);
      ////////////////////////////////
      // history for flexible orthog
      ////////////////////////////////
      std::vector<Field> q(mmax,grid);
      std::vector<Field> p(mmax,grid);
      std::vector<RealD> qq(mmax);
      //////////////////////////////////
      // Residual of the current iterate: r = src - A psi.
      // psi is zero on the first call from operator() and non-zero on restarts.
      //////////////////////////////////
      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
      MatTimer.Stop();
      r=src-Az;
      /////////////////////
      // p = Prec(r)
      /////////////////////
      PrecTimer.Start();
      Preconditioner(r,z);
      PrecTimer.Stop();
      // Diagnostic only: tmp/ttmp feed the commented-out dump below and are not
      // read again before tmp is overwritten inside the loop.
      // NOTE(review): this costs an extra operator application per restart.
      MatTimer.Start();
      Linop.HermOp(z,tmp); 
      MatTimer.Stop();
      ttmp=tmp;
      tmp=tmp-r;
      /*
      std::cout<<GridLogMessage<<r<<std::endl;
      std::cout<<GridLogMessage<<z<<std::endl;
      std::cout<<GridLogMessage<<ttmp<<std::endl;
      std::cout<<GridLogMessage<<tmp<<std::endl;
      */
      MatTimer.Start();
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
      MatTimer.Stop();
      //p[0],q[0],qq[0] 
      p[0]= z;
      q[0]= Az;
      qq[0]= zAAz;
      cp =norm2(r);
      for(int k=0;k<nstep;k++){
 	steps++;
 	// Slots in the circular history for the current and the next direction.
 	int kp     = k+1;
 	int peri_k = k %mmax;
 	int peri_kp= kp%mmax;
 	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
 	a = rq/qq[peri_k];
 	// Update the solution along p and the residual along q = A p.
 	axpy(psi,a,p[peri_k],psi);         
 	cp = axpy_norm(r,-a,q[peri_k],r);  
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
 	}
 	std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 
 	PrecTimer.Start();
 	Preconditioner(r,z);// solve Az = r
 	PrecTimer.Stop();
 	MatTimer.Start();
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 	Linop.HermOp(z,tmp);
 	MatTimer.Stop();
        // tmp = A z - r is only used to report how well the preconditioner inverted A.
        tmp=tmp-r;
 	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 	q[peri_kp]=Az;
 	p[peri_kp]=z;
 	int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
 	// Orthogonalise the new q (and correspondingly p) against the stored history.
 	for(int back=0;back<northog;back++){
 	  int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
 	  b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
 	  p[peri_kp]=p[peri_kp]+b*p[peri_back];
 	  q[peri_kp]=q[peri_kp]+b*q[peri_back];
 	}
 	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
      }
      assert(0); // never reached for nstep>0: the loop returns on its last pass
      return cp;
    }
  };
 }
 #endif
@@ -0,0 +1,503 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_SCHUR_RED_BLACK_H
 #define GRID_SCHUR_RED_BLACK_H
  /*
   * Red black Schur decomposition
   *
   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
   *                =         L                     D                     U
   *
   * L^-1 = (1              0 )
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
   *        (0    1           )
   * U^{dag} = ( 1                 0)
   *           (Meo^dag Mee^{-dag} 1)
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   ***********************
   *     M psi = eta
   ***********************
   *Odd
   * i)                 D_oo psi_o =  L^{-1}  eta_o
   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *
   * Wilson:
   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
   * Stag:
   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
   *
   * L^-1 eta  = (1              0 ) (eta_e)
   *             (-MoeMee^{-1}   1 ) (eta_o)
   *
   *Even
   * ii)  Mee psi_e + Meo psi_o = src_e
   *
   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
   *
   * 
   * TODO: Other options:
   * 
   * a) change checkerboards for Schur e<->o
   *
   * Left precon by Moo^-1
   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_o =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
   *
   * Right precon by Moo^-1
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_o = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
   * TODO: Deflation 
   */
 namespace Grid {
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Now make the norm reflect extra factor of Mee
  // Red-black Schur solve using SchurStaggeredOperator.
  // The full-grid source is split into even/odd checkerboards, the odd system
  // is handed to the wrapped OperatorFunction solver, and the even solution is
  // reconstructed from the odd one before both are written back into `out`.
  template<class Field> class SchurRedBlackStaggeredSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver; // solver applied to the odd-checkerboard Schur operator
    int CBfactorise;                              // stored only; not yet consulted (see FIXME below)
    bool subGuess;                                // true: subtract the initial guess from the odd solution
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
     _HermitianRBSolver(HermitianRBSolver),
     CBfactorise(0),
     subGuess(initSubGuess)
    { 
    }
    // Enable/disable subtraction of the initial guess after the inner solve.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Read-only query, so it is callable on const solvers.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: solve with a ZeroGuesser as the initial guess.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve _Matrix * out = in on the full grid.  `guess(src_o, sol_o)` supplies
    // the starting point for the odd-checkerboard solve.  When subGuess is set
    // the guess is subtracted from the result and the residual check is skipped.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
      //src_o = tmp;     assert(src_o.checkerboard ==Odd);
      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
      guess(src_o, sol_o);
      Mtmp = sol_o; // remember the guess so it can be subtracted afterwards
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
      // Fionn A2A boolean behavioural control
      if (subGuess)        sol_o = sol_o-Mtmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
  // Short alias for the staggered solver.
  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black Schur solve using SchurDiagMooeeOperator.
  // The odd-checkerboard source is multiplied by MpcDag so the wrapped solver
  // sees a normal-equations system; the even solution is then reconstructed
  // from the odd one and both are written back into `out`.
  template<class Field> class SchurRedBlackDiagMooeeSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver; // solver applied to the odd-checkerboard Schur operator
    int CBfactorise;                              // stored only; not yet consulted (see FIXME below)
    bool subGuess;                                // true: subtract the initial guess from the odd solution
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false)  :
     _HermitianRBSolver(HermitianRBSolver),
     CBfactorise(cb),
     subGuess(initSubGuess)
  { 
  }
    // Enable/disable subtraction of the initial guess after the inner solve.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Read-only query, so it is callable on const solvers.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: solve with a ZeroGuesser as the initial guess.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve _Matrix * out = in on the full grid.  `guess(src_o, sol_o)` supplies
    // the starting point for the odd-checkerboard solve.  When subGuess is set
    // the guess is subtracted from the result and the residual check is skipped.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
      guess(src_o,sol_o);
      Mtmp = sol_o; // remember the guess so it can be subtracted afterwards
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)        sol_o = sol_o-Mtmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black Schur solve using SchurDiagTwoOperator.
  // The wrapped solver works on an intermediate odd-checkerboard field (tmp);
  // the odd solution is recovered as sol_o = MooeeInv(tmp), after which the
  // even solution is reconstructed and both are written back into `out`.
  template<class Field> class SchurRedBlackDiagTwoSolve {
  private:
    OperatorFunction<Field> & _HermitianRBSolver; // solver applied to the odd-checkerboard Schur operator
    int CBfactorise;                              // stored only; not yet consulted (see FIXME below)
    bool subGuess;                                // true: subtract the initial guess from the solver output
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
     _HermitianRBSolver(HermitianRBSolver),
     CBfactorise(0),
     subGuess(initSubGuess)
    { 
    }
    // Enable/disable subtraction of the initial guess after the inner solve.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Read-only query, so it is callable on const solvers.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: solve with a ZeroGuesser as the initial guess.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve _Matrix * out = in on the full grid.  `guess(src_o, tmp)` supplies
    // the starting point for the inner solve.  When subGuess is set the guess
    // is subtracted from the solver output and the residual check is skipped.
    template<class Matrix,class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      guess(src_o,tmp);
      Mtmp = tmp; // remember the guess so it can be subtracted afterwards
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)      tmp = tmp-Mtmp;
      // Map the solver variable back to the odd-checkerboard solution.
      _Matrix.MooeeInv(tmp,sol_o);       assert(  sol_o.checkerboard   ==Odd);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Variant of the SchurDiagTwoOperator red-black solve whose inner solver is
  // held as a LinearFunction rather than an OperatorFunction.
  // The inner solver works on an intermediate odd-checkerboard field (tmp);
  // the odd solution is recovered as sol_o = MooeeInv(tmp), after which the
  // even solution is reconstructed and both are written back into `out`.
  template<class Field> class SchurRedBlackDiagTwoMixed {
  private:
    LinearFunction<Field> & _HermitianRBSolver; // inner solver, held by reference
    int CBfactorise;                            // stored only; not yet consulted (see FIXME below)
    bool subGuess;                              // true: subtract the initial guess from the solver output
  public:
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
     _HermitianRBSolver(HermitianRBSolver),
     CBfactorise(0),
     subGuess(initSubGuess)
    { 
    }
    // Enable/disable subtraction of the initial guess after the inner solve.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    // Read-only query, so it is callable on const solvers.
    bool isSubtractGuess(void) const
    {
      return subGuess;
    }
    // Convenience overload: solve with a ZeroGuesser as the initial guess.
    template<class Matrix>
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Solve _Matrix * out = in on the full grid.  `guess(src_o, tmp)` supplies
    // the starting point for the inner solve.  When subGuess is set the guess
    // is subtracted from the solver output and the residual check is skipped.
    template<class Matrix, class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field src_e(grid);
      Field src_o(grid);
      Field sol_e(grid);
      Field sol_o(grid);
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      guess(src_o,tmp);
      Mtmp = tmp; // remember the guess so it can be subtracted afterwards
      // NOTE(review): the preconditioned CR/GCR solvers invoke LinearFunction
      // objects as f(in,out); this three-argument call only compiles on
      // instantiation if LinearFunction also offers an (operator,in,out)
      // overload -- confirm against the LinearFunction declaration.
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      // Fionn A2A boolean behavioural control
      if (subGuess)      tmp = tmp-Mtmp;
      // Map the solver variable back to the odd-checkerboard solution.
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        // Label with this class's own name so the line is distinguishable from
        // the one printed by SchurRedBlackDiagTwoSolve.
        std::cout << GridLogMessage << "SchurRedBlackDiagTwoMixed solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
      } else {
        std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
      }
    }     
  };
 }
 #endif
@@ -0,0 +1,125 @@
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <vector>
 namespace Grid {
 // Definitions of MemoryProfiler's static members: no stats object attached
 // and debug output disabled until set otherwise.
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 // Definitions of PointerCache's static state.  Both have static storage
 // duration and no initialiser, so they are zero-initialised: every entry
 // starts with valid==0 and the replacement index starts at 0.
 int PointerCache::victim;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
 // Offer a buffer of `bytes` bytes to the cache.
 // Returns: `ptr` itself when bytes < 4096 (the buffer is not cached);
 //          the previously cached address when a full cache forced a replacement;
 //          NULL when the buffer was stored in a free slot.
 // presumably the caller releases whatever non-NULL pointer comes back -- confirm at the call site
 void *PointerCache::Insert(void *ptr,size_t bytes) {
  if (bytes < 4096 ) return ptr;
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
  // Scan for the first slot that is not in use.
  int slot = 0;
  while ( (slot < Ncache) && (Entries[slot].valid != 0) ) slot++;

  void *evicted = NULL;
  if ( slot == Ncache ) {
    // No free slot: take over the slot at the rotating victim index and
    // hand its old address back to the caller.
    slot    = victim;
    victim  = (victim+1)%Ncache;
    evicted = Entries[slot].address;
  }

  Entries[slot].valid   = 1;
  Entries[slot].bytes   = bytes;
  Entries[slot].address = ptr;
  return evicted;
 }
 // Take a cached buffer of exactly `bytes` bytes out of the cache.
 // Returns the cached address and marks its slot unused, or NULL when
 // bytes < 4096 or no valid entry has that exact size.
 void *PointerCache::Lookup(size_t bytes) {
  if (bytes < 4096 ) return NULL;
 #ifdef _OPENMP
  assert(omp_in_parallel()==0);
 #endif 
  for (auto &entry : Entries) {
    if ( !entry.valid )         continue;
    if ( entry.bytes != bytes ) continue;
    // First exact-size match wins; release the slot and hand the buffer out.
    entry.valid = 0;
    return entry.address;
  }
  return NULL;
 }
 // Diagnostic (Linux only; a no-op elsewhere): reads this process's
 // /proc/self/pagemap entries for the BYTES-long buffer at Buf and prints how
 // many of its 4k pages are not laid out contiguously within 512-page (2MB)
 // groups, i.e. are presumably not backed by huge pages.
 // Asserts if the pagemap cannot be opened, positioned, or fully read.
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  // One 64-bit pagemap entry per virtual page, indexed by virtual page number.
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  // Heap storage instead of a runtime-sized stack array: large buffers would
  // otherwise need megabytes of stack, and a zero-length array is not valid.
  std::vector<uint64_t> pagedata(npages);
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == (uint64_t)offset);
  ret = ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  // The descriptor is no longer needed once the entries are in memory.
  ::close(fd);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    // Low 55 bits of an entry hold the page frame number (see kernel pagemap docs).
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      // A page that is not at base + j*4k breaks physical contiguity of the group.
      if (pageaddr != baseaddr + j * page_size)
 	++nnothuge;
      }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
 #endif
 }
 // Format a byte count as a short human-readable string using binary
 // (1024-based) steps and the suffixes B, KB, MB, GB, TB, PB, EB.
 // Whole values print without decimals ("1 KB"); others with one ("1.5 KB").
 std::string sizeString(const size_t bytes)
 {
  constexpr unsigned int bufSize = 256;
  constexpr size_t       nSuffix = 7;
  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
  char                   buf[bufSize];
  size_t                 s     = 0;
  double                 count = bytes;

  // Stop at the last suffix so s can never index past the end of the table.
  while (count >= 1024 && s < nSuffix - 1)
  {
      s++;
      count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
 }
 }
@@ -0,0 +1,315 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/AlignedAllocator.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
 #endif
 namespace Grid {
  // Small fixed-capacity cache of released allocations, keyed by byte count.
  // All state is static; Insert and Lookup are defined out of line.
  class PointerCache {
  public:
    // Offer a released block to the cache.  A non-NULL return value is a
    // pointer the caller is expected to free.
    static void *Insert(void *ptr,size_t bytes) ;
    // Fetch a cached block of exactly `bytes` bytes, or NULL if there is none.
    static void *Lookup(size_t bytes) ;
  private:
    struct PointerCacheEntry {
      void   *address;
      size_t  bytes;
      int     valid;
    };
    static const int Ncache=8;
    static int victim;
    static PointerCacheEntry Entries[Ncache];
  };
  std::string sizeString(size_t bytes);
  // Byte counters for the memory profiler; every counter starts at zero.
  struct MemoryStats
  {
    size_t totalAllocated{0};
    size_t maxAllocated{0};
    size_t currentlyAllocated{0};
    size_t totalFreed{0};
  };
  // Global switches for memory profiling; both members are static.
  class MemoryProfiler
  {
  public:
    static bool        debug;   // true: log every allocation and free
    static MemoryStats *stats;  // non-null: counters are accumulated here
  };
  // "<bytes> (<human readable size>)".  Parenthesised so the expansion is a
  // single operand wherever it is used.
  #define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
  // The statement macros below are wrapped in do { ... } while (0) so that each
  // expands to exactly one statement; `if (c) profilerAllocate(n); else ...`
  // then behaves as written.  Invoke them with a trailing semicolon.
  // Note that the `bytes` argument is evaluated more than once.
  //
  // profilerDebugPrint: log all counters when a stats object is attached.
  #define profilerDebugPrint \
  do {\
    if (MemoryProfiler::stats)\
    {\
      auto s = MemoryProfiler::stats;\
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
                << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
                << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
                << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
                << std::endl;\
    }\
  } while (0)
  // profilerAllocate: account for an allocation of `bytes` and optionally log it.
  #define profilerAllocate(bytes)\
  do {\
    if (MemoryProfiler::stats)\
    {\
      auto s = MemoryProfiler::stats;\
      s->totalAllocated     += (bytes);\
      s->currentlyAllocated += (bytes);\
      s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated);\
    }\
    if (MemoryProfiler::debug)\
    {\
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
      profilerDebugPrint;\
    }\
  } while (0)
  // profilerFree: account for a release of `bytes` and optionally log it.
  #define profilerFree(bytes)\
  do {\
    if (MemoryProfiler::stats)\
    {\
      auto s = MemoryProfiler::stats;\
      s->totalFreed         += (bytes);\
      s->currentlyAllocated -= (bytes);\
    }\
    if (MemoryProfiler::debug)\
    {\
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
      profilerDebugPrint;\
    }\
  } while (0)
  void check_huge_pages(void *Buf,uint64_t BYTES);
 ////////////////////////////////////////////////////////////////////
 // STL-style allocator returning GRID_ALLOC_ALIGN-aligned storage.
 // Released blocks are offered to PointerCache, and allocate() first asks
 // the cache for a block of exactly the requested byte count.
 // construct()/destroy() are deliberate no-ops: containers using this
 // allocator do not run element constructors or destructors through it.
 ////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class alignedAllocator {
 public: 
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
  // The allocator carries no state, so construction and copying do nothing.
  alignedAllocator() throw() { }
  alignedAllocator(const alignedAllocator&) throw() { }
  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
  ~alignedAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
  // Return storage for __n objects (the hint _p is ignored).
  // The result is NULL if the underlying aligned allocation fails.
  pointer allocate(size_type __n, const void* _p= 0)
  { 
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    // Prefer a recycled block of the same size
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
    //    if ( ptr != NULL ) 
    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
 //define GRID_ALLOC_ALIGN (128)
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop: write one byte per 4k page.
    // Skipped when the allocation failed, so a NULL pointer is never written through.
    if ( ptr ) {
      uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
 #pragma omp parallel for
 #endif
      for(size_type n=0;n<bytes;n+=4096){
        cp[n]=0;
      }
    }
    return ptr;
  }
  // Offer the block to PointerCache and free whatever pointer the cache hands back.
  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);
    profilerFree(bytes);
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 #ifdef HAVE_MM_MALLOC_H
    if ( __freeme ) _mm_free((void *)__freeme); 
 #else
    if ( __freeme ) free((void *)__freeme);
 #endif
  }
  // Intentionally empty: no element construction or destruction is performed.
  void construct(pointer __p, const _Tp& __val) { }
  void construct(pointer __p) { }
  void destroy(pointer __p) { }
 };
 // alignedAllocator has no data members, so any two instances compare equal.
 template<typename _Tp>
 inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&)
 {
   return true;
 }
 template<typename _Tp>
 inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&)
 {
   return false;
 }
 //////////////////////////////////////////////////////////////////////////////////////////
 // MPI3 : comms must use shm region
 // SHMEM: comms must use symmetric heap
 //////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_SHMEM
 extern "C" { 
 #include <mpp/shmem.h>
 extern void * shmem_align(size_t, size_t);
 extern void  shmem_free(void *);
 }
 #define PARANOID_SYMMETRIC_HEAP
 #endif
 // STL-style allocator for communication buffers.
 //  - With GRID_COMMS_SHMEM: storage comes from shmem_align()/shmem_free().
 //    With PARANOID_SYMMETRIC_HEAP the returned address is additionally
 //    broadcast and compared; a mismatch is reported and the process exits.
 //  - Otherwise: GRID_ALLOC_ALIGN-aligned storage from _mm_malloc()/memalign(),
 //    touched once per 4k page.  GRID_ALLOC_ALIGN is #defined earlier in this
 //    header, inside alignedAllocator::allocate.
 // construct()/destroy() are deliberate no-ops.
 template<typename _Tp>
 class commAllocator {
 public: 
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
  // The allocator carries no state, so construction and copying do nothing.
  commAllocator() throw() { }
  commAllocator(const commAllocator&) throw() { }
  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
  ~commAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 #ifdef GRID_COMMS_SHMEM
  // Allocate __n objects with shmem_align (the hint _p is ignored).
  pointer allocate(size_type __n, const void* _p= 0)
  {
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
    // NOTE(review): the two branches pass (size, alignment) in opposite order;
    // presumably the CRAY library uses a different argument order -- confirm.
 #ifdef CRAY
    _Tp *ptr = (_Tp *) shmem_align(bytes,64);
 #else
    _Tp *ptr = (_Tp *) shmem_align(64,bytes);
 #endif
 #ifdef PARANOID_SYMMETRIC_HEAP
    static void * bcast;
    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
    bcast = (void *) ptr;
    // Broadcast the address as 32-bit words and compare with the local one
    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
    if ( bcast != ptr ) {
      // %p with void* arguments: passing pointers to %lx is undefined behaviour
      std::printf("inconsistent alloc pe %d %p %p \n",shmem_my_pe(),bcast,(void *)ptr);std::fflush(stdout);
      //      BACKTRACEFILE();
      // This is an error path: report failure rather than a zero (success) status
      exit(EXIT_FAILURE);
    }
    assert( bcast == (void *) ptr);
 #endif 
    return ptr;
  }
  // Release storage obtained from allocate().
  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n*sizeof(_Tp);
    profilerFree(bytes);
    shmem_free((void *)__p);
  }
 #else
  // Allocate __n objects, GRID_ALLOC_ALIGN aligned (the hint _p is ignored).
  // Returns NULL if the underlying allocation fails.
  pointer allocate(size_type __n, const void* _p= 0) 
  {
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
 #else
    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
 #endif
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
 #ifdef GRID_OMP
 #pragma omp parallel for schedule(static)
 #endif
      for(size_type n=0;n<bytes;n+=4096){
 	cp[n]=0;
      }
    }
    return ptr;
  }
  // Release storage obtained from allocate().
  void deallocate(pointer __p, size_type __n) {
    size_type bytes = __n*sizeof(_Tp);
    profilerFree(bytes);
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
    free((void *)__p);
 #endif
  }
 #endif
  // Intentionally empty: no element construction or destruction is performed.
  void construct(pointer __p, const _Tp& __val) { }
  void construct(pointer __p) { }
  void destroy(pointer __p) { }
 };
 // commAllocator has no data members, so any two instances compare equal.
 template<typename _Tp>
 inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&)
 {
   return true;
 }
 template<typename _Tp>
 inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&)
 {
   return false;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
 // std::vector whose storage comes from alignedAllocator
 template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
 // std::vector whose storage comes from commAllocator
 template<class T> using commVector = std::vector<T,commAllocator<T> >;              
 // Vector of aligned vectors; only the inner vectors use alignedAllocator,
 // the outer vector uses the default allocator.
 template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 }; // namespace Grid
 #endif
@@ -0,0 +1,35 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Cartesian.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H
 #include <Grid/cartesian/Cartesian_base.h>
 #include <Grid/cartesian/Cartesian_full.h>
 #include <Grid/cartesian/Cartesian_red_black.h> 
 #endif
@@ -0,0 +1,292 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_base.h
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H
 namespace Grid{
  //////////////////////////////////////////////////////////////////////
  // Communicator provides information on the processor grid
  //////////////////////////////////////////////////////////////////////
  //    unsigned long _ndimension;
  //    std::vector<int> _processors; // processor grid
  //    int              _processor;  // linear processor rank
  //    std::vector<int> _processor_coor;  // coordinates of this processor in the processor grid
  //////////////////////////////////////////////////////////////////////
  class GridBase : public CartesianCommunicator , public GridThread {
 public:
    int dummy;
    // Give Lattice access
    template<class object> friend class Lattice;
   // Each constructor only forwards its arguments to CartesianCommunicator;
   // the grid geometry members are left for the derived class to fill in.
   GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
   // Build from a parent communicator, passing split_rank through to the base class.
   GridBase(const std::vector<int> & processor_grid,
 	     const CartesianCommunicator &parent,
 	     int &split_rank) 
     : CartesianCommunicator(processor_grid,parent,split_rank) {};
   // As above, for callers that do not want the split rank: the member `dummy`
   // takes its place.
   // NOTE(review): `dummy` is handed to the base-class constructor before this
   // class's own members are initialised; it is presumably only written to -- confirm.
   GridBase(const std::vector<int> & processor_grid,
 	     const CartesianCommunicator &parent) 
     : CartesianCommunicator(processor_grid,parent,dummy) {};
   virtual ~GridBase() = default;
   // Physics Grid information.
   std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
   std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
   std::vector<int> _gdimensions;// Global dimensions of array after cb removal
   std::vector<int> _ldimensions;// local dimensions of array with processor images removed
   std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed 
   std::vector<int> _ostride;    // Outer stride for each dimension
   std::vector<int> _istride;    // Inner stride i.e. within simd lane
   int _osites;                  // Number of outer sites; _isites*_osites is the local site count (see lSites()).
   int _isites;                  // Number of inner sites, i.e. simd lanes (see Nsimd()).
   int _fsites;                  // Site count of the full global array (GridCartesian::Init: product of _fdimensions).
   int _gsites;                  // Site count of the global array (GridCartesian::Init: product of _gdimensions).
   std::vector<int> _slice_block;// subslice information
   std::vector<int> _slice_stride;
   std::vector<int> _slice_nblock;
   std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
   std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1
   bool _isCheckerBoarded; 
 public:
    ////////////////////////////////////////////////////////////////
    // Checkerboarding interface is virtual and overridden by 
    // GridCartesian / GridRedBlackCartesian
    ////////////////////////////////////////////////////////////////
   // Pure virtual checkerboard queries; every concrete grid must supply all seven.
   // GridCartesian answers 0 to each query and returns `shift` unchanged from the
   // two Shift routines.
   virtual int CheckerBoarded(int dim)=0;
   virtual int CheckerBoard(const std::vector<int> &site)=0;
   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
   virtual int CheckerBoardFromOindex (int Oindex)=0;
   virtual int CheckerBoardFromOindexTable (int Oindex)=0;
    //////////////////////////////////////////////////////////////////////////////////////////////
    // Local layout calculations
    //////////////////////////////////////////////////////////////////////////////////////////////
    // These routines are key. Subdivide the linearised cartesian index into
    //      "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
    //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
    //
    // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
    // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
    // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
    // lanes are operated upon simultaneously.
    virtual int oIndex(std::vector<int> &coor)
    {
        int idx=0;
        // Works with either global or local coordinates
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
        return idx;
    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
      // ocoor is already reduced so can eliminate the modulo operation
      // for fast indexing and inline the routine
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
      return idx;
    }
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
      Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
    }
    inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
      lcoor.resize(_ndimension);
      for (int d = 0; d < _ndimension; d++)
        lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
    }
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
    }
    inline int PermuteDim(int dimension){
      return _simd_layout[dimension]>1;
    }
    inline int PermuteType(int dimension){
      int permute_type=0;
      //
      // FIXME:
      //
      // Best way to encode this would be to present a mask 
      // for which simd dimensions are rotated, and the rotation
      // size. If there is only one simd dimension rotated, this is just 
      // a permute. 
      //
      // Cases: PermuteType == 1,2,4,8
      // Distance should be either 0,1,2..
      //
      if ( _simd_layout[dimension] > 2 ) { 
        for(int d=0;d<_ndimension;d++){
          if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
        }
        permute_type = RotateBit; // How to specify distance; this is not just direction.
        return permute_type;
      }
      for(int d=_ndimension-1;d>dimension;d--){
        if (_simd_layout[d]>1 ) permute_type++;
      }
      return permute_type;
    }
    ////////////////////////////////////////////////////////////////
    // Array sizing queries
    ////////////////////////////////////////////////////////////////
    inline int iSites(void) const { return _isites; };
    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
    inline int oSites(void) const { return _osites; };
    inline int lSites(void) const { return _isites*_osites; }; 
    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
    inline int Nd    (void) const { return _ndimension;};
    inline const std::vector<int> LocalStarts(void)             { return _lstart;    };
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
    ////////////////////////////////////////////////////////////////
    // Utility to print the full decomposition details 
    ////////////////////////////////////////////////////////////////
   // Write the grid decomposition to std::cout, one GridLogMessage line per
   // quantity: dimensions, simd layout, strides and site counts.
   void show_decomposition(){
     std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
     std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
     std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
     std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
     std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
     std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
     std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
     std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
     std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
     // lSites and gSites are computed values, not stored members
     std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        
     std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
     std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
   } 
    ////////////////////////////////////////////////////////////////
    // Global addressing
    ////////////////////////////////////////////////////////////////
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
      assert(gidx< gSites());
      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
    }
    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
      assert(lidx<lSites());
      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
    }
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
      gidx=0;
      int mult=1;
      for(int mu=0;mu<_ndimension;mu++) {
        gidx+=mult*gcoor[mu];
        mult*=_gdimensions[mu];
      }
    }
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
    {
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
        int _fld  = _fdimensions[mu]/_processors[mu];
        pcoor[mu] = gcoor[mu]/_fld;
        lcoor[mu] = gcoor[mu]%_fld;
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
    {
      std::vector<int> pcoor;
      std::vector<int> lcoor;
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
      rank = RankFromProcessorCoor(pcoor);
      /*
      std::vector<int> cblcoor(lcoor);
      for(int d=0;d<cblcoor.size();d++){
        if( this->CheckerBoarded(d) ) {
          cblcoor[d] = lcoor[d]/2;
        }
      }
      */
      i_idx= iIndex(lcoor);
      o_idx= oIndex(lcoor);
    }
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
    {
      gcoor.resize(_ndimension);
      std::vector<int> coor(_ndimension);
      ProcessorCoorFromRank(rank,coor);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
      iCoorFromIindex(coor,i_idx);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
      oCoorFromOindex (coor,o_idx);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
    }
    void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
    {
      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
      if(CheckerBoarded(0)){
        fcoor[0] = fcoor[0]*2+cb;
      }
    }
    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
    {
      gcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
    }
 };
 }
 #endif
@@ -0,0 +1,174 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_full.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_FULL_H
 #define GRID_CARTESIAN_FULL_H
 namespace Grid{
 /////////////////////////////////////////////////////////////////////////////////////////
 // Grid Support.
 /////////////////////////////////////////////////////////////////////////////////////////
 class GridCartesian: public GridBase {
 public:
    int dummy;
    virtual int  CheckerBoardFromOindexTable (int Oindex) {
      return 0;
    }
    virtual int  CheckerBoardFromOindex (int Oindex)
    {
      return 0;
    }
    virtual int CheckerBoarded(int dim){
      return 0;
    }
    virtual int CheckerBoard(const std::vector<int> &site){
        return 0;
    }
    virtual int CheckerBoardDestination(int cb,int shift,int dim){
        return 0;
    }
    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
      return shift;
    }
    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
      return shift;
    }
    /////////////////////////////////////////////////////////////////////////
    // Constructor takes a parent grid and possibly subdivides communicator.
    /////////////////////////////////////////////////////////////////////////
   // Build on a parent grid's communicator; the split rank is not reported back.
   // `dummy` here is GridCartesian's own member, passed where the split-rank
   // reference is expected.
   // NOTE(review): `dummy` is handed to the base class before this class's
   // members are initialised; it is presumably only written to -- confirm.
   GridCartesian(const std::vector<int> &dimensions,
 		  const std::vector<int> &simd_layout,
 		  const std::vector<int> &processor_grid,
 		  const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
   {
     Init(dimensions,simd_layout,processor_grid);
   }
   // Build on a parent grid's communicator, passing split_rank through to GridBase.
   GridCartesian(const std::vector<int> &dimensions,
 		  const std::vector<int> &simd_layout,
 		  const std::vector<int> &processor_grid,
 		  const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
   {
     Init(dimensions,simd_layout,processor_grid);
   }
   /////////////////////////////////////////////////////////////////////////
   // Construct from comm world
   /////////////////////////////////////////////////////////////////////////
   GridCartesian(const std::vector<int> &dimensions,
 		  const std::vector<int> &simd_layout,
 		  const std::vector<int> &processor_grid) : GridBase(processor_grid)
   {
     Init(dimensions,simd_layout,processor_grid);
   }
   virtual ~GridCartesian() = default;
    void Init(const std::vector<int> &dimensions,
 	      const std::vector<int> &simd_layout,
 	      const std::vector<int> &processor_grid)
    {
      ///////////////////////
      // Grid information
      ///////////////////////
      _isCheckerBoarded = false;
      _ndimension = dimensions.size();
      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);
      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);
      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);
      _fsites = _gsites = _osites = _isites = 1;
      for (int d = 0; d < _ndimension; d++)
      {
        _fdimensions[d] = dimensions[d];   // Global dimensions
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
        _simd_layout[d] = simd_layout[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];
        // Use a reduced simd grid
        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
        //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];
        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }
      ///////////////////////
      // subplane information
      ///////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);
      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d] = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
    };
 };
 }
 #endif
@@ -0,0 +1,320 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_red_black.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_RED_BLACK_H
 #define GRID_CARTESIAN_RED_BLACK_H
 namespace Grid {
  // Checkerboard (parity) labels: red is 0 and black is 1.
  static const int CbRed  =0;
  static const int CbBlack=1;
  // Even/Odd are synonyms for the red/black labels above.
  static const int Even   =CbRed;
  static const int Odd    =CbBlack;
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
 public:
    std::vector<int> _checker_dim_mask;
    int              _checker_dim;
    std::vector<int> _checker_board;
    // Returns 1 when `dim` is the checkerboarded dimension (_checker_dim), otherwise 0.
    virtual int CheckerBoarded(int dim){
      return (dim==_checker_dim) ? 1 : 0;
    }
    // Parity (0 or 1) of a site: the low bit of the sum of its coordinates over
    // every dimension that is switched on in _checker_dim_mask.
    virtual int CheckerBoard(const std::vector<int> &site){
      assert(site.size()==_ndimension);
      int coor_sum=0;
      for(int d=0;d<_ndimension;d++){
        coor_sum += _checker_dim_mask[d] ? site[d] : 0;
      }
      return coor_sum&0x1;
    }
    // Convert a shift given on the full lattice into the shift to apply on this
    // half-size (checkerboarded) grid. Depending on the checkerboard of the site
    // being processed, the result is rounded down or up.
    //   source_cb : checkerboard of the source field
    //   dim       : dimension being shifted
    //   shift     : shift in full-lattice sites; any sign or magnitude
    //   ocb       : checkerboard of the site being processed
    // Shifts in any dimension other than _checker_dim are returned unchanged.
    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
      if(dim != _checker_dim) return shift;
      int fulldim =_fdimensions[dim];
      // Wrap into [0,fulldim). Reducing first keeps the result non-negative even
      // for shift < -fulldim, where (shift+fulldim)%fulldim would remain negative.
      shift = ((shift%fulldim)+fulldim)%fulldim;
      // Probably faster with table lookup;
      // or by looping over x,y,z and multiply rather than computing checkerboard.
      if ( (source_cb+ocb)&1 ) {
        return (shift)/2;
      } else {
        return (shift+1)/2;
      }
    }
    // Table-driven variant: returns the _checker_board entry stored for this outer index.
    virtual int  CheckerBoardFromOindexTable (int Oindex) {
      const int cb = _checker_board[Oindex];
      return cb;
    }
    // Computes the checkerboard of an outer site: the index is converted to a
    // coordinate with oCoorFromOindex and that coordinate is passed to CheckerBoard.
    virtual int  CheckerBoardFromOindex (int Oindex)
    {
      std::vector<int> site;
      oCoorFromOindex(site,Oindex);
      const int cb = CheckerBoard(site);
      return cb;
    }
    // Shift to apply at outer site `osite`. Only the checkerboarded dimension needs
    // the site's parity; every other dimension returns `shift` unchanged.
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
      if(dim == _checker_dim) {
        const int site_cb = CheckerBoardFromOindex(osite);
        return CheckerBoardShiftForCB(source_cb,dim,shift,site_cb);
      }
      return shift;
    }
    // Checkerboard that data of parity `source_cb` has after being shifted by
    // `shift` along `dim`.
    virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
      // Dimensions outside the checkerboard mask never change the parity.
      if ( !_checker_dim_mask[dim] ) return source_cb;
      // If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
      // does NOT cause a parity hop.
      int offset = shift;
      if ( dim != _checker_dim ) offset += _fdimensions[_checker_dim];
      return (offset & 0x1) ? (1-source_cb) : source_cb;
    }
    ////////////////////////////////////////////////////////////
    // Create Redblack from original grid; require full grid pointer ?
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
    {
      // Default layout: every dimension enters the parity sum and dimension 0
      // is the one passed to Init as the checkerboarded dimension.
      const int ndim = base->_ndimension;
      const int default_checker_dim = 0;
      std::vector<int> all_dims_mask(ndim,1);
      Init(base->_fdimensions,base->_simd_layout,base->_processors,all_dims_mask,default_checker_dim);
    }
    ////////////////////////////////////////////////////////////
    // Create redblack from original grid, with non-trivial checker dim mask
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base,
                          const std::vector<int> &checker_dim_mask,
                          int checker_dim)
      : GridBase(base->_processors,*base)
    {
      // Dimensions, SIMD layout and processor grid are taken from the full grid;
      // only the checkerboard mask and dimension are chosen by the caller.
      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
    }
    // Virtual, compiler-generated destructor.
    virtual ~GridRedBlackCartesian() = default;
 #if 0
    ////////////////////////////////////////////////////////////
    // Deprecated (compiled out by the surrounding #if 0): create a redblack
    // grid from explicit dimensions. Direct creation of a redblack grid
    // without a full grid to base it on should not be needed.
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base,
 			  const std::vector<int> &dimensions,
 			  const std::vector<int> &simd_layout,
 			  const std::vector<int> &processor_grid,
 			  const std::vector<int> &checker_dim_mask,
 			  int checker_dim
 			  ) :  GridBase(processor_grid,*base) 
    {
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
    }
    ////////////////////////////////////////////////////////////
    // Create redblack grid
    ////////////////////////////////////////////////////////////
    GridRedBlackCartesian(const GridBase *base,
 			  const std::vector<int> &dimensions,
 			  const std::vector<int> &simd_layout,
 			  const std::vector<int> &processor_grid) : GridBase(processor_grid,*base) 
    {
      std::vector<int> checker_dim_mask(dimensions.size(),1);
      int checker_dim = 0;
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
    }
 #endif
    // Lay out the checkerboarded grid: fills the dimension, stride and slice
    // tables and builds the per-outer-site checkerboard lookup table.
    //   dimensions       : full lattice extents
    //   simd_layout      : SIMD lanes per dimension
    //   processor_grid   : only its size is checked here; the extents actually
    //                      used are the already initialised _processors
    //   checker_dim_mask : non-zero entries mark dimensions entering the parity sum
    //   checker_dim      : dimension whose extent is halved; must be set in the mask
    void Init(const std::vector<int> &dimensions,
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid,
              const std::vector<int> &checker_dim_mask,
              int checker_dim)
    {
      _isCheckerBoarded = true;
      _checker_dim = checker_dim;
      _ndimension = dimensions.size();
      // Check every argument's size and the range of checker_dim BEFORE the
      // mask is indexed, so a bad argument trips an assert instead of reading
      // outside the vector.
      assert(checker_dim_mask.size() == _ndimension);
      assert(processor_grid.size() == _ndimension);
      assert(simd_layout.size() == _ndimension);
      assert(checker_dim >= 0);
      assert(checker_dim < (int)_ndimension);
      assert(checker_dim_mask[checker_dim] == 1);
      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);
      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);
      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);
      _fsites = _gsites = _osites = _isites = 1;
      _checker_dim_mask = checker_dim_mask;
      for (int d = 0; d < _ndimension; d++)
      {
        _fdimensions[d] = dimensions[d];
        _gdimensions[d] = _fdimensions[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];
        if (d == _checker_dim)
        {
          // The checkerboarded extent must be even so it can be halved exactly
          assert((_gdimensions[d] & 0x1) == 0);
          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
          _gsites /= 2;
        }
        _ldimensions[d] = _gdimensions[d] / _processors[d];
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
        // Use a reduced simd grid
        _simd_layout[d] = simd_layout[d];
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // exact divisibility is asserted on the next line
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
        assert(_rdimensions[d] > 0);
        // all elements of a simd vector must have same checkerboard.
        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
        if (_simd_layout[d] > 1)
        {
          if (checker_dim_mask[d])
          {
            assert((_rdimensions[d] & 0x1) == 0);
          }
        }
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];
        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }
      ////////////////////////////////////////////////////////////////////////////////////////////
      // subplane information
      ////////////////////////////////////////////////////////////////////////////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);
      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d] = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      // rvol is the product of the reduced dimensions, the same product that
      // was accumulated into _osites above.
      int rvol = 1;
      for (int d = 0; d < _ndimension; d++)
      {
        rvol = rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
      for (int osite = 0; osite < _osites; osite++)
      {
        _checker_board[osite] = CheckerBoardFromOindex(osite);
      }
    };
  protected:
    // Outer index of a coordinate: per dimension, the coordinate reduced modulo
    // _rdimensions and weighted by _ostride.
    virtual int oIndex(std::vector<int> &coor)
    {
      int idx = 0;
      for (int d = 0; d < _ndimension; d++)
      {
        // The coordinate in the checkerboarded dimension is halved first.
        const int c = (d == _checker_dim) ? (coor[d] / 2) : coor[d];
        idx += _ostride[d] * (c % _rdimensions[d]);
      }
      return idx;
    }
    // Inner index of a local coordinate: per dimension, the coordinate divided
    // by the reduced extent and weighted by _istride.
    virtual int iIndex(std::vector<int> &lcoor)
    {
      int idx = 0;
      for (int d = 0; d < _ndimension; d++)
      {
        // The divisor is doubled in the checkerboarded dimension.
        const int extent = (d == _checker_dim) ? (2 * _rdimensions[d]) : _rdimensions[d];
        idx += _istride[d] * (lcoor[d] / extent);
      }
      return idx;
    }
 };
 }
 #endif
@@ -0,0 +1,34 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Communicator.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H
 #include <Grid/communicator/SharedMemory.h>
 #include <Grid/communicator/Communicator_base.h>
 #endif
@@ -0,0 +1,76 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_none.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/mman.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout
 ///////////////////////////////////////////////////////////////
 // Definition of the static policy member; it starts as CommunicatorPolicyConcurrent.
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 // Definition of the static comms-thread count; it starts at -1.
 int CartesianCommunicator::nCommThreads = -1;
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
 // Number of dimensions of the processor grid.
 int CartesianCommunicator::Dimensions(void)
 {
  return _ndimension;
 }
 // Non-zero when this rank is rank 0.
 int CartesianCommunicator::IsBoss(void)
 {
  return _processor==0;
 }
 // The boss is always rank 0.
 int CartesianCommunicator::BossRank(void)
 {
  return 0;
 }
 // Linear rank of this process.
 int CartesianCommunicator::ThisRank(void)
 {
  return _processor;
 }
 // Coordinate of this process in the processor grid.
 const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
 {
  return _processor_coor;
 }
 // Extents of the processor grid.
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
 {
  return _processors;
 }
 // Total number of processes in the grid.
 int CartesianCommunicator::ProcessorCount(void)
 {
  return _Nprocessors;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Complex global sums: each complex number is passed to the real-valued GlobalSumVector as two reals
 ////////////////////////////////////////////////////////////////////////////////
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  // The single complex value is handed on as an array of two floats.
  float *reals = reinterpret_cast<float *>(&c);
  GlobalSumVector(reals,2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
  // N complex values are handed on as an array of 2*N floats.
  float *reals = reinterpret_cast<float *>(c);
  GlobalSumVector(reals,2*N);
 }
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  // The single complex value is handed on as an array of two doubles.
  double *reals = reinterpret_cast<double *>(&c);
  GlobalSumVector(reals,2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  // N complex values are handed on as an array of 2*N doubles.
  double *reals = reinterpret_cast<double *>(c);
  GlobalSumVector(reals,2*N);
 }
 }
@@ -0,0 +1,207 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_base.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_COMMUNICATOR_BASE_H
 #define GRID_COMMUNICATOR_BASE_H
 ///////////////////////////////////
 // Processor layout information
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>
 namespace Grid {
 class CartesianCommunicator : public SharedMemory {
 public:    
  ////////////////////////////////////////////
  // Policies
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  // Replaces the communicator policy shared by all communicators (static member).
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy )
  {
    CommunicatorPolicy = policy;
  }
  static int       nCommThreads;
  ////////////////////////////////////////////
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Processor-grid extent in each dimension
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // coordinate of this rank in the processor grid (one entry per dimension)
  unsigned long    _ndimension;      // number of dimensions of the processor grid
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
  ////////////////////////////////////////////////
  // Must call in Grid startup
  ////////////////////////////////////////////////
  static void Init(int *argc, char ***argv);
  ////////////////////////////////////////////////
  // Constructors to sub-divide a parent communicator
  // and default to comm world
  ////////////////////////////////////////////////
  CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
  CartesianCommunicator(const std::vector<int> &pdimensions_in);
  virtual ~CartesianCommunicator();
 private:
  ////////////////////////////////////////////////
  // Private initialise from an MPI communicator
  // Can use after an MPI_Comm_split, but hidden from user so private
  ////////////////////////////////////////////////
  void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
 public:
  ////////////////////////////////////////////////////////////////////////////////////////
  // Wraps MPI_Cart routines, or implements equivalent on other impls
  ////////////////////////////////////////////////////////////////////////////////////////
  void ShiftedRanks(int dim,int shift,int & source, int & dest);
  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
  int                      Dimensions(void)        ;
  int                      IsBoss(void)            ;
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
  void GlobalSumVector(ComplexD *c,int N);
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
  // Generic global sum: views the object as a flat array of its scalar_type
  // and passes that array to GlobalSumVector.
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    scalar_type * first = (scalar_type *)& o;
    const int nscalar   = sizeof(obj)/sizeof(scalar_type);
    GlobalSumVector(first,nscalar);
  }
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
 		      int recv_from_rank,
 		      int bytes);
  void SendRecvPacket(void *xmit,
 		      void *recv,
 		      int xmit_to_rank,
 		      int recv_from_rank,
 		      int bytes);
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			   void *xmit,
 			   int xmit_to_rank,
 			   void *recv,
 			   int recv_from_rank,
 			   int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
 			       int recv_from_rank,
 			       int bytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,
 				    void *recv,
 				    int recv_from_rank,
 				    int bytes,int dir);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);
  ////////////////////////////////////////////////////////////
  // Barrier
  ////////////////////////////////////////////////////////////
  void Barrier(void);
  ////////////////////////////////////////////////////////////
  // Broadcast a buffer and composite larger
  ////////////////////////////////////////////////////////////
  void Broadcast(int root,void* data, int bytes);
  ////////////////////////////////////////////////////////////
  // All2All down one dimension
  ////////////////////////////////////////////////////////////
  // Typed all-to-all along processor dimension `dim`.
  //   in / out : equally sized buffers whose length must be a multiple of the
  //              number of processors in `dim`
  // The element count per processor and the element size in bytes are
  // forwarded to the untyped AllToAll overload.
  template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
    assert(dim>=0);
    assert(dim<_ndimension);
    assert(in.size()==out.size());
    int numnode = _processors[dim];
    uint64_t bytes=sizeof(T);
    uint64_t words=in.size()/numnode;
    assert(numnode * words == in.size());
    assert(words < (1ULL<<31));
    // data() is well defined for empty vectors, whereas &in[0] is not
    AllToAll(dim,(void *)in.data(),(void *)out.data(),words,bytes);
  }
  void AllToAll(int dim  ,void *in,void *out,uint64_t words,uint64_t bytes);
  void AllToAll(void  *in,void *out,uint64_t words         ,uint64_t bytes);
  template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };
 }; 
 }
 #endif
@@ -0,0 +1,514 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/communicator/SharedMemory.h>
 namespace Grid {
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 ////////////////////////////////////////////
 // First initialise of comms system
 ////////////////////////////////////////////
 // One-time start-up of the comms layer: initialises MPI unless it is already
 // initialised, duplicates MPI_COMM_WORLD into communicator_world and sets up
 // the global shared memory.
 void CartesianCommunicator::Init(int *argc, char ***argv) 
 {
  int already_initialised;
  int provided;
  MPI_Initialized(&already_initialised); // needed to coexist with other libs apparently
  if ( !already_initialised ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    // With one comms thread any threading mode other than SINGLE will do;
    // with several comms threads MULTIPLE is required.
    const bool one_thread_unsupported  = (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE);
    const bool many_threads_unsupported = (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE);
    if ( one_thread_unsupported || many_threads_unsupported )
      assert(0);
  }
  Grid_quiesce_nodes();
  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  GlobalSharedMemory::Init(communicator_world);
  GlobalSharedMemory::SharedMemoryAllocate(GlobalSharedMemory::MAX_MPI_SHM_BYTES,
                                           GlobalSharedMemory::Hugepages);
 }
 ///////////////////////////////////////////////////////////////////////////
 // Use cartesian communicators now even in MPI3
 ///////////////////////////////////////////////////////////////////////////
 // Wraps MPI_Cart_shift on this grid's communicator; `source` and `dest` are outputs.
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  const int status = MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(status==0);
 }
 // Wraps MPI_Cart_rank: returns the rank that sits at processor coordinate `coor`.
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int found_rank;
  const int status = MPI_Cart_rank(communicator, coor.data(), &found_rank);
  assert(status==0);
  return found_rank;
 }
 // Wraps MPI_Cart_coords: resizes `coor` to _ndimension and fills it with the
 // processor coordinate of `rank`.
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  coor.resize(_ndimension);
  const int status = MPI_Cart_coords(communicator, rank, _ndimension, coor.data());
  assert(status==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Initialises from communicator_world
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
 {
  // The shared memory optimising remap creates a temporary communicator;
  // it must be freed once the set-up below is finished.
  MPI_Comm remapped_comm;
  GlobalSharedMemory::OptimalCommunicator(processors,remapped_comm);

  InitFromMPICommunicator(processors,remapped_comm);
  SetCommunicator(remapped_comm);

  // Free the temp communicator
  MPI_Comm_free(&remapped_comm);
 }
 //////////////////////////////////
 // Try to subdivide communicator
 //////////////////////////////////
 // Builds a communicator over the `processors` grid by sub-dividing `parent`.
 //   processors : processor grid of the new (child) communicator; it may have
 //                more dimensions than the parent, never fewer
 //   parent     : communicator to split
 //   srank      : output; index of the split this rank belongs to, 0 when the
 //                parent is not divided (Nchild == 1)
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
 {
  _ndimension = processors.size();
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  // Parent geometry padded up to the child's dimensionality: the extra leading
  // dimensions get coordinate 0 and extent 1.
  std::vector<int> parent_processor_coor(_ndimension,0);
  std::vector<int> parent_processors    (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
  //  std::cout << " Parent size  "<<Nparent <<std::endl;
  int childsize=1;
  for(int d=0;d<processors.size();d++) {
    childsize *= processors[d];
  }
  // Nchild is the number of child communicators the parent is divided into
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);
  // NOTE(review): only the total rank count is checked for divisibility; nothing
  // here asserts that parent_processors[d] is a multiple of processors[d] in
  // each dimension - confirm callers guarantee it.
  //  std::cout << " child size  "<<childsize <<std::endl;
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // number of splits of the parent in each dimension
  for(int d=0;d<_ndimension;d++){
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  int crank;  
  // Mpi uses the reverse Lexico convention to us; so reversed routines called
  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
  MPI_Comm comm_split;
  if ( Nchild > 1 ) { 
    // Debug printout, disabled by if(0).
    // NOTE(review): if this block is ever enabled, its last line prints comm_split
    // before MPI_Comm_split below has assigned it.
    if(0){
      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
      std::cout<<std::endl;
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      // Declare victory
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
 		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
      std::cout << " Split communicator " <<comm_split <<std::endl;
    }
    ////////////////////////////////////////////////////////////////
    // Split the communicator
    ////////////////////////////////////////////////////////////////
    // srank is the colour (which child communicator), crank the key (ordering within it)
    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);
  } else {
    // No division requested: the child is a duplicate of the parent communicator
    srank = 0;
    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
    assert(ierr==0);
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Set up from the new split communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  InitFromMPICommunicator(processors,comm_split);
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take the right SHM buffers
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  SetCommunicator(comm_split);
  ///////////////////////////////////////////////
  // Free the temp communicator 
  ///////////////////////////////////////////////
  MPI_Comm_free(&comm_split);
  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
    for(int d=0;d<processors.size();d++){
      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
    }
  }
  // The coordinate assigned by InitFromMPICommunicator must agree with the one computed above
  for(int d=0;d<processors.size();d++){
    assert(_processor_coor[d] == ccoor[d] );
  }
 }
 void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 {
  ////////////////////////////////////////////////////////////////////
  // Builds `communicator` as a Cartesian topology over communicator_base,
  // records this rank and its coordinates, then fills communicator_halo
  // with 2*_ndimension duplicates of the new communicator.
  ////////////////////////////////////////////////////////////////////
  _ndimension = processors.size();
  _processors = processors;
  _processor_coor.resize(_ndimension);

  // Product of the grid extents = number of ranks requested
  _Nprocessors = 1;
  for(int dim=0; dim<_ndimension; dim++){
    _Nprocessors = _Nprocessors * _processors[dim];
  }

  // Every dimension is periodic; the reorder argument is 0
  std::vector<int> wrap(_ndimension,1);
  MPI_Cart_create(communicator_base, _ndimension, &_processors[0], &wrap[0], 0, &communicator);
  MPI_Comm_rank  (communicator, &_processor);
  MPI_Cart_coords(communicator, _processor, _ndimension, &_processor_coor[0]);

  // Debug trace; permanently disabled by the leading 0
  if ( 0 && (communicator_base != communicator_world) ) {
    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
    for(int dim=0; dim<_processors.size(); dim++){
      std::cout << _processor_coor[dim]<<" ";
    }
    std::cout << std::endl;
  }

  int Size;
  MPI_Comm_size(communicator,&Size);

  const int nhalo = 2*_ndimension;
  communicator_halo.resize(nhalo);
  for(int h=0; h<nhalo; h++){
    MPI_Comm_dup(communicator,&communicator_halo[h]);
  }

  // The Cartesian communicator must contain exactly the requested number of ranks
  assert(Size==_Nprocessors);
 }
 CartesianCommunicator::~CartesianCommunicator()
 {
  // Only release the communicators while MPI is still live and a
  // communicator handle is actually held.
  int finalised;
  MPI_Finalized(&finalised);
  if (!communicator || finalised) return;

  MPI_Comm_free(&communicator);
  for (auto &halo : communicator_halo) {
    MPI_Comm_free(&halo);
  }
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  // In-place MPI_SUM reduction of u over `communicator`; every rank receives the result
  const int status = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
  // In-place MPI_SUM reduction of u over `communicator`; every rank receives the result
  const int status = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_SUM, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
  // In-place bitwise-XOR (MPI_BXOR) reduction of u over `communicator`
  const int status = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_BXOR, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
  // In-place bitwise-XOR (MPI_BXOR) reduction of u over `communicator`
  const int status = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_BXOR, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  // In-place MPI_SUM reduction of a single float over `communicator`
  const int status = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_SUM, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  // Element-wise in-place MPI_SUM reduction of the N floats at f over `communicator`
  const int status = MPI_Allreduce(MPI_IN_PLACE, f, N, MPI_FLOAT, MPI_SUM, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  // In-place MPI_SUM reduction of a single double over `communicator`
  const int status = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_SUM, communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  // Element-wise in-place MPI_SUM reduction of the N doubles at d over `communicator`
  const int status = MPI_Allreduce(MPI_IN_PLACE, d, N, MPI_DOUBLE, MPI_SUM, communicator);
  assert(status==0);
 }
 // Basic Halo comms primitive: start the exchange, then wait for it to finish
 void CartesianCommunicator::SendToRecvFrom(void *xmit, int dest,
                                            void *recv, int from,
                                            int bytes)
 {
  std::vector<CommsRequest_t> pending;
  //    unsigned long  xcrc = crc32(0L, Z_NULL, 0);
  //    unsigned long  rcrc = crc32(0L, Z_NULL, 0);
  //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  SendToRecvFromBegin(pending, xmit, dest, recv, from, bytes);
  SendToRecvFromComplete(pending);
  //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		      recv,bytes,MPI_CHAR,from, from,
 		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
 }
 // Blocking stencil exchange: Begin followed by Complete.
 // Returns the byte count reported by StencilSendToRecvFromBegin.
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int dest,
                                                      void *recv, int from,
                                                      int bytes,int dir)
 {
  std::vector<CommsRequest_t> pending;
  const double off_node = StencilSendToRecvFromBegin(pending, xmit, dest, recv, from, bytes, dir);
  StencilSendToRecvFromComplete(pending, dir);
  return off_node;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,
 							 void *recv,
 							 int from,
 							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size(); 
  int commdir=dir%ncomm;
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];
  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }
  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
 }
 // Waits on the outstanding stencil requests; `dir` is not used here.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,
                                                           int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 // Barrier over the shared-memory communicator ShmComm only
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier(ShmComm);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
 }
 // Barrier over all ranks of `communicator`
 void CartesianCommunicator::Barrier(void)
 {
  const int status = MPI_Barrier(communicator);
  assert(status==0);
 }
 // Broadcasts `bytes` raw bytes at `data` from rank `root` over `communicator`
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  const int status = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator);
  assert(status==0);
 }
 // Rank of the calling process in communicator_world
 int CartesianCommunicator::RankWorld(void){
  int world_rank;
  MPI_Comm_rank(communicator_world, &world_rank);
  return world_rank;
 }
 // Broadcasts `bytes` raw bytes at `data` from rank `root` over communicator_world
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  const int status = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator_world);
  assert(status==0);
 }
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  std::vector<int> row(_ndimension,1);
  assert(dim>=0 && dim<_ndimension);
  //  Split the communicator
  row[dim] = _processors[dim];
  int me;
  CartesianCommunicator Comm(row,*this,me);
  Comm.AllToAll(in,out,words,bytes);
 }
 // All-to-all over `communicator`: each rank sends `words` objects of `bytes`
 // bytes to every rank and receives the same amount from each into `out`.
 //   in, out : send and receive buffers
 //   words   : number of objects exchanged with each rank
 //   bytes   : size of one object in bytes
 // Both counts must fit in a non-negative int (asserted), because MPI takes
 // "int" arguments.
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  // MPI is a pain and uses "int" arguments
  // 64*64*64*128*16 == 500Million elements of data.
  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 int overflow.
  // (Turns up on 32^3 x 64 Gparity too)
  // A contiguous datatype of `bytes` bytes keeps the per-rank count small.
  MPI_Datatype object;
  int iwords = (int)words;
  int ibytes = (int)bytes;
  // A plain `words == iwords` converts the int back to uint64_t, so a value that
  // truncates to a negative int sign-extends and compares equal. Require a
  // non-negative int as well.
  assert(iwords >= 0 && (uint64_t)iwords == words); // safe to cast to int ?
  assert(ibytes >= 0 && (uint64_t)ibytes == bytes); // safe to cast to int ?
  int ierr;
  ierr = MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
  assert(ierr==0);
  ierr = MPI_Type_commit(&object);
  assert(ierr==0);
  ierr = MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
  assert(ierr==0);
  ierr = MPI_Type_free(&object);
  assert(ierr==0);
 }
 }
@@ -0,0 +1,165 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_none.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <cstring>
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 // Start-up for the no-comms build: initialises the global shared-memory state
 // from communicator_world and allocates the shared region. Neither argument is used.
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
  GlobalSharedMemory::Init(communicator_world);

  const uint64_t shm_bytes = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
  const int      shm_flags = GlobalSharedMemory::Hugepages;
  GlobalSharedMemory::SharedMemoryAllocate(shm_bytes, shm_flags);
 }
 // "Split" constructor for the no-comms build: delegates to the plain constructor,
 // reports split rank 0 and (re)attaches communicator_world. `parent` is not used.
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
                                              const CartesianCommunicator &parent,
                                              int &srank)
  : CartesianCommunicator(processors)
 {
  srank = 0;
  SetCommunicator(communicator_world);
 }
 // Single-process communicator: one processor, rank 0, at the origin of the grid.
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _ndimension  = processors.size();
  _processors  = processors;
  _Nprocessors = 1;
  _processor   = 0;

  // This rank sits at coordinate 0 in every dimension
  _processor_coor.assign(_ndimension, 0);

  // Require 1^N processor grid for fake
  for(int dim=0; dim<_ndimension; ++dim) {
    assert(_processors[dim]==1);
  }

  SetCommunicator(communicator_world);
 }
 // Nothing to release in the no-comms build
 CartesianCommunicator::~CartesianCommunicator()
 {
 }
 // Reductions in the no-comms build are empty: the argument is left untouched.
 void CartesianCommunicator::GlobalSum(float &)
 {
 }
 void CartesianCommunicator::GlobalSumVector(float *,int N)
 {
 }
 void CartesianCommunicator::GlobalSum(double &)
 {
 }
 void CartesianCommunicator::GlobalSum(uint32_t &)
 {
 }
 void CartesianCommunicator::GlobalSum(uint64_t &)
 {
 }
 void CartesianCommunicator::GlobalSumVector(double *,int N)
 {
 }
 void CartesianCommunicator::GlobalXOR(uint32_t &)
 {
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &)
 {
 }
 // Not supported in the no-comms build: always trips the assertion
 void CartesianCommunicator::SendRecvPacket(void *xmit, void *recv,
                                            int xmit_to_rank, int recv_from_rank,
                                            int bytes)
 {
  assert(0);
 }
 // Basic Halo comms primitive -- should never call in single node; always asserts
 void CartesianCommunicator::SendToRecvFrom(void *xmit, int dest,
                                            void *recv, int from,
                                            int bytes)
 {
  assert(0);
 }
 // Not supported in the no-comms build: always trips the assertion
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                 void *xmit, int dest,
                                                 void *recv, int from,
                                                 int bytes)
 {
  assert(0);
 }
 // Not supported in the no-comms build: always trips the assertion
 void CartesianCommunicator::SendToRecvFromComplete(
     std::vector<CommsRequest_t> &list)
 {
  assert(0);
 }
 // Single-process all-to-all along `dim`: with one rank the exchange reduces to
 // copying words*bytes bytes from `in` to `out`. `dim` is not used.
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  // memmove is the standard replacement for the legacy bcopy; like bcopy it
  // tolerates overlapping buffers. Note the swapped (dest, src) argument order.
  memmove(out,in,bytes*words);
 }
 // Single-process all-to-all: with one rank the exchange reduces to copying
 // words*bytes bytes from `in` to `out`.
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  // memmove is the standard replacement for the legacy bcopy; like bcopy it
  // tolerates overlapping buffers. Note the swapped (dest, src) argument order.
  memmove(out,in,bytes*words);
 }
 // Single-process stubs: the only rank is 0 and collectives have nothing to do.
 int  CartesianCommunicator::RankWorld(void)
 {
  return 0;
 }
 void CartesianCommunicator::Barrier(void)
 {
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
 }
 // Every coordinate maps to rank 0
 int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  return 0;
 }
 // Whatever rank is asked for, the answer is this process's own coordinate
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  coor = _processor_coor;
 }
 // With a single rank, both neighbours under any shift are rank 0
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  source = 0;
  dest   = 0;
 }
 // Stencil exchange for the no-comms build: forwards to the plain Begin/Complete
 // pair (ignoring `dir`) and reports 2*bytes.
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank,
                                                      void *recv, int recv_from_rank,
                                                      int bytes, int dir)
 {
  // Discard the "dir"
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin   (pending, xmit, xmit_to_rank, recv, recv_from_rank, bytes);
  SendToRecvFromComplete(pending);
  return 2.0*bytes;
 }
 // Begin half of the stencil exchange for the no-comms build: forwards to
 // SendToRecvFromBegin (ignoring `dir`) and reports 2*bytes.
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit, int xmit_to_rank,
                                                          void *recv, int recv_from_rank,
                                                          int bytes, int dir)
 {
  // Discard the "dir"
  SendToRecvFromBegin(list, xmit, xmit_to_rank, recv, recv_from_rank, bytes);
  const double reported = 2.0*bytes;
  return reported;
 }
 // Complete half of the stencil exchange; `dir` is not used.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,
                                                           int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 // Nothing to synchronise with a single process
 void CartesianCommunicator::StencilBarrier(void)
 {
 }
 }
@@ -0,0 +1,92 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid { 
 // static data
 // Default size of the shared-memory region: 1024^3 bytes
 uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; 
 // Passed as the `flags` argument of SharedMemoryAllocate by CartesianCommunicator::Init
 int                 GlobalSharedMemory::Hugepages = 0;
 // Init asserts this is 0 on entry and sets it to 1 when it finishes
 int                 GlobalSharedMemory::_ShmSetup;
 // Allocation flag and size; SharedMemoryFree asserts both are non-zero and resets them to 0
 int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;
 // Per-shm-rank buffers; SharedMemoryFree unmaps entries [0, WorldShmSize)
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
 // Communicator, rank and size of the shared-memory group this process belongs to
 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
 int                 GlobalSharedMemory::WorldShmSize;
 // Indexed by world rank: rank within the shm group, or MPI_UNDEFINED if not in it
 std::vector<int>    GlobalSharedMemory::WorldShmRanks;
 // The communicator passed to Init, with its size and this process's rank
 Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
 int                 GlobalSharedMemory::WorldSize;
 int                 GlobalSharedMemory::WorldRank;
 // WorldNodes = WorldSize/WorldShmSize; WorldNode = index of this process's shm group
 int                 GlobalSharedMemory::WorldNodes;
 int                 GlobalSharedMemory::WorldNode;
 // Unmaps every shared buffer of this node's group and marks the region as
 // unallocated. Must only be called while an allocation exists (asserted).
 void GlobalSharedMemory::SharedMemoryFree(void)
 {
  assert(_ShmAlloc);
  assert(_ShmAllocBytes>0);

  int shm_rank = 0;
  while (shm_rank < WorldShmSize) {
    munmap(WorldShmCommBufs[shm_rank], _ShmAllocBytes);
    ++shm_rank;
  }

  _ShmAllocBytes = 0;
  _ShmAlloc      = 0;
 }
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 // Bump allocator over the shared heap: returns the current heap top and advances
 // it by `bytes`. No alignment rounding is applied. If the running total reaches
 // heap_size a diagnostic is printed and the assertion fails.
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  void *block = (void *)heap_top;
  heap_top   = heap_top   + bytes;
  heap_bytes = heap_bytes + bytes;

  const bool fits = (heap_bytes < heap_size);
  if (!fits) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
    assert(heap_bytes<heap_size);
  }
  return block;
 }
 // Releases every ShmBufferMalloc allocation at once by rewinding the heap
 // to the start of this rank's own buffer.
 void SharedMemory::ShmBufferFreeAll(void) {
  heap_bytes = 0;
  heap_top   = (size_t)ShmBufferSelf();
 }
 // This rank's own entry in ShmCommBufs
 void *SharedMemory::ShmBufferSelf(void)
 {
  void *own_buffer = ShmCommBufs[ShmRank];
  return own_buffer;
 }
 }
@@ -0,0 +1,165 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 // TODO
 // 1) move includes into SharedMemory.cc
 //
 // 2) split shared memory into a) optimal communicator creation from comm world
 // 
 //                             b) shared memory buffers container
 //                                -- static globally shared; init once
 //                                -- per instance set of buffers.
 //                                   
 #pragma once 
 #include <Grid/GridCore.h>
 #if defined (GRID_COMMS_MPI3) 
 #include <mpi.h>
 #endif 
 #include <semaphore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/types.h>
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
 namespace Grid {
 #if defined (GRID_COMMS_MPI3) 
  typedef MPI_Comm    Grid_MPI_Comm;
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
  typedef int Grid_MPI_Comm;
 #endif
 // Process-wide (all-static) state describing the world communicator, its
 // decomposition into shared-memory groups, and the shared buffers allocated on it.
 class GlobalSharedMemory {
 private:
  // Upper bound handed to Log2Size when taking log2 of the ranks per node
  static const int     MAXLOG2RANKSPERNODE = 16;            
  // Init once lock on the buffer allocation
  static int      _ShmSetup;      // set to 1 at the end of Init
  static int      _ShmAlloc;      // reset to 0 by SharedMemoryFree
  static uint64_t _ShmAllocBytes; // size passed to munmap per buffer in SharedMemoryFree
 public:
  // Read-only accessors for the private state above
  static int      ShmSetup(void)      { return _ShmSetup; }
  static int      ShmAlloc(void)      { return _ShmAlloc; }
  static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
  static uint64_t      MAX_MPI_SHM_BYTES; // requested size of the shared region
  static int           Hugepages;         // passed as `flags` to SharedMemoryAllocate
  static std::vector<void *> WorldShmCommBufs; // one buffer per rank of the shm group
  static Grid_MPI_Comm WorldComm;    // communicator given to Init
  static int           WorldRank;    // this process's rank in WorldComm
  static int           WorldSize;    // number of ranks in WorldComm
  static Grid_MPI_Comm WorldShmComm; // shared-memory split of WorldComm
  static int           WorldShmRank; // this process's rank in WorldShmComm
  static int           WorldShmSize; // number of ranks in WorldShmComm
  static int           WorldNodes;   // WorldSize / WorldShmSize
  static int           WorldNode;    // index of this process's shm group
  static std::vector<int>  WorldShmRanks; // world rank -> shm rank, MPI_UNDEFINED if not in the group
  //////////////////////////////////////////////////////////////////////////////////////
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void); // unmaps every buffer in WorldShmCommBufs
 };
 //////////////////////////////
 // one per communicator
 //////////////////////////////
 // Per-communicator view of the shared-memory buffers, plus a simple bump
 // allocator (ShmBufferMalloc / ShmBufferFreeAll) over this rank's own buffer.
 class SharedMemory 
 {
 private:
  static const int     MAXLOG2RANKSPERNODE = 16;            
  size_t heap_top;   // address returned by the next ShmBufferMalloc
  size_t heap_bytes; // bytes handed out since the last ShmBufferFreeAll
  size_t heap_size;  // capacity that heap_bytes is checked against
 protected:
  Grid_MPI_Comm    ShmComm; // for barriers
  int    ShmRank; 
  int    ShmSize;
  std::vector<void *> ShmCommBufs; // indexed by shm rank; ShmBufferSelf returns entry ShmRank
  std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
 public:
  // NOTE(review): the constructor initialises no members; presumably
  // SetCommunicator fills them in before use -- confirm.
  SharedMemory() {};
  ~SharedMemory();
  ///////////////////////////////////////////////////////////////////////////////////////
  // set the buffers & sizes
  ///////////////////////////////////////////////////////////////////////////////////////
  void SetCommunicator(Grid_MPI_Comm comm);
  ////////////////////////////////////////////////////////////////////////
  // For this instance ; disjoint buffer sets between splits if split grid
  ////////////////////////////////////////////////////////////////////////
  void ShmBarrier(void); 
  ///////////////////////////////////////////////////
  // Call on any instance
  ///////////////////////////////////////////////////
  void SharedMemoryTest(void);
  void *ShmBufferSelf(void);           // this rank's own buffer
  void *ShmBuffer    (int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes); // bump-allocate `bytes` from the heap
  void  ShmBufferFreeAll(void) ;       // rewind the heap to the start of the own buffer
  //////////////////////////////////////////////////////////////////////////
  // Make info on Nodes & ranks and Shared memory available
  //////////////////////////////////////////////////////////////////////////
  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
  int RankCount(void) { return GlobalSharedMemory::WorldSize;};
 };
 }
@@ -0,0 +1,651 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <pwd.h>
 namespace Grid { 
 /*Construct from an MPI communicator*/
 // One-time initialisation of the global shared-memory description.
 //   comm : the world communicator (typically MPI_COMM_WORLD)
 // On return WorldComm/WorldRank/WorldSize describe `comm`,
 // WorldShmComm/WorldShmRank/WorldShmSize describe this process's shared-memory
 // group, WorldShmRanks maps each world rank to its shm rank (MPI_UNDEFINED if
 // it is not in this group), WorldNodes is the number of groups and WorldNode is
 // the index of this process's group. Asserts that Init has not run before and
 // that WorldSize is a multiple of WorldShmSize.
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
  assert(_ShmSetup==0);
  WorldComm = comm;
  MPI_Comm_rank(WorldComm,&WorldRank);
  MPI_Comm_size(WorldComm,&WorldSize);
  // WorldComm, WorldSize, WorldRank
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
  // WorldShmComm, WorldShmSize, WorldShmRank
  // WorldNodes
  WorldNodes = WorldSize/WorldShmSize;
  assert( (WorldNodes * WorldShmSize) == WorldSize );
  // FIXME: Check all WorldShmSize are the same ?
  /////////////////////////////////////////////////////////////////////
  // find world ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (WorldComm, &WorldGroup); 
  MPI_Comm_group (WorldShmComm, &ShmGroup);
  std::vector<int> world_ranks(WorldSize);   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  WorldShmRanks.resize(WorldSize); 
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]); 
  // The group handles are only needed for the translation above; release them
  // so they are not leaked.
  MPI_Group_free(&WorldGroup);
  MPI_Group_free(&ShmGroup);
  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and nominate the leader
  ///////////////////////////////////////////////////////////////////
  int g=0;
  std::vector<int> MyGroup;
  MyGroup.resize(WorldShmSize);
  for(int rank=0;rank<WorldSize;rank++){
    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
      assert(g<WorldShmSize);
      MyGroup[g++] = rank;
    }
  }
  // The leader is the lowest world rank in the group
  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  int myleader = MyGroup[0];
  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(WorldNodes,0);
  leaders_1hot [ myleader ] = 1;
  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
  assert(ierr==0);
  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      // More leaders than WorldNodes would mean the groups are not all the
      // same size; fail loudly instead of writing past the end of the vector.
      assert(group<WorldNodes);
      leaders_group[group++] = l;
    }
  }
  ///////////////////////////////////////////////////////////////////
  // Identify the node of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  WorldNode=-1;
  for(int node=0;node<WorldNodes;node++){
    if (myleader == leaders_group[node]){
      WorldNode=node;
    }
  }
  assert(WorldNode!=-1);
  _ShmSetup=1;
 }
 // Gray encode support
 // Returns binary XOR (binary >> 1), i.e. the reflected Gray code of `binary`.
 int BinaryToGray (int  binary) {
  return binary ^ (binary >> 1);
 }
 // Returns i in [0, MAXLOG2] such that 2^i == TwoToPower, or -1 when TwoToPower
 // is not a power of two in that range.
 int Log2Size(int TwoToPower,int MAXLOG2)
 {
  for(int exponent=0; exponent<=MAXLOG2; ++exponent){
    if ( TwoToPower == (0x1<<exponent) ) {
      return exponent;
    }
  }
  return -1;
 }
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
 #ifdef HYPERCUBE
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify the hypercube coordinate of this node using hostname
  ////////////////////////////////////////////////////////////////
  // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits
  // i runs 0..7                                    3 bits
  // r runs 0..3                                    2 bits
  // 2^10 = 1024 nodes
  const int maxhdim = 10; 
  std::vector<int> HyperCubeCoords(maxhdim,0);
  std::vector<int> RootHyperCubeCoords(maxhdim,0);
  int R;
  int I;
  int N;
  const int namelen = _POSIX_HOST_NAME_MAX;
  char name[namelen];
  // Parse ICE-XA hostname to get hypercube location
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
  assert(nscan==3);
  int nlo = N%9;
  int nhi = N/9;
  uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
  uint32_t rootcoor  = hypercoor;
  //////////////////////////////////////////////////////////////////
  // Print debug info
  //////////////////////////////////////////////////////////////////
  for(int d=0;d<maxhdim;d++){
    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
  }
  std::string hname(name);
  std::cout << "hostname "<<hname<<std::endl;
  std::cout << "R " << R << " I " << I << " N "<< N
            << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
  //////////////////////////////////////////////////////////////////
  // broadcast node 0's base coordinate for this partition.
  //////////////////////////////////////////////////////////////////
  MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
  hypercoor=hypercoor-rootcoor;
  assert(hypercoor<WorldSize);
  assert(hypercoor>=0);
  //////////////////////////////////////
  // Printing
  //////////////////////////////////////
  for(int d=0;d<maxhdim;d++){
    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
  }
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  std::vector<int> processor_coor(ndimension);
  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
  std::vector<int> HyperCoor(ndimension);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Map Hcube according to physical lattice 
  // must partition. Loop over dims and find out who would join.
  ////////////////////////////////////////////////////////////////
  int hcoor = hypercoor;
  for(int d=0;d<ndimension;d++){
     int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
     int msk  = (0x1<<bits)-1;
     HyperCoor[d]=hcoor & msk;  
     HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
     hcoor = hcoor >> bits;
  } 
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  int Nprocessors=1;
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  assert(WorldSize==Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  int rank;
  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
  for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
  /////////////////////////////////////////////////////////////////
  // Build the new communicator
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 #else 
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  std::vector<int> processor_coor(ndimension);
  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  int Nprocessors=1;
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  assert(WorldSize==Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  int rank;
  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
  /////////////////////////////////////////////////////////////////
  // Build the new communicator
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 #endif
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // SHMGET
 ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMGET
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  // System V shared memory variant.
  //  bytes : size of each segment that is created.
  //  flags : not read by this implementation; huge pages are requested through
  //          the Hugepages variable instead. NOTE(review): confirm that callers
  //          always pass the same value as Hugepages.
  // Rank 0 of WorldShmComm creates one segment per local rank, the segment ids
  // are broadcast over WorldShmComm and every local rank attaches all of them.
  std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  std::vector<int> shmids(WorldShmSize);
  if ( WorldShmRank == 0 ) {
    for(int r=0;r<WorldShmSize;r++){
      size_t size = bytes;
      key_t key   = IPC_PRIVATE;
      // Named shm_flags so that it no longer shadows the `flags` parameter.
      int shm_flags = IPC_CREAT | SHM_R | SHM_W;
 #ifdef SHM_HUGETLB
      if (Hugepages) shm_flags|=SHM_HUGETLB;
 #endif
      if ((shmids[r]= shmget(key,size, shm_flags)) ==-1) {
        int errsv = errno;
        printf("Errno %d\n",errsv);
        printf("key   %d\n",key);
        printf("size  %zu\n",size); // size_t requires %zu, not %lld
        printf("flags %d\n",shm_flags);
        errno = errsv;              // printf may have modified errno; restore it for perror
        perror("shmget");
        exit(1);
      }
    }
  }
  MPI_Barrier(WorldShmComm);
  MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
  MPI_Barrier(WorldShmComm);
  for(int r=0;r<WorldShmSize;r++){
    WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
    if (WorldShmCommBufs[r] == (uint64_t *)-1) {
      perror("Shared memory attach failure");
      shmctl(shmids[r], IPC_RMID, NULL);
      exit(2);
    }
  }
  MPI_Barrier(WorldShmComm);
  ///////////////////////////////////
  // Mark for clean up
  ///////////////////////////////////
  // Every local rank issues IPC_RMID for every segment; the return value is
  // deliberately not checked.
  for(int r=0;r<WorldShmSize;r++){
    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
  }
  MPI_Barrier(WorldShmComm);
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 }
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  // File-backed mmap variant.
  //  bytes : length of each mapping.
  //  flags : when non-zero (and MAP_HUGETLB is defined) MAP_HUGETLB is requested.
  // Every local rank opens and maps one file per rank of WorldShmComm under
  // GRID_SHM_PATH, storing the mapped addresses in WorldShmCommBufs.
  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hugetlbfs and others map filesystems as mappable huge pages
  ////////////////////////////////////////////////////////////////////////////////////////////
  char shm_name [NAME_MAX];
  for(int r=0;r<WorldShmSize;r++){
    // Bounded formatting: GRID_SHM_PATH is a build-time string of unknown length.
    int len = snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
    if ( len < 0 || len >= (int)sizeof(shm_name) ) {
      printf("shared memory file name is too long\n");
      exit(1);
    }
    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
    if ( fd == -1) { 
      printf("open %s failed\n",shm_name);
      perror("open hugetlbfs");
      exit(1); // failure must not be reported as a successful exit status
    }
    int mmap_flag = MAP_SHARED ;
 #ifdef MAP_POPULATE    
    mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
    if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
    if ( ptr == (void *)MAP_FAILED ) {    
      printf("mmap %s failed\n",shm_name);
      perror("failed mmap");      assert(0);    
    }
    // The mapping must start on a 64-byte boundary.
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 }
 #endif // MMAP
 #ifdef GRID_MPI3_SHM_NONE
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  // Anonymous mmap variant; asserts that there is exactly one rank per
  // shared-memory group (WorldShmSize == 1).
  //  bytes : length of the mapping.
  //  flags : when non-zero (and MAP_HUGETLB is defined) MAP_HUGETLB is requested.
  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Anonymous mapping: there is no backing file, so no name and no descriptor
  ////////////////////////////////////////////////////////////////////////////////////////////
  assert(WorldShmSize == 1);
  for(int r=0;r<WorldShmSize;r++){
    int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
 #ifdef MAP_POPULATE    
    mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
    if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,-1, 0); 
    if ( ptr == (void *)MAP_FAILED ) {    
      // The previous message printed an uninitialised name buffer here.
      printf("mmap anonymous failed\n");
      perror("failed mmap");      assert(0);    
    }
    // The mapping must start on a 64-byte boundary.
    assert(((uint64_t)ptr&0x3F)==0);
    WorldShmCommBufs[r] =ptr;
    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 }
 #endif // MMAP
 #ifdef GRID_MPI3_SHMOPEN
 ////////////////////////////////////////////////////////////////////////////////////////////
 // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
 // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
 // the posix shm virtual file system
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
  // POSIX shm_open variant.
  //  bytes : size of each shared object / mapping.
  //  flags : when non-zero (and MAP_HUGETLB is defined) rank 0 requests MAP_HUGETLB.
  // Rank 0 of WorldShmComm creates, sizes and maps one object per local rank;
  // after a barrier the remaining ranks open and map the same objects by name.
  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  char shm_name [NAME_MAX];
  if ( WorldShmRank == 0 ) {
    for(int r=0;r<WorldShmSize;r++){
      size_t size = bytes;
      struct passwd *pw = getpwuid (getuid());
      // getpwuid returns NULL when no matching entry exists or on error.
      if ( pw == NULL ) {	perror("failed getpwuid");	assert(0);      }
      snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
      // Remove any object of the same name before creating a fresh one.
      shm_unlink(shm_name);
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      // An object that could not be sized must not be mapped and used.
      if ( ftruncate(fd, size) != 0 ) {	perror("failed ftruncate");	assert(0);      }
      int mmap_flag = MAP_SHARED;
 #ifdef MAP_POPULATE 
      mmap_flag |= MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
      if (flags) mmap_flag |= MAP_HUGETLB;
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
        perror("failed mmap");     
        assert(0);    
      }
      // The mapping must start on a 64-byte boundary.
      assert(((uint64_t)ptr&0x3F)==0);
      WorldShmCommBufs[r] =ptr;
      close(fd);
    }
  }
  MPI_Barrier(WorldShmComm);
  if ( WorldShmRank != 0 ) { 
    for(int r=0;r<WorldShmSize;r++){
      size_t size = bytes ;
      struct passwd *pw = getpwuid (getuid());
      if ( pw == NULL ) {	perror("failed getpwuid");	assert(0);      }
      snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
      int fd=shm_open(shm_name,O_RDWR,0666);
      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);
      WorldShmCommBufs[r] =ptr;
      close(fd);
    }
  }
  _ShmAlloc=1;
  _ShmAllocBytes = bytes;
 }
 #endif
  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
  // Binds this object to `comm`: derives the shared-memory sub-communicator,
  // selects the matching globally allocated buffers, and records for every
  // rank of `comm` its rank inside the shared-memory group.
  // Requires GlobalSharedMemory::SharedMemoryAllocate to have run (asserted).
  int rank, size;
  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);
  ShmRanks.resize(size);
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  ShmCommBufs.resize(ShmSize);
  //////////////////////////////////////////////////////////////////////
  // Map ShmRank to WorldShmRank and use the right buffer
  //////////////////////////////////////////////////////////////////////
  assert (GlobalSharedMemory::ShmAlloc()==1);
  heap_size = GlobalSharedMemory::ShmAllocBytes();
  for(int r=0;r<ShmSize;r++){
    // Only rank r contributes a non-zero value, so the sum over ShmComm
    // delivers rank r's WorldShmRank to every member of the group.
    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
  }
  ShmBufferFreeAll();
  /////////////////////////////////////////////////////////////////////
  // find comm ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group FullGroup, ShmGroup;
  MPI_Comm_group (comm   , &FullGroup); 
  MPI_Comm_group (ShmComm, &ShmGroup);
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  // Ranks of `comm` that are not in ShmGroup translate to MPI_UNDEFINED.
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
  // The group handles are only needed for the translation; release them so
  // that they are not leaked on every call.
  MPI_Group_free(&FullGroup);
  MPI_Group_free(&ShmGroup);
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
 //////////////////////////////////////////////////////////////////
 void SharedMemory::ShmBarrier(void)
 {
  // Barrier across the ranks of the shared-memory communicator ShmComm.
  MPI_Barrier  (ShmComm);
 }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Test the shared memory is working
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 void SharedMemory::SharedMemoryTest(void)
 {
  // Self-check: rank 0 of the shared-memory group writes three 64-bit words at
  // the start of every buffer; after a barrier every rank asserts that it reads
  // the same words back through its own ShmCommBufs entries.
  ShmBarrier();
  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
      uint64_t * check = (uint64_t *) ShmCommBufs[r];
      check[0] = GlobalSharedMemory::WorldNode; // word 0: node index
      check[1] = r;                             // word 1: index of the buffer within the group
      check[2] = 0x5A5A5A;                      // word 2: fixed marker pattern
    }
  }
  // Writes must be complete before any rank reads them.
  ShmBarrier();
  for(int r=0;r<ShmSize;r++){
    uint64_t * check = (uint64_t *) ShmCommBufs[r];
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==0x5A5A5A);
  }
  // Hold all ranks until every rank has finished checking.
  ShmBarrier();
 }
 void *SharedMemory::ShmBuffer(int rank)
 {
  // Returns the buffer belonging to communicator rank `rank`, or NULL when
  // ShmRanks holds MPI_UNDEFINED for that rank.
  const int local_peer = ShmRanks[rank];
  if (local_peer == MPI_UNDEFINED) return NULL;
  return ShmCommBufs[local_peer];
 }
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  // Translates an address inside this rank's own buffer into the address at
  // the same byte offset inside the buffer of communicator rank `rank`.
  //  rank    : rank in the communicator passed to SetCommunicator.
  //  local_p : address to translate; the offset is taken relative to
  //            ShmCommBufs[ShmRank].
  // Returns NULL when ShmRanks holds MPI_UNDEFINED for `rank`.
  int gpeer = ShmRanks[rank];
  assert(gpeer!=ShmRank); // never send to self
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  }
  uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
  uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
  return (void *) remote;
 }
 SharedMemory::~SharedMemory()
 {
  // Release the shared-memory communicator, but only while MPI is still
  // usable: nothing is freed once MPI_Finalized reports true.
  int already_finalised = 0;
  MPI_Finalized(&already_finalised);
  if ( already_finalised ) return;
  MPI_Comm_free(&ShmComm);
 };
 }
@@ -0,0 +1,128 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid { 
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
  // Sets up the global state for a single rank on a single node.
  // The `comm` argument is not read by this implementation.
  // May only be called once: asserts that _ShmSetup is still 0.
  assert(_ShmSetup==0);
  WorldComm = 0;
  WorldRank = 0;
  WorldSize = 1;
  WorldShmComm = 0 ;
  WorldShmRank = 0 ;
  WorldShmSize = 1 ;
  WorldNodes   = 1 ;
  WorldNode    = 0 ;
  // One entry per world rank (WorldSize is 1 here).
  WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
  // Room for the single buffer pointer filled in by SharedMemoryAllocate.
  WorldShmCommBufs.resize(1);
  _ShmSetup=1;
 }
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
  // No rank reordering in this implementation: `processors` is not read and
  // the output communicator is simply WorldComm.
  optimal_comm = WorldComm;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  int mmap_flag =0;
 #ifdef MAP_ANONYMOUS
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
 #endif
 #ifdef MAP_ANON
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
 #endif
 #ifdef MAP_HUGETLB
  if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
  ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);  
  }
 #ifdef MADV_HUGEPAGE
  if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
 #endif
  bzero(ShmCommBuf,bytes);
  WorldShmCommBufs[0] = ShmCommBuf;
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 };
  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
  // Single-rank setup: `comm` is not read. Requires the global buffer to have
  // been allocated already (asserted).
  assert(GlobalSharedMemory::ShmAlloc()==1);
  // This rank is the only member of its shared-memory group.
  ShmRank = 0;
  ShmSize = 1;
  ShmRanks.resize(1);
  ShmRanks[0] = 0;
  //////////////////////////////////////////////////////////////////////
  // Use the one globally allocated buffer and its size as the heap
  //////////////////////////////////////////////////////////////////////
  ShmCommBufs.resize(1);
  ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
  heap_size = GlobalSharedMemory::ShmAllocBytes();
  ShmBufferFreeAll();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
 //////////////////////////////////////////////////////////////////
 // No-op in this implementation (SetCommunicator fixes ShmSize at 1).
 void SharedMemory::ShmBarrier(void){ return ; }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Test the shared memory is working
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // No-op in this implementation: no checks are performed.
 void SharedMemory::SharedMemoryTest(void) { return; }
 void *SharedMemory::ShmBuffer(int rank)
 {
  // Always NULL in this implementation; `rank` is not read.
  return NULL;
 }
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  // Always NULL in this implementation; neither argument is read.
  return NULL;
 }
 // Empty destructor: this implementation releases nothing here.
 SharedMemory::~SharedMemory()
 {};
 }
@@ -0,0 +1,52 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Cshift.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_
 #include <Grid/cshift/Cshift_common.h>
 #ifdef GRID_COMMS_NONE
 #include <Grid/cshift/Cshift_none.h>
 #endif
 #ifdef GRID_COMMS_MPI
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_MPI3
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
@@ -0,0 +1,391 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cshift/Cshift_common.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_COMMON_H_
 #define _GRID_CSHIFT_COMMON_H_
 namespace Grid {
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
 Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  // Copies the sites of one plane of `rhs` into `buffer`.
  //  dimension : dimension whose slice tables (_ostride, _slice_*) are used.
  //  plane     : plane index; multiplied by _ostride[dimension] to get the base offset.
  //  cbmask    : bit mask tested against 1<<CheckerBoardFromOindex(...); forced
  //              to 0x3 (take every site) when `dimension` is not checkerboarded.
  //  off       : offset added to every destination index in `buffer`.
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask = 0x3;
  }
  int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
  int ent = 0;
  // (buffer index, lattice index) pairs. The table is static, so its storage
  // is reused by later calls of the same instantiation.
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
  if ( cbmask == 0x3 ) { 
    for(int n=0;n<e1;n++){
      // Both offsets depend only on n, so compute them once per block.
      int o  = n*stride;
      int bo = n*e2;
      for(int b=0;b<e2;b++){
        table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
      }
    }
  } else { 
    // Selected sites are packed contiguously: bo advances only on a match.
    int bo=0;
    for(int n=0;n<e1;n++){
      int o  = n*stride;
      for(int b=0;b<e2;b++){
        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
        if ( ocb &cbmask ) {
          table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
        }
      }
    }
  }
  parallel_for(int i=0;i<ent;i++){
    buffer[table[i].first]=rhs._odata[table[i].second];
  }
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
 Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  // Visits the sites of one plane of `rhs` and hands each one to
  // extract<vobj>(site, pointers, offset) with offset = b + n*e2.
  //  pointers : passed on to extract unchanged; presumably one destination
  //             buffer per SIMD lane -- TODO confirm against extract().
  //  cbmask   : bit mask tested against 1<<CheckerBoardFromOindex(...); forced
  //             to 0x3 (take every site) when `dimension` is not checkerboarded.
  // NOTE(review): `rd` is assigned but not used in this function.
  int rd = rhs._grid->_rdimensions[dimension];
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask = 0x3;
  }
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int n1=rhs._grid->_slice_stride[dimension];
  if ( cbmask ==0x3){
    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	vobj temp =rhs._odata[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      }
    }
  } else { 
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
    // Test_cshift_red_black code.
    std::cout << " Dense packed buffer WARNING " <<std::endl;
    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o=n*n1;
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	// offset is the same as in the unmasked branch, so skipped sites leave gaps.
 	int offset = b+n*e2;
 	if ( ocb & cbmask ) {
 	  vobj temp =rhs._odata[so+o+b];
 	  extract<vobj>(temp,pointers,offset);
 	}
      }
    }
  }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
 template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  // Copies entries of `buffer` into the sites of one plane of `rhs`.
  //  dimension : dimension whose slice tables (_ostride, _slice_*) are used.
  //  plane     : plane index; multiplied by _ostride[dimension] to get the base offset.
  //  cbmask    : bit mask tested against 1<<CheckerBoardFromOindex(...); forced
  //              to 0x3 (write every site) when `dimension` is not checkerboarded.
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask=0x3;
  }
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
  // (lattice index, buffer index) pairs. The table is static, so its storage
  // is reused by later calls of the same instantiation.
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
  int ent    =0;
  if ( cbmask ==0x3 ) {
    for(int n=0;n<e1;n++){
      // Use the cached stride / block size rather than re-reading the grid.
      int o   =n*stride;
      int bo  =n*e2;
      for(int b=0;b<e2;b++){
        table[ent++] = std::pair<int,int>(so+o+b,bo+b);
      }
    }
  } else { 
    // The buffer is read contiguously: bo advances only on a match.
    int bo=0;
    for(int n=0;n<e1;n++){
      int o   =n*stride;
      for(int b=0;b<e2;b++){
        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
        if ( ocb & cbmask ) {
          table[ent++]=std::pair<int,int> (so+o+b,bo++);
        }
      }
    }
  }
  parallel_for(int i=0;i<ent;i++){
    rhs._odata[table[i].first]=buffer[table[i].second];
  }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
 template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  // Visits the sites of one plane of `rhs` and fills each one through
  // merge(site, pointers, offset) with offset = b + n*_slice_block[dimension].
  //  pointers : passed on to merge unchanged; presumably one source buffer per
  //             SIMD lane -- TODO confirm against merge().
  //  cbmask   : bit mask tested against 1<<CheckerBoardFromOindex(...); forced
  //             to 0x3 (write every site) when `dimension` is not checkerboarded.
  // NOTE(review): `rd` is assigned but not used in this function.
  int rd = rhs._grid->_rdimensions[dimension];
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask=0x3;
  }
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  if(cbmask ==0x3 ) {
    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
 	int offset = b+n*rhs._grid->_slice_block[dimension];
 	merge(rhs._odata[so+o+b],pointers,offset);
      }
    }
  } else { 
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
    // Test_cshift_red_black code.
    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
    // This branch uses plain serial loops.
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
 	int offset = b+n*rhs._grid->_slice_block[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	if ( ocb&cbmask ) {
 	  merge(rhs._odata[so+o+b],pointers,offset);
 	}
      }
    }
  }
 }
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
 template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
  // Copies plane `rplane` of `rhs` onto plane `lplane` of `lhs`.
  //  dimension : dimension whose slice tables (_ostride, _slice_*) are used.
  //  cbmask    : bit mask tested against 1<<CheckerBoardFromOindex(...); forced
  //              to 0x3 (copy every site) when `dimension` is not checkerboarded.
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask=0x3;
  }
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
  int stride = rhs._grid->_slice_stride[dimension];
  // (destination index, source index) pairs. The table is static, so its
  // storage is reused by later calls of the same instantiation.
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
  int ent=0;
  // The mask is constant for the whole call, so test it once up front.
  const bool take_all = (cbmask == 0x3);
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o =n*stride+b;
      if ( take_all || ((1<<lhs._grid->CheckerBoardFromOindex(o))&cbmask) ) {
        table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
  }
  parallel_for(int i=0;i<ent;i++){
    lhs._odata[table[i].first]=rhs._odata[table[i].second];
  }
 }
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
  // Same site selection as Copy_plane, but each selected site is transferred
  // with permute(dest, src, permute_type) instead of a plain assignment.
  //  dimension    : dimension whose slice tables (_ostride, _slice_*) are used.
  //  cbmask       : bit mask tested against 1<<CheckerBoardFromOindex(...);
  //                 forced to 0x3 when `dimension` is not checkerboarded.
  //  permute_type : forwarded unchanged to permute().
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask=0x3;
  }
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block [dimension];
  int stride = rhs._grid->_slice_stride[dimension];
  // (destination index, source index) pairs. The table is static, so its
  // storage is reused by later calls of the same instantiation.
  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
  int ent=0;
  if ( cbmask == 0x3 ) {
    for(int n=0;n<e1;n++){
      int o  =n*stride;
      for(int b=0;b<e2;b++){
        table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
      }
    }
  } else {
    for(int n=0;n<e1;n++){
      int o  =n*stride;
      for(int b=0;b<e2;b++){
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
        if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
      }
    }
  }
  parallel_for(int i=0;i<ent;i++){
    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
  }
 }
 //////////////////////////////////////////////////////
 // Local to node Cshift
 //////////////////////////////////////////////////////
 template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
  // Node-local circular shift of `rhs` by `shift` along `dimension` into `ret`.
  // Asks the grid for the shift that applies to Even and to Odd sites; when
  // they agree one pass with mask 0x3 is enough, otherwise the work is split
  // into one pass per mask.
  int sshift[2];
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
  if ( sshift[0] == sshift[1] ) {
    Cshift_local(ret,rhs,dimension,shift,0x3);
  } else {
    Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
 }
 template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  // One pass of the node-local shift, restricted to the sites selected by
  // `cbmask`. For every plane x of `ret` along `dimension` the source plane sx
  // of `rhs` is computed and copied, with or without a permute.
  // NOTE(review): `ld`, `gd`, `o` and `bo` are assigned but not used below.
  GridBase *grid = rhs._grid;
  int fd = grid->_fdimensions[dimension];
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int gd = grid->_gdimensions[dimension];
  int ly = grid->_simd_layout[dimension];
  // Map to always positive shift modulo global full dimension.
  // NOTE(review): a shift below -fd still leaves a negative remainder here.
  shift = (shift+fd)%fd;
  // the permute type
  ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
  int permute_dim =grid->PermuteDim(dimension);
  int permute_type=grid->PermuteType(dimension);
  // Assigned only inside the permute_dim branch below; it is read only when
  // permute_slice is non-zero, which can only happen inside that same branch.
  int permute_type_dist;
  for(int x=0;x<rd;x++){       
    int o   = 0;
    int bo  = x * grid->_ostride[dimension];
    // A mask of 0x2 selects the Odd shift; 0x1 and 0x3 both use the Even one.
    int cb= (cbmask==0x2)? Odd : Even;
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
    // Source plane for destination plane x, wrapped into [0,rd).
    int sx     = (x+sshift)%rd;
    // wrap is whether sshift > rd.
    //  num is sshift mod rd.
    // 
    //  shift 7
    //
    //  XoXo YcYc 
    //  oXoX cYcY
    //  XoXo YcYc
    //  oXoX cYcY
    //
    //  sshift -- 
    //
    //  XX YY ; 3
    //  XX YY ; 0
    //  XX YY ; 3
    //  XX YY ; 0
    //
    int permute_slice=0;
    if(permute_dim){
      int wrap = sshift/rd; wrap=wrap % ly;
      int  num = sshift%rd;
      // Planes below rd-num use `wrap`; the remaining planes use one more, modulo ly.
      if ( x< rd-num ) permute_slice=wrap;
      else permute_slice = (wrap+1)%ly;
      if ( (ly>2) && (permute_slice) ) {
 	assert(permute_type & RotateBit);
 	// The slice count is OR-ed into the permute type.
 	permute_type_dist = permute_type|permute_slice;
      } else {
 	permute_type_dist = permute_type;
      }
    }
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
  }
 }
 }
 #endif
@@ -0,0 +1,262 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cshift/Cshift_mpi.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_
 namespace Grid { 
 //////////////////////////////////////////////////////////////////////////////
 // Circular shift of rhs by `shift` sites along `dimension` (MPI build).
 // Returns a new lattice on the same grid. Dispatches to:
 //   Cshift_local      - dimension is not split across processors
 //   Cshift_comms_simd - dimension is split across processors and SIMD lanes
 //   Cshift_comms      - dimension is split across processors only
 //////////////////////////////////////////////////////////////////////////////
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
  Lattice<vobj> ret(rhs._grid);

  int fd = rhs._grid->_fdimensions[dimension];

  // Map to always positive shift modulo global full dimension.
  // The inner % keeps the result in [0,fd) for any shift, including
  // shift < -fd, so the shift>=0 asserts in the comms routines hold.
  shift = ((shift%fd)+fd)%fd;

  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);

  int comm_dim        = rhs._grid->_processors[dimension] >1 ;
  int splice_dim      = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);

  if ( !comm_dim ) {
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
    Cshift_comms(ret,rhs,dimension,shift);
  }
  return ret;
 }
 // Checkerboard dispatcher for the non-SIMD comms shift: a single pass over
 // both parities when they share the same effective shift, otherwise one pass
 // per parity.
 template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
  const int even_shift = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  const int odd_shift  = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

  if ( even_shift != odd_shift ) {
    // Unfavourable checkerboard: take two passes, both with block stride loop iteration.
    Cshift_comms(ret,rhs,dimension,shift,0x1);
    Cshift_comms(ret,rhs,dimension,shift,0x2);
    return;
  }

  // Both parities move by the same amount: a single pass covers them.
  Cshift_comms(ret,rhs,dimension,shift,0x3);
 }
 // Checkerboard dispatcher for the SIMD-spliced comms shift: a single pass over
 // both parities when they share the same effective shift, otherwise one pass
 // per parity.
 template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
  const int even_shift = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  const int odd_shift  = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

  if ( even_shift != odd_shift ) {
    // Unfavourable checkerboard: take two passes, both with block stride loop iteration.
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);
    return;
  }

  // Both parities move by the same amount: a single pass covers them.
  Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
 }
 //////////////////////////////////////////////////////////////////////////////
 // Comms shift for a dimension split across processors but not across SIMD
 // lanes. For every reduced-dimension plane x of ret the source plane is
 // sx=(x+sshift)%rd; it is copied locally when it lives on this node
 // (comm_proc==0), otherwise gathered into send_buf, exchanged with the
 // ranks returned by ShiftedRanks and scattered into ret.
 //
 // Preconditions (asserted): simd_layout==1, more than one processor in this
 // dimension, and 0 <= shift < fd.
 //////////////////////////////////////////////////////////////////////////////
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid=rhs._grid;

  int fd              = rhs._grid->_fdimensions[dimension];
  int rd              = rhs._grid->_rdimensions[dimension];
  int pd              = rhs._grid->_processors[dimension];
  int simd_layout     = rhs._grid->_simd_layout[dimension];
  int comm_dim        = rhs._grid->_processors[dimension] >1 ;
  assert(simd_layout==1);
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);

  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
  commVector<vobj> send_buf(buffer_size);
  commVector<vobj> recv_buf(buffer_size);

  // Message size is the same for every plane: the full buffer for a
  // both-parity pass, half of it for a single-parity pass.
  int words = send_buf.size();
  if (cbmask != 0x3) words=words>>1;
  int bytes = words * sizeof(vobj);

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);

  for(int x=0;x<rd;x++){

    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd; // processor offset of the source plane

    if (comm_proc==0) {

      Copy_plane(ret,rhs,dimension,x,sx,cbmask);

    } else {

      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);

      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

      grid->SendToRecvFrom((void *)&send_buf[0],
                           xmit_to_rank,
                           (void *)&recv_buf[0],
                           recv_from_rank,
                           bytes);
      grid->Barrier();

      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
    }
  }
 }
 //////////////////////////////////////////////////////////////////////////////
 // Comms shift for a dimension that is split both across processors and
 // across SIMD lanes. Each source plane is extracted into one scalar buffer
 // per SIMD lane; for every destination lane the code works out which lane
 // and which processor holds its data, exchanges buffers when that processor
 // is not this one, and finally merges the per-lane buffers into ret.
 //
 // Preconditions (asserted): more than one processor in this dimension,
 // simd_layout==2 for this dimension, and 0 <= shift < fd.
 //////////////////////////////////////////////////////////////////////////////
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid=rhs._grid;
  const int Nsimd = grid->Nsimd();
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  int fd = grid->_fdimensions[dimension];
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int pd = grid->_processors[dimension];
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;
  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  int permute_type=grid->PermuteType(dimension);
  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  // NOTE(review): words is computed but not used anywhere in this function.
  int words = sizeof(vobj)/sizeof(vector_type);
  // One scalar-object buffer per SIMD lane, for sending and for receiving.
  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
  // Every exchange sends a full buffer, independent of cbmask.
  int bytes = buffer_size*sizeof(scalar_object);
  std::vector<scalar_object *>  pointers(Nsimd); // 
  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
  ///////////////////////////////////////////
  // Work out what to send where
  ///////////////////////////////////////////
  int cb    = (cbmask==0x2)? Odd : Even;
  int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
  // loop over outer coord planes orthog to dim
  for(int x=0;x<rd;x++){       
    // FIXME call local permute copy if none are offnode.
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
    int sx   = (x+sshift)%rd;
    // Split source plane sx of rhs into the per-lane send buffers.
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    for(int i=0;i<Nsimd;i++){
      // Bit of the lane index that selects the inner coordinate in this dimension.
      int inner_bit = (Nsimd>>(permute_type+1));
      int ic= (i&inner_bit)? 1:0;
      // Local coordinate of lane i in plane x, and the coordinate it reads from.
      int my_coor          = rd*ic + x;
      int nbr_coor         = my_coor+sshift;
      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
      // Source lane: same as i except for the inner-coordinate bit.
      int nbr_lane = (i&(~inner_bit));
      int recv_from_rank;
      int xmit_to_rank;
      if (nbr_ic) nbr_lane|=inner_bit;
      // The gathered plane sx must be the plane the peer coordinate lives in.
      assert (sx == nbr_ox);
      if(nbr_proc){
 	// Source lane lives on another processor: exchange buffers and merge from the received copy.
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 	grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
 			     xmit_to_rank,
 			     (void *)&recv_buf_extract[i][0],
 			     recv_from_rank,
 			     bytes);
 	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	// Source lane is on this processor: merge straight from the local send buffer.
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }
    }
    // Recombine the per-lane buffers into plane x of ret.
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
  }
 }
 }
 #endif
@@ -0,0 +1,39 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cshift/Cshift_none.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_NONE_H_
 #define _GRID_CSHIFT_NONE_H_
 namespace Grid {
 // Circular shift for builds without communications: the whole lattice is
 // node-local, so the work is delegated to Cshift_local.
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
  Lattice<vobj> shifted(rhs._grid);

  // Parity of the result follows from the source parity and the shift.
  shifted.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);

  Cshift_local(shifted,rhs,dimension,shift);
  return shifted;
 }
 }
 #endif
@@ -0,0 +1,33 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Lattice.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H
 #include <Grid/lattice/Lattice_base.h>
 #endif
@@ -0,0 +1,466 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/lattice/Lattice_ET.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H
 #include <iostream>
 #include <tuple>
 #include <typeinfo>
 #include <vector>
 namespace Grid {
 ////////////////////////////////////////////////////
 // Predicated where support
 ////////////////////////////////////////////////////
 // Lane-wise select: returns a vobj whose SIMD lane s holds the value from
 // iftrue where the predicate's lane s is non-zero, and the value from iffalse
 // otherwise. Implemented by extracting all three arguments into per-lane
 // scalars, overwriting the "false" lanes in place and merging the result.
 template <class iobj, class vobj, class robj>
 inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
                            const robj &iffalse) {
  typedef typename vobj::scalar_object scalar_object;

  const int Nsimd = vobj::vector_type::Nsimd();

  std::vector<Integer> mask(Nsimd);
  std::vector<scalar_object> truevals(Nsimd);
  std::vector<scalar_object> falsevals(Nsimd);

  extract(iftrue, truevals);
  extract(iffalse, falsevals);
  extract<vInteger, Integer>(TensorRemove(predicate), mask);

  // falsevals doubles as the output buffer: replace the lanes the mask selects.
  for (int s = 0; s < Nsimd; s++) {
    if (mask[s]) falsevals[s] = truevals[s];
  }

  // vobj may be deduced as const-qualified; strip that so merge can write into it.
  typename std::remove_const<vobj>::type ret;
  merge(ret, falsevals);
  return ret;
 }
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
 // Antonin's Lat Sim but the repack to variadic with popped
 // from tuple is hideous; C++14 introduces std::make_index_sequence for this
 ////////////////////////////////////////////
 // leaf eval of lattice ; should enable if protect using traits
 // Trait: true when T derives from LatticeBase (i.e. T is a lattice leaf).
 template <typename T>
 using is_lattice = std::is_base_of<LatticeBase, T>;
 // Trait: true when T derives from LatticeExpressionBase (i.e. T is an
 // expression-template node). Declared once; a second identical alias
 // declaration used to follow here and was redundant.
 template <typename T>
 using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
 //Specialization of getVectorType for lattices:
 // getVectorType<Lattice<T>>::type is the lattice's own vector_object typedef.
 template<typename T>
 struct getVectorType<Lattice<T> >{
  typedef typename Lattice<T>::vector_object type;
 };
 // Leaf eval for anything that is not a Lattice or an expression node:
 // the argument is returned by value unchanged; the site index ss is ignored.
 template<class sobj>
 inline sobj eval(const unsigned int ss, const sobj &arg)
 {
  return arg;
 }
 // Leaf eval for a Lattice: returns a const reference to element ss of the
 // lattice's _odata storage (no copy is made).
 template <class lobj>
 inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
  return arg._odata[ss];
 }
 // handle nodes in syntax tree
 // Unary node: expr.first is the operation object, expr.second the operand
 // tuple. Evaluates the operand at site ss (recursively) and applies func to it.
 template <typename Op, typename T1>
 auto inline eval(
    const unsigned int ss,
    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)));
 }
 // Binary node: evaluates both operands at site ss (recursively) and applies
 // the node's func to the pair, left operand first.
 template <typename Op, typename T1, typename T2>
 auto inline eval(
    const unsigned int ss,
    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
                                eval(ss, std::get<1>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)));
 }
 // Trinary node: evaluates all three operands at site ss (recursively) and
 // applies the node's func to them in tuple order.
 template <typename Op, typename T1, typename T2, typename T3>
 auto inline eval(const unsigned int ss,
                 const LatticeTrinaryExpression<Op, T1, T2, T3>
                     &expr)  // eval three operands
    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
                                eval(ss, std::get<1>(expr.second)),
                                eval(ss, std::get<2>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)),
                         eval(ss, std::get<2>(expr.second)));
 }
 //////////////////////////////////////////////////////////////////////////
 // Obtain the grid from an expression, ensuring conformable. This must follow a
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
 // Lattice leaf: records this lattice's grid in `grid`. If a grid was already
 // recorded by an earlier leaf, the two are first checked with conformable().
 template <class T1,
          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
 {
  if (grid != nullptr) conformable(grid, lat._grid);

  grid = lat._grid;
 }
 // Non-lattice leaf (selected when is_lattice<T1> is false): it carries no
 // grid, so `grid` is deliberately left untouched.
 template <class T1,
          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void GridFromExpression(GridBase *&grid,
                               const T1 &notlat)  // non-lattice leaf
 {}
 // Unary node: the grid comes from the single operand.
 template <typename Op, typename T1>
 inline void GridFromExpression(GridBase *&grid,
                               const LatticeUnaryExpression<Op, T1> &expr) {
  const auto &operands = expr.second;
  GridFromExpression(grid, std::get<0>(operands));  // recurse
 }
 // Binary node: visit the left operand, then the right, so that every lattice
 // leaf below this node is checked against the same grid.
 template <typename Op, typename T1, typename T2>
 inline void GridFromExpression(
    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
  const auto &operands = expr.second;
  GridFromExpression(grid, std::get<0>(operands));  // recurse
  GridFromExpression(grid, std::get<1>(operands));
 }
 // Trinary node: visit the three operands in tuple order.
 template <typename Op, typename T1, typename T2, typename T3>
 inline void GridFromExpression(
    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
  const auto &operands = expr.second;
  GridFromExpression(grid, std::get<0>(operands));  // recurse
  GridFromExpression(grid, std::get<1>(operands));
  GridFromExpression(grid, std::get<2>(operands));
 }
 //////////////////////////////////////////////////////////////////////////
 // Obtain the CB from an expression, ensuring conformable. This must follow a
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
 // Lattice leaf: records this lattice's checkerboard in `cb`. If cb already
 // holds Odd or Even from an earlier leaf, this leaf must agree with it.
 template <class T1,
          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
  const bool cb_already_set = (cb == Odd) || (cb == Even);
  if (cb_already_set) {
    assert(cb == lat.checkerboard);
  }
  cb = lat.checkerboard;
  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
 // Non-lattice leaf (selected when is_lattice<T1> is false): it carries no
 // checkerboard, so `cb` is deliberately left untouched.
 template <class T1,
          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
 {
  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 // Unary node: the checkerboard comes from the single operand.
 template <typename Op, typename T1>
 inline void CBFromExpression(int &cb,
                             const LatticeUnaryExpression<Op, T1> &expr) {
  const auto &operands = expr.second;
  CBFromExpression(cb, std::get<0>(operands));  // recurse
  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }
 // Binary node: visit the left operand, then the right, so that every lattice
 // leaf below this node is checked against the same checkerboard.
 template <typename Op, typename T1, typename T2>
 inline void CBFromExpression(int &cb,
                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
  const auto &operands = expr.second;
  CBFromExpression(cb, std::get<0>(operands));  // recurse
  CBFromExpression(cb, std::get<1>(operands));
  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 // Trinary node: visit the three operands in tuple order.
 template <typename Op, typename T1, typename T2, typename T3>
 inline void CBFromExpression(
    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
  const auto &operands = expr.second;
  CBFromExpression(cb, std::get<0>(operands));  // recurse
  CBFromExpression(cb, std::get<1>(operands));
  CBFromExpression(cb, std::get<2>(operands));
  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
 // GridUnopClass(name, ret) defines a class template `name<arg>` with a single
 // static member func(const arg a) that returns the expression `ret`, which is
 // written in terms of the parameter `a`. The return type is deduced from the
 // expression with decltype.
 // NOTE(review): the macro body already ends in ';', so the ';' written after
 // each use below is an extra empty declaration.
 #define GridUnopClass(name, ret)                                          \
  template <class arg>                                                    \
  struct name {                                                           \
    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
  };
 // One functor per supported unary operation.
 GridUnopClass(UnarySub, -a);
 GridUnopClass(UnaryNot, Not(a));
 GridUnopClass(UnaryAdj, adj(a));
 GridUnopClass(UnaryConj, conjugate(a));
 GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
 GridUnopClass(UnaryReal, real(a));
 GridUnopClass(UnaryImag, imag(a));
 GridUnopClass(UnaryToReal, toReal(a));
 GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
 GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
 GridUnopClass(UnaryAbs, abs(a));
 GridUnopClass(UnarySqrt, sqrt(a));
 GridUnopClass(UnaryRsqrt, rsqrt(a));
 GridUnopClass(UnarySin, sin(a));
 GridUnopClass(UnaryCos, cos(a));
 GridUnopClass(UnaryAsin, asin(a));
 GridUnopClass(UnaryAcos, acos(a));
 GridUnopClass(UnaryLog, log(a));
 GridUnopClass(UnaryExp, exp(a));
 ////////////////////////////////////////////
 // Binary operators
 ////////////////////////////////////////////
 // GridBinOpClass(name, combination) defines a class template
 // `name<left, right>` with a single static member func(lhs, rhs) that returns
 // the expression `combination`, written in terms of `lhs` and `rhs`. The
 // return type is the const-qualified decltype of that expression. Unlike
 // GridUnopClass, the macro body has no trailing ';' - each use supplies it.
 #define GridBinOpClass(name, combination)                      \
  template <class left, class right>                           \
  struct name {                                                \
    static auto inline func(const left &lhs, const right &rhs) \
        -> decltype(combination) const {                       \
      return combination;                                      \
    }                                                          \
  }
 // One functor per supported binary operator.
 GridBinOpClass(BinaryAdd, lhs + rhs);
 GridBinOpClass(BinarySub, lhs - rhs);
 GridBinOpClass(BinaryMul, lhs *rhs);
 GridBinOpClass(BinaryDiv, lhs /rhs);
 GridBinOpClass(BinaryAnd, lhs &rhs);
 GridBinOpClass(BinaryOr, lhs | rhs);
 GridBinOpClass(BinaryAndAnd, lhs &&rhs);
 GridBinOpClass(BinaryOrOr, lhs || rhs);
 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
 // GridTrinOpClass(name, combination) defines a class template
 // `name<predicate, left, right>` with a single static member
 // func(pred, lhs, rhs) that returns the expression `combination`, written in
 // terms of `pred`, `lhs` and `rhs`.
 #define GridTrinOpClass(name, combination)                                     \
  template <class predicate, class left, class right>                          \
  struct name {                                                                \
    static auto inline func(const predicate &pred, const left &lhs,            \
                            const right &rhs) -> decltype(combination) const { \
      return combination;                                                      \
    }                                                                          \
  }
 // TrinaryWhere forwards to predicatedWhere, with any reference stripped from
 // the left/right template arguments before they are passed on.
 GridTrinOpClass(
    TrinaryWhere,
    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
                     typename std::remove_reference<right>::type>(pred, lhs,
                                                                  rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
 // Helpers that instantiate an operation functor on the per-site types of the
 // operands, i.e. on the types eval(0, operand) yields. They rely on variables
 // named `arg`, `lhs`, `rhs` and `pred` being in scope at the point of use.
 #define GRID_UNOP(name) name<decltype(eval(0, arg))>
 #define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_TRINOP(name) \
  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 // GRID_DEF_UNOP(op, name) defines a function `op(arg)` that is enabled only
 // when the argument type is a lattice or a lattice expression. It performs no
 // arithmetic: it returns a LatticeUnaryExpression node holding a
 // default-constructed functor and a tuple with a reference to the argument.
 #define GRID_DEF_UNOP(op, name)                                             \
  template <typename T1,                                                    \
            typename std::enable_if<is_lattice<T1>::value ||                \
                                        is_lattice_expr<T1>::value,         \
                                    T1>::type * = nullptr>                  \
  inline auto op(const T1 &arg)                                             \
      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
  }
 // GRID_BINOP_LEFT(op, name) defines `op(lhs, rhs)` for the case where the LEFT
 // operand is a lattice or lattice expression; the right operand may be any
 // type. Returns a LatticeBinaryExpression node holding references to both
 // operands.
 #define GRID_BINOP_LEFT(op, name)                                             \
  template <typename T1, typename T2,                                         \
            typename std::enable_if<is_lattice<T1>::value ||                  \
                                        is_lattice_expr<T1>::value,           \
                                    T1>::type * = nullptr>                    \
  inline auto op(const T1 &lhs, const T2 &rhs)                                \
      ->decltype(                                                             \
          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
              std::make_pair(GRID_BINOP(name)(),                              \
                             std::forward_as_tuple(lhs, rhs)))) {             \
    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
 // GRID_BINOP_RIGHT(op, name) defines `op(lhs, rhs)` for the complementary
 // case: the left operand is NOT a lattice or lattice expression and the right
 // operand is. The two enable_if conditions make this overload and the
 // GRID_BINOP_LEFT one mutually exclusive.
 #define GRID_BINOP_RIGHT(op, name)                                            \
  template <typename T1, typename T2,                                         \
            typename std::enable_if<!is_lattice<T1>::value &&                 \
                                        !is_lattice_expr<T1>::value,          \
                                    T1>::type * = nullptr,                    \
            typename std::enable_if<is_lattice<T2>::value ||                  \
                                        is_lattice_expr<T2>::value,           \
                                    T2>::type * = nullptr>                    \
  inline auto op(const T1 &lhs, const T2 &rhs)                                \
      ->decltype(                                                             \
          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
              std::make_pair(GRID_BINOP(name)(),                              \
                             std::forward_as_tuple(lhs, rhs)))) {             \
    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
 // GRID_DEF_BINOP(op, name) emits both overloads of a binary operator: the
 // lattice-on-the-left form and the lattice-only-on-the-right form.
 #define GRID_DEF_BINOP(op, name) \
  GRID_BINOP_LEFT(op, name);     \
  GRID_BINOP_RIGHT(op, name);
 // GRID_DEF_TRINOP(op, name) defines `op(pred, lhs, rhs)` returning a
 // LatticeTrinaryExpression node holding references to the three operands.
 // NOTE(review): unlike the unary/binary forms there is no enable_if guard on
 // the template parameters here; the only constraint is that the trailing
 // return type must be well-formed.
 #define GRID_DEF_TRINOP(op, name)                                              \
  template <typename T1, typename T2, typename T3>                             \
  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
      ->decltype(                                                              \
          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
                                   const T3 &>(std::make_pair(                 \
              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
                                    const T3 &>(std::make_pair(                \
        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
  }
 ////////////////////////
 // Operator definitions
 ////////////////////////
 // Unary operators and functions. Each line binds a callable name to the
 // functor that implements it; operator! and Not share UnaryNot.
 GRID_DEF_UNOP(operator-, UnarySub);
 GRID_DEF_UNOP(Not, UnaryNot);
 GRID_DEF_UNOP(operator!, UnaryNot);
 GRID_DEF_UNOP(adj, UnaryAdj);
 GRID_DEF_UNOP(conjugate, UnaryConj);
 GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
 GRID_DEF_UNOP(real, UnaryReal);
 GRID_DEF_UNOP(imag, UnaryImag);
 GRID_DEF_UNOP(toReal, UnaryToReal);
 GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
 GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
 GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
                               // abs-fabs-dabs-labs thing
 GRID_DEF_UNOP(sqrt, UnarySqrt);
 GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
 GRID_DEF_UNOP(sin, UnarySin);
 GRID_DEF_UNOP(cos, UnaryCos);
 GRID_DEF_UNOP(asin, UnaryAsin);
 GRID_DEF_UNOP(acos, UnaryAcos);
 GRID_DEF_UNOP(log, UnaryLog);
 GRID_DEF_UNOP(exp, UnaryExp);
 // Binary operators: each expands to a lattice-on-left and a
 // lattice-only-on-right overload.
 GRID_DEF_BINOP(operator+, BinaryAdd);
 GRID_DEF_BINOP(operator-, BinarySub);
 GRID_DEF_BINOP(operator*, BinaryMul);
 GRID_DEF_BINOP(operator/, BinaryDiv);
 GRID_DEF_BINOP(operator&, BinaryAnd);
 GRID_DEF_BINOP(operator|, BinaryOr);
 GRID_DEF_BINOP(operator&&, BinaryAndAnd);
 GRID_DEF_BINOP(operator||, BinaryOrOr);
 // Trinary conditional: where(pred, lhs, rhs).
 GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
 // Force a unary expression to be evaluated: constructs and returns a Lattice
 // of the expression's per-site result type, initialised from the expression.
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
  // Per-site type the expression evaluates to.
  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)))) site_type;

  Lattice<site_type> evaluated(expr);
  return evaluated;
 }
 // Force a binary expression to be evaluated: constructs and returns a Lattice
 // of the expression's per-site result type, initialised from the expression.
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
                                        eval(0, std::get<1>(expr.second))))> {
  // Per-site type the expression evaluates to.
  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
                                   eval(0, std::get<1>(expr.second)))) site_type;

  Lattice<site_type> evaluated(expr);
  return evaluated;
 }
 // Force a trinary expression to be evaluated: constructs and returns a Lattice
 // of the expression's per-site result type, initialised from the expression.
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
                                        eval(0, std::get<1>(expr.second)),
                                        eval(0, std::get<2>(expr.second))))> {
  // Per-site type the expression evaluates to.
  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
                                   eval(0, std::get<1>(expr.second)),
                                   eval(0, std::get<2>(expr.second)))) site_type;

  Lattice<site_type> evaluated(expr);
  return evaluated;
 }
 // Retire the operator-glue macros so they do not leak to includers.
 // NOTE(review): GRID_BINOP_LEFT and GRID_BINOP_RIGHT (and the Grid*OpClass
 // macros) are not undefined here and stay visible after this header -
 // confirm whether that is intentional.
 #undef GRID_UNOP
 #undef GRID_BINOP
 #undef GRID_TRINOP
 #undef GRID_DEF_UNOP
 #undef GRID_DEF_BINOP
 #undef GRID_DEF_TRINOP
 }
 // Disabled scratch example (never compiled because of the #if 0): builds a
 // binary expression node by hand and then via operator+, and assigns
 // expressions to a lattice.
 #if 0
 using namespace Grid;
 int main(int argc,char **argv){
   Lattice<double> v1(16);
   Lattice<double> v2(16);
   Lattice<double> v3(16);
   BinaryAdd<double,double> tmp;
   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
     expr(std::make_pair(tmp,
    std::forward_as_tuple(v1,v2)));
   tmp.func(eval(0,v1),eval(0,v2));
   auto var = v1+v2;
   std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
   v3=v1+v2;
   v3=v1+v2+v1*v2;
 };
 void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
 {
   v3=v1+v2+v1*v2;
 }
 #endif
 #endif
@@ -0,0 +1,255 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_arith.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_ARITH_H
 #define GRID_LATTICE_ARITH_H
 namespace Grid {
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  avoid copy back routines for mult, mac, sub, add: lattice (op) lattice
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Lattice-by-lattice mult: applies the site-level mult(ret*,lhs*,rhs*) at
  // every outer site of lhs's grid. ret adopts lhs's checkerboard first, then
  // ret and lhs are each checked for conformability against rhs.
  template<class obj1,class obj2,class obj3> strong_inline
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj2 &l = lhs._odata[site];
      const obj3 &r = rhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 product;
      mult(&product,&l,&r);
      vstream(ret._odata[site],product);
 #else
      mult(&ret._odata[site],&l,&r);
 #endif
    }
  }
  // Lattice-by-lattice multiply-accumulate: applies the site-level
  // mac(ret*,lhs*,rhs*) at every outer site of lhs's grid. ret adopts lhs's
  // checkerboard first, then ret and lhs are each checked against rhs.
  template<class obj1,class obj2,class obj3> strong_inline
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      // Seed the temporary with the current value of ret so that the site-level
      // mac sees the same destination contents as in the non-streaming branch
      // below. An uninitialised tmp would make the two branches disagree.
      obj1 tmp = ret._odata[ss];
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else
      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 #endif
    }
  }
  // Lattice-by-lattice sub: applies the site-level sub(ret*,lhs*,rhs*) at
  // every outer site of lhs's grid. ret adopts lhs's checkerboard first, then
  // ret and lhs are each checked for conformability against rhs.
  template<class obj1,class obj2,class obj3> strong_inline
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj2 &l = lhs._odata[site];
      const obj3 &r = rhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 difference;
      sub(&difference,&l,&r);
      vstream(ret._odata[site],difference);
 #else
      sub(&ret._odata[site],&l,&r);
 #endif
    }
  }
  // Lattice-by-lattice add: applies the site-level add(ret*,lhs*,rhs*) at
  // every outer site of lhs's grid. ret adopts lhs's checkerboard first, then
  // ret and lhs are each checked for conformability against rhs.
  template<class obj1,class obj2,class obj3> strong_inline
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj2 &l = lhs._odata[site];
      const obj3 &r = rhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 sum;
      add(&sum,&l,&r);
      vstream(ret._odata[site],sum);
 #else
      add(&ret._odata[site],&l,&r);
 #endif
    }
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  avoid copy back routines for mult, mac, sub, add: lattice (op) scalar
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Lattice-by-scalar mult: applies the site-level mult(ret*,lhs*,&rhs) at
  // every outer site of lhs's grid, with the same rhs object for all sites.
  // Unlike its siblings this overload always goes through a temporary and
  // vstream, whether or not STREAMING_STORES is defined.
  template<class obj1,class obj2,class obj3> strong_inline
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj2 &l = lhs._odata[site];
      obj1 product;
      mult(&product,&l,&rhs);
      vstream(ret._odata[site],product);
    }
  }
  // Lattice-by-scalar multiply-accumulate: applies the site-level
  // mac(ret*,lhs*,&rhs) at every outer site of lhs's grid, with the same rhs
  // object for all sites. The result always goes through a temporary and
  // vstream, whether or not STREAMING_STORES is defined.
  template<class obj1,class obj2,class obj3> strong_inline
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,lhs);
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      // Seed the temporary with the current value of ret: the site-level mac
      // is handed its destination, so it must see ret's contents rather than
      // an uninitialised object (the other mac overloads pass ret directly
      // when they do not stream).
      obj1 tmp = ret._odata[ss];
      mac(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
    }
  }
  // Lattice-by-scalar sub: applies the site-level sub(ret*,lhs*,&rhs) at
  // every outer site of lhs's grid, with the same rhs object for all sites.
  template<class obj1,class obj2,class obj3> strong_inline
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,lhs);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj2 &l = lhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 difference;
      sub(&difference,&l,&rhs);
      vstream(ret._odata[site],difference);
 #else
      sub(&ret._odata[site],&l,&rhs);
 #endif
    }
  }
  // Lattice-by-scalar add: applies the site-level add(ret*,lhs*,&rhs) at
  // every outer site of lhs's grid, with the same rhs object for all sites.
  template<class obj1,class obj2,class obj3> strong_inline
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj2 &l = lhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 sum;
      add(&sum,&l,&rhs);
      vstream(ret._odata[site],sum);
 #else
      add(&ret._odata[site],&l,&rhs);
 #endif
    }
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  avoid copy back routines for mult, mac, sub, add: scalar (op) lattice
  //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Scalar-by-lattice mult: applies the site-level mult(ret*,&lhs,rhs*) at
    // every outer site of rhs's grid, with the same lhs object for all sites.
    // ret adopts rhs's checkerboard.
    template<class obj1,class obj2,class obj3> strong_inline
    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    const auto nsites = rhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj3 &r = rhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 product;
      mult(&product,&lhs,&r);
      vstream(ret._odata[site],product);
 #else
      mult(&ret._odata[site],&lhs,&r);
 #endif
    }
  }
  // Scalar-by-lattice multiply-accumulate: applies the site-level
  // mac(ret*,&lhs,rhs*) at every outer site of rhs's grid, with the same lhs
  // object for all sites. ret adopts rhs's checkerboard.
  template<class obj1,class obj2,class obj3> strong_inline
    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      // Seed the temporary with the current value of ret so that the site-level
      // mac sees the same destination contents as in the non-streaming branch
      // below. An uninitialised tmp would make the two branches disagree.
      obj1 tmp = ret._odata[ss];
      mac(&tmp,&lhs,&rhs._odata[ss]);
      vstream(ret._odata[ss],tmp);
 #else
      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 #endif
    }
  }
  // Scalar-by-lattice sub: applies the site-level sub(ret*,&lhs,rhs*) at
  // every outer site of rhs's grid, with the same lhs object for all sites.
  // ret adopts rhs's checkerboard.
  template<class obj1,class obj2,class obj3> strong_inline
    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    const auto nsites = rhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj3 &r = rhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 difference;
      sub(&difference,&lhs,&r);
      vstream(ret._odata[site],difference);
 #else
      sub(&ret._odata[site],&lhs,&r);
 #endif
    }
  }
  // Scalar-by-lattice add: applies the site-level add(ret*,&lhs,rhs*) at
  // every outer site of rhs's grid, with the same lhs object for all sites.
  // ret adopts rhs's checkerboard.
  template<class obj1,class obj2,class obj3> strong_inline
    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    const auto nsites = rhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const obj3 &r = rhs._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      obj1 sum;
      add(&sum,&lhs,&r);
      vstream(ret._odata[site],sum);
 #else
      add(&ret._odata[site],&lhs,&r);
 #endif
    }
  }
  // ret = a*x + y, site by site over x's grid.
  // ret adopts x's checkerboard; ret and y are each checked against x.
  template<class sobj,class vobj> strong_inline
  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
    ret.checkerboard = x.checkerboard;
    conformable(ret,x);
    conformable(x,y);
    const auto nsites = x._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const vobj &xs = x._odata[site];
      const vobj &ys = y._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      vobj combined = a*xs+ys;
      vstream(ret._odata[site],combined);
 #else
      ret._odata[site]=a*xs+ys;
 #endif
    }
  }
  // ret = a*x + b*y, site by site over x's grid.
  // ret adopts x's checkerboard; ret and y are each checked against x.
  template<class sobj,class vobj> strong_inline
  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
    ret.checkerboard = x.checkerboard;
    conformable(ret,x);
    conformable(x,y);
    const auto nsites = x._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      const vobj &xs = x._odata[site];
      const vobj &ys = y._odata[site];
 #ifdef STREAMING_STORES
      // Form the result in a local and write it out through vstream.
      vobj combined = a*xs+b*ys;
      vstream(ret._odata[site],combined);
 #else
      ret._odata[site]=a*xs+b*ys;
 #endif
    }
  }
  // Thin forwarder: hands all arguments to axpy_norm_fast (declared elsewhere)
  // and returns its result as RealD.
  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
    const RealD result = axpy_norm_fast(ret,a,x,y);
    return result;
  }
  // Thin forwarder: hands all arguments to axpby_norm_fast (declared elsewhere)
  // and returns its result as RealD.
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
    const RealD result = axpby_norm_fast(ret,a,b,x,y);
    return result;
  }
 }
 #endif
@@ -0,0 +1,375 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/lattice/Lattice_base.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H
 #define STREAMING_STORES
 namespace Grid {
 // TODO: 
 //       mac,real,imag
 // Functionality:
 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
 //     adj,conjugate
 //     real,imag
 //     transpose,transposeIndex  
 //     trace,traceIndex
 //     peekIndex
 //     innerProduct,outerProduct,
 //     localNorm2
 //     localInnerProduct
 extern int GridCshiftPermuteMap[4][16];
 ////////////////////////////////////////////////
 // Basic expressions used in Expression Template
 ////////////////////////////////////////////////
 // Non-template base of Lattice<vobj>: carries the grid pointer and a virtual
 // destructor. _grid is not initialised here; derived constructors assign it.
 class LatticeBase
 {
 public:
    GridBase *_grid;
    virtual ~LatticeBase() = default;
 };
 // Empty tag type inherited by the unary/binary/trinary expression nodes below.
 class LatticeExpressionBase
 {
 };
 // Expression node with one operand: a std::pair whose `first` is the
 // operation object and whose `second` is a one-element tuple of the operand.
 template <typename Op, typename T1>
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
  typedef std::pair<Op,std::tuple<T1> > pair_type;
 public:
  // Implicit conversion from the underlying pair (copies it into the base).
  LatticeUnaryExpression(const pair_type &arg): pair_type(arg) {}
 };
 // Expression node with two operands: a std::pair whose `first` is the
 // operation object and whose `second` is a two-element tuple of the operands.
 template <typename Op, typename T1, typename T2>
 class LatticeBinaryExpression : public std::pair<Op,std::tuple<T1,T2> > , public LatticeExpressionBase {
  typedef std::pair<Op,std::tuple<T1,T2> > pair_type;
 public:
  // Implicit conversion from the underlying pair (copies it into the base).
  LatticeBinaryExpression(const pair_type &arg): pair_type(arg) {}
 };
 // Expression node with three operands: a std::pair whose `first` is the
 // operation object and whose `second` is a three-element tuple of the operands.
 template <typename Op, typename T1, typename T2, typename T3>
 class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, public LatticeExpressionBase {
  typedef std::pair<Op,std::tuple<T1,T2,T3> > pair_type;
 public:
  // Implicit conversion from the underlying pair (copies it into the base).
  LatticeTrinaryExpression(const pair_type &arg): pair_type(arg) {}
 };
 // Two grids are conformable only if they are the very same GridBase object;
 // anything else trips the assertion.
 inline void conformable(GridBase *lhs,GridBase *rhs)
 {
   assert(lhs == rhs);
 }
 template<class vobj>
 class Lattice : public LatticeBase
 {
 public:
    // Checkerboard tag of this field; expression assignment requires it to be
    // Even or Odd.
    int checkerboard;
    // Site storage, indexed by outer site.
    Vector<vobj> _odata;
    // Index range [begin(), end()) over _odata, plus direct element access.
    int begin(void) { return 0; }
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; }
    const vobj & operator[](int i) const { return _odata[i]; }
 public:
    // Type aliases re-exported from the site object.
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
    typedef vobj vector_object;
  ////////////////////////////////////////////////////////////////////////////////
  // Expression Template closure support
  ////////////////////////////////////////////////////////////////////////////////
  // Assign from a unary expression: the expression must live on this lattice's
  // grid; its checkerboard is adopted and it is evaluated at every outer site.
  template <typename Op, typename T1>
  strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
    // Find the grid the expression refers to and check it is ours.
    GridBase *egrid = nullptr;
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
    conformable(_grid,egrid);
    // Take the checkerboard from the expression; it must be Odd or Even.
    int cb=-1;
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;
    const auto nsites = _grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
 #ifdef STREAMING_STORES
      vobj value = eval(site,expr);
      vstream(_odata[site] ,value);
 #else
      _odata[site]=eval(site,expr);
 #endif
    }
    return *this;
  }
  // Assign from a binary expression: the expression must live on this lattice's
  // grid; its checkerboard is adopted and it is evaluated at every outer site.
  template <typename Op, typename T1,typename T2>
  strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    // Find the grid the expression refers to and check it is ours.
    GridBase *egrid = nullptr;
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
    conformable(_grid,egrid);
    // Take the checkerboard from the expression; it must be Odd or Even.
    int cb=-1;
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;
    const auto nsites = _grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
 #ifdef STREAMING_STORES
      vobj value = eval(site,expr);
      vstream(_odata[site] ,value);
 #else
      _odata[site]=eval(site,expr);
 #endif
    }
    return *this;
  }
  // Assign from a trinary expression: the expression must live on this
  // lattice's grid; its checkerboard is adopted and it is evaluated at every
  // outer site.
  template <typename Op, typename T1,typename T2,typename T3>
  strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
    // Find the grid the expression refers to and check it is ours.
    GridBase *egrid = nullptr;
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
    conformable(_grid,egrid);
    // Take the checkerboard from the expression; it must be Odd or Even.
    int cb=-1;
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;
    const auto nsites = _grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
 #ifdef STREAMING_STORES
      // The evaluated value is passed straight to vstream, with no named local.
      vstream(_odata[site] ,eval(site,expr));
 #else
      _odata[site] = eval(site,expr);
 #endif
    }
    return *this;
  }
  //GridFromExpression is tricky to do
  template<class Op,class T1>
    Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
    int cb=-1;
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;
    _odata.resize(_grid->oSites());
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
 #else
      _odata[ss]=eval(ss,expr);
 #endif
    }
  };
  template<class Op,class T1, class T2>
  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
    int cb=-1;
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;
    _odata.resize(_grid->oSites());
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
 #else
      _odata[ss]=eval(ss,expr);
 #endif
    }
  };
  template<class Op,class T1, class T2, class T3>
  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
    int cb=-1;
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;
    _odata.resize(_grid->oSites());
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      vstream(_odata[ss] ,eval(ss,expr));
    }
  };
  //////////////////////////////////////////////////////////////////
  // Constructor requires "grid" passed.
  // what about a default grid?
  //////////////////////////////////////////////////////////////////
  // Construct on `grid`: storage holds grid->oSites() site objects and the
  // checkerboard starts at 0.
  Lattice(GridBase *grid) : _odata(grid->oSites()) {
    _grid = grid;
    checkerboard=0;
    // The first element's address must be a multiple of 16 bytes.
    assert((((uint64_t)&_odata[0])&0xF) ==0);
  }
  Lattice(const Lattice& r){ // copy constructor
    _grid = r._grid;
    checkerboard = r.checkerboard;
    _odata.resize(_grid->oSites());// essential
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      _odata[ss]=r._odata[ss];
    }  	
  }
  // Move constructor: copies r's grid pointer and checkerboard and takes over
  // r's site storage via std::move.
  Lattice(Lattice&& r)
    : checkerboard(r.checkerboard), _odata(std::move(r._odata))
  {
    _grid = r._grid;
  }
  // Move assignment: copies r's grid pointer and checkerboard and takes over
  // r's site storage via std::move. There is no conformability check.
  inline Lattice<vobj> & operator = (Lattice<vobj> && r)
  {
    _grid = r._grid;
    checkerboard = r.checkerboard;
    _odata = std::move(r._odata);
    return *this;
  }
  // Copy assignment: this lattice takes r's grid and checkerboard, resizes its
  // storage to that grid, and copies every site inside a parallel_for loop.
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    _grid = r._grid;
    checkerboard = r.checkerboard;
    const auto nsites = _grid->oSites();
    // The storage must match the new grid before the site copy.
    _odata.resize(nsites);
    parallel_for(int site=0;site<nsites;site++){
      _odata[site]=r._odata[site];
    }
    return *this;
  }
  // Converting assignment from a lattice with a different site type: adopts r's
  // checkerboard, checks conformability, and assigns site by site (relying on
  // vobj being assignable from robj).
  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
    checkerboard = r.checkerboard;
    conformable(*this,r);
    const auto nsites = _grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      _odata[site]=r._odata[site];
    }
    return *this;
  }
  // Defaulted virtual destructor.
  virtual ~Lattice() = default;
  // Re-target this lattice to `grid`. Does nothing if it already uses that
  // grid; otherwise resizes storage to grid->oSites() and sets checkerboard to 0.
  void reset(GridBase* grid) {
    if (_grid == grid) return;
    _grid = grid;
    _odata.resize(grid->oSites());
    checkerboard = 0;
  }
  // Fill assignment: every outer site is assigned the same value r.
  // Grid and checkerboard are left untouched.
  template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
    const auto nsites = _grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      _odata[site]=r;
    }
    return *this;
  }
  // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
  // Compound assignment operators: each one evaluates the corresponding binary
  // operator on (*this, r) and assigns the result back to *this.
  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
    return *this = (*this)*r;
  }
  template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
    return *this = (*this)-r;
  }
  template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
    return *this = (*this)+r;
  }
 }; // class Lattice
  // Print every global site of the lattice, one per line, as
  //   [c0,c1,...]<TAB><site value>
  // Sites are visited in global-index order 0 .. _gsites-1.
  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
    typedef typename vobj::scalar_object sobj;
    std::vector<int> gcoor;
    sobj site_value;
    for(int gidx=0;gidx<o._grid->_gsites;gidx++){
      // Translate the global index into coordinates and fetch that site.
      o._grid->GlobalIndexToGlobalCoor(gidx,gcoor);
      peekSite(site_value,o,gcoor);
      stream<<"[";
      for(int dim=0;dim<gcoor.size();dim++){
        // A comma goes between coordinates, never after the last one.
        if(dim!=0) stream<<",";
        stream<<gcoor[dim];
      }
      stream<<"]\t";
      stream<<site_value<<std::endl;
    }
    return stream;
  }
 }
 #include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
 #include "Lattice_ET.h"
 #else 
 #include "Lattice_overload.h"
 #endif
 #include "Lattice_arith.h"
 #include "Lattice_trace.h"
 #include "Lattice_transpose.h"
 #include "Lattice_local.h"
 #include "Lattice_reduction.h"
 #include "Lattice_peekpoke.h"
 #include "Lattice_reality.h"
 #include "Lattice_comparison_utils.h"
 #include "Lattice_comparison.h"
 #include "Lattice_coordinate.h"
 #include "Lattice_where.h"
 #include "Lattice_rng.h"
 #include "Lattice_unary.h"
 #include "Lattice_transfer.h"
 #endif
@@ -0,0 +1,169 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_comparison.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_COMPARISON_H
 #define GRID_LATTICE_COMPARISON_H
 namespace Grid {
    //////////////////////////////////////////////////////////////////////////
    // relational operators
    // 
    // Support <,>,<=,>=,==,!=
    //
    //Query supporting bitwise &, |, ^, !
    //Query supporting logical &&, ||, 
    //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  // compare lattice to lattice
  //////////////////////////////////////////////////////////////////////////
  // Lattice-vs-lattice comparison: builds a Lattice<vInteger> on rhs's grid and
  // stores op(lhs[ss], rhs[ss]) at every outer site.
  template<class vfunctor,class lobj,class robj>
    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
  {
    Lattice<vInteger> mask(rhs._grid);
    const auto nsites = rhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      mask._odata[site]=op(lhs._odata[site],rhs._odata[site]);
    }
    return mask;
  }
  //////////////////////////////////////////////////////////////////////////
  // compare lattice to scalar
  //////////////////////////////////////////////////////////////////////////
  // Lattice-vs-scalar comparison: builds a Lattice<vInteger> on lhs's grid and
  // stores op(lhs[ss], rhs) at every outer site.
  template<class vfunctor,class lobj,class robj>
    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
  {
    Lattice<vInteger> mask(lhs._grid);
    const auto nsites = lhs._grid->oSites();
    parallel_for(int site=0;site<nsites;site++){
      mask._odata[site]=op(lhs._odata[site],rhs);
    }
    return mask;
  }
  //////////////////////////////////////////////////////////////////////////
  // compare scalar to lattice
  //////////////////////////////////////////////////////////////////////////
  // Scalar-vs-lattice comparison: builds a Lattice<vInteger> on rhs's grid and
  // stores op(lhs, rhs[ss]) at every outer site.
  //   op  : comparison functor called as op(const lobj&, const robj&)
  //   lhs : the single left-hand value, used unchanged for every site
  //   rhs : the lattice supplying the right-hand value per site
  template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
  {
    Lattice<vInteger> ret(rhs._grid);
    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
      // lhs is the scalar and rhs is the lattice, so it is rhs that gets
      // indexed per site (the operands were previously the wrong way round).
      ret._odata[ss]=op(lhs,rhs._odata[ss]);
    }
    return ret;
  }
  //////////////////////////////////////////////////////////////////////////
  // Map to functors
  //////////////////////////////////////////////////////////////////////////
  // Less than
  // lattice < lattice
  template<class lobj,class robj>
    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    typedef vlt<lobj,robj> site_compare;
    return LLComparison(site_compare(),lhs,rhs);
  }
  // lattice < scalar
  template<class lobj,class robj>
    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
    typedef vlt<lobj,robj> site_compare;
    return LSComparison(site_compare(),lhs,rhs);
  }
  // scalar < lattice
  template<class lobj,class robj>
    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
    typedef vlt<lobj,robj> site_compare;
    return SLComparison(site_compare(),lhs,rhs);
  }
  // Less than equal
  // lattice <= lattice
  template<class lobj,class robj>
    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    typedef vle<lobj,robj> site_compare;
    return LLComparison(site_compare(),lhs,rhs);
  }
  // lattice <= scalar
  template<class lobj,class robj>
    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
    typedef vle<lobj,robj> site_compare;
    return LSComparison(site_compare(),lhs,rhs);
  }
  // scalar <= lattice
  template<class lobj,class robj>
    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
    typedef vle<lobj,robj> site_compare;
    return SLComparison(site_compare(),lhs,rhs);
  }
  // Greater than 
  // lattice > lattice
  template<class lobj,class robj>
    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    typedef vgt<lobj,robj> site_compare;
    return LLComparison(site_compare(),lhs,rhs);
  }
  // lattice > scalar
  template<class lobj,class robj>
    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
    typedef vgt<lobj,robj> site_compare;
    return LSComparison(site_compare(),lhs,rhs);
  }
  // scalar > lattice
  template<class lobj,class robj>
    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
    typedef vgt<lobj,robj> site_compare;
    return SLComparison(site_compare(),lhs,rhs);
  }
  // Greater than equal
   // lattice >= lattice
   template<class lobj,class robj>
     inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     typedef vge<lobj,robj> site_compare;
     return LLComparison(site_compare(),lhs,rhs);
   }
   // lattice >= scalar
   template<class lobj,class robj>
   inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
     typedef vge<lobj,robj> site_compare;
     return LSComparison(site_compare(),lhs,rhs);
   }
   // scalar >= lattice
   template<class lobj,class robj>
     inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
     typedef vge<lobj,robj> site_compare;
     return SLComparison(site_compare(),lhs,rhs);
   }
   // equal
   // lattice == lattice
   template<class lobj,class robj>
     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     typedef veq<lobj,robj> site_compare;
     return LLComparison(site_compare(),lhs,rhs);
   }
   // lattice == scalar
   template<class lobj,class robj>
     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
     typedef veq<lobj,robj> site_compare;
     return LSComparison(site_compare(),lhs,rhs);
   }
   // scalar == lattice
   template<class lobj,class robj>
     inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
     typedef veq<lobj,robj> site_compare;
     return SLComparison(site_compare(),lhs,rhs);
   }
   // not equal
   // lattice != lattice
   template<class lobj,class robj>
     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     typedef vne<lobj,robj> site_compare;
     return LLComparison(site_compare(),lhs,rhs);
   }
   // lattice != scalar
   template<class lobj,class robj>
     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
     typedef vne<lobj,robj> site_compare;
     return LSComparison(site_compare(),lhs,rhs);
   }
   // scalar != lattice
   template<class lobj,class robj>
     inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
     typedef vne<lobj,robj> site_compare;
     return SLComparison(site_compare(),lhs,rhs);
   }
 }
 #endif
@@ -0,0 +1,232 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_comparison_utils.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_COMPARISON_H
 #define GRID_COMPARISON_H
 namespace Grid {
  /////////////////////////////////////////
  // This implementation is a bit poor.
  //
  // Only support relational logical operations (<, >  etc)
  // on scalar objects. Therefore can strip any tensor structures.
  //
  // Should guard this with isGridTensor<> enable if?
  /////////////////////////////////////////
  //
  // Generic list of functors
  //
  // Stateless functors returning vInteger: each one applies the named
  // relational operator to its two arguments. The call operators are const so
  // the functors can also be invoked through const objects and references.
  template<class lobj,class robj> class veq {
  public:
    // lhs == rhs
    vInteger operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) == (rhs);
    }
  };
  template<class lobj,class robj> class vne {
  public:
    // lhs != rhs
    vInteger operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) != (rhs);
    }
  };
  template<class lobj,class robj> class vlt {
  public:
    // lhs < rhs
    vInteger operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) < (rhs);
    }
  };
  template<class lobj,class robj> class vle {
  public:
    // lhs <= rhs
    vInteger operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) <= (rhs);
    }
  };
  template<class lobj,class robj> class vgt {
  public:
    // lhs > rhs
    vInteger operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) > (rhs);
    }
  };
  template<class lobj,class robj> class vge {
    public:
    // lhs >= rhs
    vInteger operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) >= (rhs);
    }
  };
  // Generic list of functors
  // Stateless functors returning Integer: each one applies the named
  // relational operator to its two arguments. The call operators are const so
  // the functors can also be invoked through const objects and references.
  template<class lobj,class robj> class seq {
  public:
    // lhs == rhs
    Integer operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) == (rhs);
    }
  };
  template<class lobj,class robj> class sne {
  public:
    // lhs != rhs
    Integer operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) != (rhs);
    }
  };
  template<class lobj,class robj> class slt {
  public:
    // lhs < rhs
    Integer operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) < (rhs);
    }
  };
  template<class lobj,class robj> class sle {
  public:
    // lhs <= rhs
    Integer operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) <= (rhs);
    }
  };
  template<class lobj,class robj> class sgt {
  public:
    // lhs > rhs
    Integer operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) > (rhs);
    }
  };
  template<class lobj,class robj> class sge {
  public:
    // lhs >= rhs
    Integer operator()(const lobj &lhs, const robj &rhs) const
    { 
      return (lhs) >= (rhs);
    }
  };
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Integer and real get extra relational functions.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // simd-vs-simd comparison, done lane by lane: both operands are extracted
  // into per-lane scalars, sop is applied to each pair, and the per-lane
  // results are merged into a vInteger.
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
    {
      typedef typename vsimd::scalar_type scalar;
      const auto lanes = vsimd::Nsimd();
      std::vector<scalar>  lhs_lanes(lanes);
      std::vector<scalar>  rhs_lanes(lanes);
      std::vector<Integer> predicate(lanes);
      extract<vsimd,scalar>(lhs,lhs_lanes);
      extract<vsimd,scalar>(rhs,rhs_lanes);
      for(int lane=0;lane<lanes;lane++){
        predicate[lane] = sop(lhs_lanes[lane],rhs_lanes[lane]);
      }
      vInteger packed;
      merge<vInteger,Integer>(packed,predicate);
      return packed;
    }
  // simd-vs-scalar comparison, done lane by lane: lhs is extracted into
  // per-lane scalars, sop compares each lane with the single rhs value, and
  // the per-lane results are merged into a vInteger.
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
    {
      typedef typename vsimd::scalar_type scalar;
      const auto lanes = vsimd::Nsimd();
      std::vector<scalar>  lhs_lanes(lanes);
      std::vector<Integer> predicate(lanes);
      extract<vsimd,scalar>(lhs,lhs_lanes);
      for(int lane=0;lane<lanes;lane++){
        predicate[lane] = sop(lhs_lanes[lane],rhs);
      }
      vInteger packed;
      merge<vInteger,Integer>(packed,predicate);
      return packed;
    }
  // scalar-vs-simd comparison, done lane by lane: rhs is extracted into
  // per-lane scalars, sop compares the single lhs value with each lane, and
  // the per-lane results are merged into a vInteger.
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
    inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
    {
      typedef typename vsimd::scalar_type scalar;
      const auto lanes = vsimd::Nsimd();
      std::vector<scalar>  rhs_lanes(lanes);
      std::vector<Integer> predicate(lanes);
      extract<vsimd,scalar>(rhs,rhs_lanes);
      for(int lane=0;lane<lanes;lane++){
        predicate[lane] = sop(lhs,rhs_lanes[lane]);
      }
      vInteger packed;
      merge<vInteger,Integer>(packed,predicate);
      return packed;
    }
 // DECLARE_RELATIONAL_EQ(op,functor) defines `operator op`, returning vInteger, for:
 //   simd op simd, simd op scalar, scalar op simd
 //     -> Comparison(functor<scalar,scalar>(), lhs, rhs)
 //   iScalar<simd> op scalar, scalar op iScalar<simd>
 //     -> the same operator applied to the iScalar's _internal member
 // The final line carries no continuation backslash, so the macro ends with the
 // last closing brace and cannot absorb whatever line follows it.
 #define DECLARE_RELATIONAL_EQ(op,functor) \
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
    {                                                                   \
      return lhs._internal op rhs;                                      \
    }                                                                   \
  template<class vsimd>\
    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
    {                                                                   \
      return lhs op rhs._internal;                                      \
    }
 // DECLARE_RELATIONAL(op,functor): everything DECLARE_RELATIONAL_EQ(op,functor)
 // defines, plus `operator op` between two iScalar<vsimd> values, which
 // applies the operator to their _internal members.
 #define DECLARE_RELATIONAL(op,functor)                                              \
  DECLARE_RELATIONAL_EQ(op,functor)                                                  \
  template<class vsimd>                                                              \
    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs) \
    {                                                                                \
      return lhs._internal op rhs._internal;                                         \
    }
 // Instantiate the relational operators. `==` goes through
 // DECLARE_RELATIONAL_EQ, so no iScalar == iScalar overload is generated here.
 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
 DECLARE_RELATIONAL_EQ(==,seq);
 DECLARE_RELATIONAL(!=,sne);
 // Both helper macros are local to this header; remove them so that neither
 // leaks into files that include it.
 #undef DECLARE_RELATIONAL
 #undef DECLARE_RELATIONAL_EQ
 }
 #endif
@@ -0,0 +1,40 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_conformable.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_CONFORMABLE_H
 #define GRID_LATTICE_CONFORMABLE_H
 namespace Grid {
    // Checks that two lattices can be combined site by site: both must point at
    // the same grid and carry the same checkerboard. A mismatch trips an assert.
    template<class obj1,class obj2>
    void conformable(const Lattice<obj1> &lhs,
                     const Lattice<obj2> &rhs)
    {
      assert(lhs._grid == rhs._grid);                 // same GridBase pointer
      assert(lhs.checkerboard == rhs.checkerboard);   // same checkerboard
    }
 }
 #endif
@@ -0,0 +1,56 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_coordinate.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_COORDINATE_H
 #define GRID_LATTICE_COORDINATE_H
 namespace Grid {
    // Fills every site of `l` with that site's global coordinate along direction `mu`.
    //   l  : lattice to overwrite
    //   mu : index into the global coordinate vector returned by the grid
    // The loop is serial: for each outer site the coordinate of every inner (SIMD)
    // site is looked up, collected in a scalar buffer and merged into one vector word.
    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
    {
      using scalar_type = typename iobj::scalar_type;
      using vector_type = typename iobj::vector_type;

      GridBase *grid = l._grid;
      const int lanes = grid->iSites();

      std::vector<int>         globalCoor;          // filled by RankIndexToGlobalCoor
      std::vector<scalar_type> laneValues(lanes);   // one value per inner site
      vector_type              packed;

      for(int outer=0; outer<grid->oSites(); ++outer){
        for(int inner=0; inner<grid->iSites(); ++inner){
          grid->RankIndexToGlobalCoor(grid->ThisRank(),outer,inner,globalCoor);
          laneValues[inner] = (Integer)globalCoor[mu];
        }
        merge<vector_type,scalar_type>(packed,laneValues);
        l._odata[outer] = packed;
      }
    }
 }
 #endif
@@ -0,0 +1,75 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_local.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_LOCALREDUCTION_H
 #define GRID_LATTICE_LOCALREDUCTION_H
 ///////////////////////////////////////////////
 // localInner, localNorm, outerProduct
 ///////////////////////////////////////////////
 namespace Grid {
  /////////////////////////////////////////////////////
  // Site-local (not lattice-summed) routines: each result is itself a Lattice
  /////////////////////////////////////////////////////
  // localNorm2,
  // Site-by-site innerProduct of a lattice with itself.
  // Returns a Lattice of vobj::tensor_reduced built on rhs's grid.
  template<class vobj>
    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
    {
      typedef Lattice<typename vobj::tensor_reduced> reduced_field;
      reduced_field out(rhs._grid);
      parallel_for(int site=0; site<rhs._grid->oSites(); ++site){
        out._odata[site] = innerProduct(rhs._odata[site],rhs._odata[site]);
      }
      return out;
    }
  // localInnerProduct
  // Site-by-site innerProduct(lhs,rhs).
  // Returns a Lattice of vobj::tensor_reduced built on rhs's grid; the two
  // arguments are not checked for conformability here.
  template<class vobj>
    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
    {
      typedef Lattice<typename vobj::tensor_reduced> reduced_field;
      reduced_field out(rhs._grid);
      parallel_for(int site=0; site<rhs._grid->oSites(); ++site){
        out._odata[site] = innerProduct(lhs._odata[site],rhs._odata[site]);
      }
      return out;
    }
  // outerProduct Scalar x Scalar -> Scalar
  //              Vector x Vector -> Matrix
  // Site-by-site outerProduct(lhs,rhs); the site type of the result is whatever
  // the per-site outerProduct returns. The result lives on rhs's grid.
  template<class ll,class rr>
    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
  {
    typedef decltype(outerProduct(lhs._odata[0],rhs._odata[0])) site_type;
    Lattice<site_type> out(rhs._grid);
    parallel_for(int site=0; site<rhs._grid->oSites(); ++site){
      out._odata[site] = outerProduct(lhs._odata[site],rhs._odata[site]);
    }
    return out;
  }
 }
 #endif
@@ -0,0 +1,138 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_overload.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_OVERLOAD_H
 #define GRID_LATTICE_OVERLOAD_H
 namespace Grid {
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // unary negation
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Unary minus: returns a new lattice on r's grid holding -r at every site.
  template<class vobj>
  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
  {
    Lattice<vobj> negated(r._grid);
    parallel_for(int site=0; site<r._grid->oSites(); ++site){
      vstream(negated._odata[site], -r._odata[site]);
    }
    return negated;
  }
  /////////////////////////////////////////////////////////////////////////////////////
  // Lattice BinOp Lattice,
  //NB mult performs conformable check. Do not reapply here for performance.
  /////////////////////////////////////////////////////////////////////////////////////
  // Lattice * Lattice: allocates the result on rhs's grid and delegates to mult().
  template<class left,class right>
    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
  {
    typedef decltype(lhs._odata[0]*rhs._odata[0]) site_type;
    Lattice<site_type> product(rhs._grid);
    mult(product,lhs,rhs);
    return product;
  }
  // Lattice + Lattice: allocates the result on rhs's grid and delegates to add().
  template<class left,class right>
    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
  {
    typedef decltype(lhs._odata[0]+rhs._odata[0]) site_type;
    Lattice<site_type> total(rhs._grid);
    add(total,lhs,rhs);
    return total;
  }
  // Lattice - Lattice: allocates the result on rhs's grid and delegates to sub().
  template<class left,class right>
    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
  {
    typedef decltype(lhs._odata[0]-rhs._odata[0]) site_type;
    Lattice<site_type> difference(rhs._grid);
    sub(difference,lhs,rhs);
    return difference;
  }
  // Scalar BinOp Lattice ;generate return type
  // Scalar * Lattice: multiplies every site of rhs by lhs (scalar on the left).
  // The result lives on rhs's grid; its site type is decltype(lhs * site).
  template<class left,class right>
  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
  {
    typedef decltype(lhs*rhs._odata[0]) site_type;
    Lattice<site_type> out(rhs._grid);
    parallel_for(int site=0; site<rhs._grid->oSites(); ++site){
      site_type scaled = lhs*rhs._odata[site];
      vstream(out._odata[site],scaled);
    }
    return out;
  }
  // Scalar + Lattice: adds lhs to every site of rhs (scalar on the left).
  // The result lives on rhs's grid; its site type is decltype(lhs + site).
  template<class left,class right>
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
    {
      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
        // Must be a sum: this is operator+ (the per-site expression previously
        // used '-', so scalar + lattice returned scalar - lattice).
        decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
        vstream(ret._odata[ss],tmp);
      }
      return ret;
    }
  // Scalar - Lattice: computes lhs - site for every site of rhs.
  // The result lives on rhs's grid; its site type is decltype(lhs - site).
  template<class left,class right>
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
  {
    typedef decltype(lhs-rhs._odata[0]) site_type;
    Lattice<site_type> out(rhs._grid);
    parallel_for(int site=0; site<rhs._grid->oSites(); ++site){
      site_type diff = lhs-rhs._odata[site];
      vstream(out._odata[site],diff);
    }
    return out;
  }
    // Lattice * scalar: multiplies every site of lhs by rhs (scalar on the right).
    // The result lives on lhs's grid; its site type is decltype(site * rhs).
    template<class left,class right>
      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
    {
      typedef decltype(lhs._odata[0]*rhs) site_type;
      Lattice<site_type> out(lhs._grid);
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        site_type scaled = lhs._odata[site]*rhs;
        vstream(out._odata[site],scaled);
      }
      return out;
    }
    // Lattice + scalar: adds rhs to every site of lhs (scalar on the right).
    // The result lives on lhs's grid; its site type is decltype(site + rhs).
    template<class left,class right>
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
    {
        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
        // The loop bound comes from lhs: rhs is the scalar operand and has no _grid.
        parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
          decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
          vstream(ret._odata[ss],tmp);
        }
        return ret;
    }
    // Lattice - scalar: subtracts rhs from every site of lhs (scalar on the right).
    // The result lives on lhs's grid; its site type is decltype(site - rhs).
    template<class left,class right>
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
    {
      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
      // The loop bound comes from lhs: rhs is the scalar operand and has no _grid.
      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
        decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
        vstream(ret._odata[ss],tmp);
      }
      return ret;
    }
 }
 #endif
@@ -0,0 +1,205 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_peekpoke.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_PEEK_H
 #define GRID_LATTICE_PEEK_H
 ///////////////////////////////////////////////
 // Peeking and poking around
 ///////////////////////////////////////////////
 namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Peek internal indices of a Lattice object
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Applies peekIndex<Index>(site,i) to every site of lhs.
    // The result lives on lhs's grid and inherits lhs's checkerboard.
    template<int Index,class vobj>
       auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
    {
      typedef decltype(peekIndex<Index>(lhs._odata[0],i)) peeked_type;
      Lattice<peeked_type> out(lhs._grid);
      out.checkerboard = lhs.checkerboard;
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        out._odata[site] = peekIndex<Index>(lhs._odata[site],i);
      }
      return out;
    }
    // Two-index variant: applies peekIndex<Index>(site,i,j) to every site of lhs.
    // The result lives on lhs's grid and inherits lhs's checkerboard.
    template<int Index,class vobj>
      auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
    {
      typedef decltype(peekIndex<Index>(lhs._odata[0],i,j)) peeked_type;
      Lattice<peeked_type> out(lhs._grid);
      out.checkerboard = lhs.checkerboard;
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        out._odata[site] = peekIndex<Index>(lhs._odata[site],i,j);
      }
      return out;
    }
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Poke internal indices of a Lattice object
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Applies pokeIndex<Index>(lhs_site, rhs_site, i) at every site, modifying lhs in place.
    // lhs and rhs are not checked for conformability here.
    template<int Index,class vobj> 
    void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
    {
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i);
      }
    }
    // Two-index variant: applies pokeIndex<Index>(lhs_site, rhs_site, i, j) at
    // every site, modifying lhs in place.
    template<int Index,class vobj>
      void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
    {
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i,j);
      }
    }
    //////////////////////////////////////////////////////
    // Poke a scalar object into the SIMD array
    //////////////////////////////////////////////////////
    // Writes the scalar object `s` into lattice `l` at global coordinate `site`.
    //   - asserts that `site` lies on l's checkerboard and that Nsimd scalar
    //     objects occupy exactly one vector object
    //   - `s` is broadcast from grid->BossRank() on every rank
    //   - only the rank that owns the site modifies its data
    template<class vobj,class sobj>
    void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;

      GridBase *grid=l._grid;
      int Nsimd = grid->Nsimd();

      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));

      // Locate the owning rank plus the outer / inner index of the site on it.
      int ownerRank,outer,lane;
      grid->GlobalCoorToRankIndex(ownerRank,outer,lane,site);

      // Optional to broadcast from node 0.
      grid->Broadcast(grid->BossRank(),s);

      std::vector<sobj> lanes(Nsimd);

      if ( ownerRank != grid->ThisRank() ) return;

      // extract-modify-merge: simplest approach, and this is not perf critical
      extract(l._odata[outer],lanes);
      lanes[lane] = s;
      merge(l._odata[outer],lanes);
    }
    //////////////////////////////////////////////////////////
    // Peek a scalar object from the SIMD array
    //////////////////////////////////////////////////////////
    // Reads the scalar object at global coordinate `site` of lattice `l` into `s`.
    // Every rank extracts from its own data at the returned outer / inner index;
    // the value is then broadcast from the rank reported by GlobalCoorToRankIndex,
    // so all ranks end up with that rank's value.
    template<class vobj,class sobj>
      void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;

      GridBase *grid=l._grid;
      int Nsimd = grid->Nsimd();

      assert( l.checkerboard == l._grid->CheckerBoard(site));

      int ownerRank,outer,lane;
      grid->GlobalCoorToRankIndex(ownerRank,outer,lane,site);

      std::vector<sobj> lanes(Nsimd);
      extract(l._odata[outer],lanes);
      s = lanes[lane];

      grid->Broadcast(ownerRank,s);
    }
    //////////////////////////////////////////////////////////
    // Peek / poke a scalar object in the SIMD array at a local site (no Broadcast)
    //////////////////////////////////////////////////////////
    // Copies the scalar object stored at local coordinate `site` of `l` into `s`.
    // No rank lookup or Broadcast is performed. The vector object is viewed as an
    // array of scalar_type in which word w of SIMD lane idx sits at idx + w*Nsimd.
    template<class vobj,class sobj>
    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;

      GridBase *grid = l._grid;
      int Nsimd = grid->Nsimd();

      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));

      // number of vector_type words making up one vobj
      const int words=sizeof(vobj)/sizeof(vector_type);

      const int lane  = grid->iIndex(site);
      const int outer = grid->oIndex(site);

      scalar_type *src = (scalar_type *)&l._odata[outer];
      scalar_type *dst = (scalar_type *)&s;
      for(int w=0; w<words; ++w){
        dst[w] = src[lane+w*Nsimd];
      }
    }
    // Writes the scalar object `s` into lattice `l` at local coordinate `site`.
    // No rank lookup or Broadcast is performed. The vector object is viewed as an
    // array of scalar_type in which word w of SIMD lane idx sits at idx + w*Nsimd.
    template<class vobj,class sobj>
    void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;

      GridBase *grid=l._grid;
      int Nsimd = grid->Nsimd();

      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));

      // number of vector_type words making up one vobj
      const int words=sizeof(vobj)/sizeof(vector_type);

      const int lane  = grid->iIndex(site);
      const int outer = grid->oIndex(site);

      scalar_type *dst = (scalar_type *)&l._odata[outer];
      scalar_type *src = (scalar_type *)&s;   // only read from below
      for(int w=0; w<words; ++w){
        dst[lane+w*Nsimd] = src[w];
      }
    }
 }
 #endif
@@ -0,0 +1,57 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reality.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_REALITY_H
 #define GRID_LATTICE_REALITY_H
 // FIXME: this is the sector of the code whose direction
 // I am most worried about.
 // The choice of burying complex in the SIMD type
 // makes the use of "real" and "imag" very cumbersome.
 namespace Grid {
    // Site-by-site adj(): returns a new lattice on lhs's grid.
    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
      Lattice<vobj> out(lhs._grid);
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        out._odata[site] = adj(lhs._odata[site]);
      }
      return out;
    }
    // Site-by-site conjugate(): returns a new lattice on lhs's grid.
    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
      Lattice<vobj> out(lhs._grid);
      parallel_for(int site=0; site<lhs._grid->oSites(); ++site){
        out._odata[site] = conjugate(lhs._odata[site]);
      }
      return out;
    }
 }
 #endif
@@ -0,0 +1,733 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reduction.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_REDUCTION_H
 #define GRID_LATTICE_REDUCTION_H
 #include <Grid/Grid_Eigen_Dense.h>
 namespace Grid {
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Deterministic Reduction operations
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Squared norm of a lattice: the real part of innerProduct(arg,arg).
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return std::real(innerProduct(arg,arg));
 }
 // Double inner product
 // Global inner product of two lattices, accumulated with innerProductD and
 // returned as a ComplexD.
 //   left, right : lattices to contract; the work split uses left._grid and the
 //                 final GlobalSum uses right._grid (conformability is not
 //                 checked here).
 // Scheme: one slot per entry of grid->SumArraySize(); each slot sums its own
 // range of outer sites privately, reduces across SIMD lanes, and stores a
 // single ComplexD. The slots are then added serially and summed across ranks.
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  // NOTE(review): these two typedefs and `nwork` below are unused in this function.
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  GridBase *grid = left._grid;
  // Slots are spaced `pad` ComplexD entries apart in sumarray.
  const int pad = 8;
  ComplexD  inner;
  Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    int nwork, mywork, myoff;
    // GetWork yields this slot's range of outer sites: [myoff, myoff+mywork)
    GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
    decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
    }
    // All threads sum across SIMD; reduce serial work at end
    // one write per cacheline with streaming store
    ComplexD tmp = Reduce(TensorRemove(vinner)) ;
    vstream(sumarray[thr*pad],tmp);
  }
  // Serial sum over the per-slot partial results.
  inner=0.0;
  for(int i=0;i<grid->SumArraySize();i++){
    inner = inner+sumarray[i*pad];
  } 
  // Combine the per-rank results.
  right._grid->GlobalSum(inner);
  return inner;
 }
 /////////////////////////
 // Fast axpby_norm
 // z = a x + b y
 // return norm z
 /////////////////////////
 // z = a x + y, returning the value computed by axpby_norm_fast with b = 1.
 template<class sobj,class vobj> strong_inline RealD 
 axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
 {
  sobj unity(1.0);   // coefficient of y
  return axpby_norm_fast(z,a,unity,x,y);
 }
 // Fused z = a x + b y with the norm of z accumulated in the same pass.
 //   z    : output lattice; its checkerboard is set from x before the
 //          conformable checks
 //   a, b : scalar coefficients
 //   x, y : input lattices (checked conformable with z and with each other)
 // Returns the sum over all sites and ranks of real(innerProductD(z_site,z_site)).
 // Uses the same slot scheme as innerProduct above: one partial sum per entry of
 // grid->SumArraySize(), stored `pad` entries apart, added serially, then GlobalSum.
 template<class sobj,class vobj> strong_inline RealD 
 axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
 {
  const int pad = 8;
  z.checkerboard = x.checkerboard;
  conformable(z,x);
  conformable(x,y);
  // NOTE(review): these two typedefs and `nwork` below are unused in this function.
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x._grid;
  Vector<RealD> sumarray(grid->SumArraySize()*pad);
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    int nwork, mywork, myoff;
    // GetWork yields this slot's range of outer sites: [myoff, myoff+mywork)
    GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
    // private to thread; sub summation
    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; 
    for(int ss=myoff;ss<mywork+myoff; ss++){
      // compute the site once, accumulate its norm, then store it into z
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vnrm = vnrm + innerProductD(tmp,tmp);
      vstream(z._odata[ss],tmp);
    }
    // reduce across SIMD lanes and keep the real part for this slot
    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
  }
  nrm = 0.0; // sum across threads; linear in thread count but fast
  for(int i=0;i<grid->SumArraySize();i++){
    nrm = nrm+sumarray[i*pad];
  } 
  // Combine the per-rank results.
  z._grid->GlobalSum(nrm);
  return nrm; 
 }
 // sum() of a unary lattice expression: evaluate it with closure(), then sum the result.
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
 {
  const auto &evaluated = closure(expr);
  return sum(evaluated);
 }
 // sum() of a binary lattice expression: evaluate it with closure(), then sum the result.
 template<class Op,class T1,class T2>
 inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
 {
  const auto &evaluated = closure(expr);
  return sum(evaluated);
 }
 // sum() of a trinary lattice expression: evaluate it with closure(), then sum the result.
 template<class Op,class T1,class T2,class T3>
 inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 				      eval(0,std::get<1>(expr.second)),
 				      eval(0,std::get<2>(expr.second))
 				      ))::scalar_object
 {
  const auto &evaluated = closure(expr);
  return sum(evaluated);
 }
 // Sum of a lattice over all sites and ranks, returned as one scalar object.
 // Steps: (1) each slot of grid->SumArraySize() adds up its own range of outer
 // sites; (2) the slots are added serially; (3) the SIMD lanes of the result
 // are extracted and added; (4) GlobalSum combines the ranks.
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
  GridBase *grid=arg._grid;
  int Nsimd = grid->Nsimd();
  std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
  for(int i=0;i<grid->SumArraySize();i++){
    sumarray[i]=zero;
  }
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    // NOTE(review): nwork is declared but never used.
    int nwork, mywork, myoff;
    // GetWork yields this slot's range of outer sites: [myoff, myoff+mywork)
    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
    vobj vvsum=zero;
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vvsum = vvsum + arg._odata[ss];
    }
    sumarray[thr]=vvsum;
  }
  vobj vsum=zero;  // sum across threads
  for(int i=0;i<grid->SumArraySize();i++){
    vsum = vsum+sumarray[i];
  } 
  typedef typename vobj::scalar_object sobj;
  sobj ssum=zero;
  // split the vector object into one scalar object per SIMD lane and add them
  std::vector<sobj>               buf(Nsimd);
  extract(vsum,buf);
  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
  // combine the per-rank results
  arg._grid->GlobalSum(ssum);
  return ssum;
 }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Sums `Data` over every direction except `orthogdim`, giving one scalar object
 // per slice along that direction.
 //   Data      : lattice to reduce
 //   result    : resized to grid->_fdimensions[orthogdim]; entry t receives the
 //               sum for slice t, identical on every rank after the GlobalSum
 //   orthogdim : direction that is kept; asserted to lie in [0, Nd)
 template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 {
  ///////////////////////////////////////////////////////
  // FIXME precision promoted summation
  // may be important for correlation functions
  // But easily avoided by using double precision fields
  ///////////////////////////////////////////////////////
  typedef typename vobj::scalar_object sobj;
  GridBase  *grid = Data._grid;
  assert(grid!=NULL);
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  assert(orthogdim >= 0);
  assert(orthogdim < Nd);
  // extents along orthogdim taken from the grid's _fdimensions / _ldimensions /
  // _rdimensions (presumably full, rank-local and SIMD-reduced -- TODO confirm)
  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];
  std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
  std::vector<sobj> lsSum(ld,zero);                    // sum across these down to scalars
  std::vector<sobj> extracted(Nsimd);                  // splitting the SIMD
  result.resize(fd); // And then global sum to return the same vector to every node 
  for(int r=0;r<rd;r++){
    lvSum[r]=zero;
  }
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
  parallel_for(int r=0;r<rd;r++){
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int ss= so+n*stride+b;
 	lvSum[r]=lvSum[r]+Data._odata[ss];
      }
    }
  }
  // Sum across simd lanes in the plane, breaking out orthog dir.
  std::vector<int> icoor(Nd);
  for(int rt=0;rt<rd;rt++){
    extract(lvSum[rt],extracted);
    for(int idx=0;idx<Nsimd;idx++){
      grid->iCoorFromIindex(icoor,idx);
      // local slice index: plane rt offset by the lane's coordinate along orthogdim times rd
      int ldx =rt+icoor[orthogdim]*rd;
      lsSum[ldx]=lsSum[ldx]+extracted[idx];
    }
  }
  // sum over nodes.
  // One GlobalSum per slice t: only ranks whose _processor_coor[orthogdim]
  // equals t/ld contribute lsSum[t%ld]; every other rank contributes zero.
  sobj gsum;
  for(int t=0;t<fd;t++){
    int pt = t/ld; // processor plane
    int lt = t%ld;
    if ( pt == grid->_processor_coor[orthogdim] ) {
      gsum=lsSum[lt];
    } else {
      gsum=zero;
    }
    grid->GlobalSum(gsum);
    result[t]=gsum;
  }
 }
 // Slice inner product in two stages: the per-rank sums are computed by
 // localSliceInnerProductVector, then combined across ranks by
 // globalSliceInnerProductVector. `result` is filled by the second stage.
 template<class vobj>
 static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
  typedef typename vobj::scalar_type scalar_type;

  std::vector<scalar_type> localSums;   // per-rank slice sums, filled by the local stage
  localSliceInnerProductVector(result, lhs, rhs, localSums, orthogdim);
  globalSliceInnerProductVector(result, lhs, localSums, orthogdim);
 }
 // Rank-local stage of the slice inner product.
 //   result    : only resized here, to grid->_fdimensions[orthogdim]
 //   lhs, rhs  : lattices to contract (their grids are checked with conformable)
 //   lsSum     : resized to grid->_ldimensions[orthogdim] with zero fill for new
 //               entries; receives this rank's per-slice sums. resize() does not
 //               clear entries that already exist, so pass an empty vector.
 //   orthogdim : direction that is kept; asserted to lie in [0, Nd)
 template <class vobj>
 static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
 {
  typedef typename vobj::vector_type   vector_type;
  typedef typename vobj::scalar_type   scalar_type;
  GridBase  *grid = lhs._grid;
  assert(grid!=NULL);
  conformable(grid,rhs._grid);
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  assert(orthogdim >= 0);
  assert(orthogdim < Nd);
  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];
  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
  lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars
  std::vector<iScalar<scalar_type>> extracted(Nsimd);   // splitting the SIMD
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
  for(int r=0;r<rd;r++){
    lvSum[r]=zero;
  }
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
  // Parallel over the orthogonal direction: iteration r only writes lvSum[r].
  parallel_for(int r=0;r<rd;r++)
  {
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss = so + n * stride + b;
        // The temporary must be local to the loop body: a single variable
        // declared outside the parallel loop would be written by all threads.
        vector_type vv;
        vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
        lvSum[r] = lvSum[r] + vv;
      }
    }
  }
  // Sum across simd lanes in the plane, breaking out orthog dir.
  std::vector<int> icoor(Nd);
  for(int rt=0;rt<rd;rt++){
    // wrap in iScalar so that extract() can split the SIMD lanes
    iScalar<vector_type> temp;
    temp._internal = lvSum[rt];
    extract(temp,extracted);
    for(int idx=0;idx<Nsimd;idx++){
      grid->iCoorFromIindex(icoor,idx);
      // local slice index: plane rt offset by the lane's coordinate along orthogdim times rd
      int ldx =rt+icoor[orthogdim]*rd;
      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
    }
  }
 }
 template <class vobj>
 static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
 {
  ////////////////////////////////////////////////////////////////////////////
  // Scatter the per-slice sums in lsSum into a vector whose length is
  // result.size(), filling only the entries whose processor plane matches this
  // node's coordinate along orthogdim, then GlobalSumVector that vector and
  // store it in result.
  ////////////////////////////////////////////////////////////////////////////
  typedef typename vobj::scalar_type scalar_type;

  GridBase *grid        = lhs._grid;
  const int fullExtent  = result.size();
  const int localExtent = lsSum.size();

  // Entries not owned by this node stay zero so that they drop out of the sum.
  std::vector<scalar_type> summed(fullExtent, scalar_type(0.0));

  for(int t=0; t<fullExtent; t++){
    const int plane  = t/localExtent; // processor plane holding global slice t
    const int within = t%localExtent; // index of that slice inside the plane
    if ( plane == grid->_processor_coor[orthogdim] ) {
      summed[t] = lsSum[within];
    }
  }

  grid->GlobalSumVector(summed.data(), fullExtent);
  result = summed;
 }
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
  typedef typename vobj::vector_type   vector_type;
  typedef typename vobj::scalar_type   scalar_type;
  GridBase  *grid = lhs._grid;
  assert(grid!=NULL);
  conformable(grid,rhs._grid);
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  assert(orthogdim >= 0);
  assert(orthogdim < Nd);
  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];
  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
  std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
  std::vector<iScalar<scalar_type> > extracted(Nsimd);                  // splitting the SIMD
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
  for(int r=0;r<rd;r++){
    lvSum[r]=zero;
  }
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
  parallel_for(int r=0;r<rd;r++){
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int ss= so+n*stride+b;
 	vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
 	lvSum[r]=lvSum[r]+vv;
      }
    }
  }
  // Sum across simd lanes in the plane, breaking out orthog dir.
  std::vector<int> icoor(Nd);
  for(int rt=0;rt<rd;rt++){
    iScalar<vector_type> temp; 
    temp._internal = lvSum[rt];
    extract(temp,extracted);
    for(int idx=0;idx<Nsimd;idx++){
      grid->iCoorFromIindex(icoor,idx);
      int ldx =rt+icoor[orthogdim]*rd;
      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
    }
  }
  // sum over nodes.
  scalar_type gsum;
  for(int t=0;t<fd;t++){
    int pt = t/ld; // processor plane
    int lt = t%ld;
    if ( pt == grid->_processor_coor[orthogdim] ) {
      gsum=lsSum[lt];
    } else {
      gsum=scalar_type(0.0);
    }
    grid->GlobalSum(gsum);
    result[t]=gsum;
  }
 }
 template<class vobj>
 static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
 {
  // Per-slice squared norm: the real part of the slice inner product of rhs
  // with itself, one entry per global coordinate along Orthog.
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  const int nSlices = rhs._grid->GlobalDimensions()[Orthog];

  std::vector<ComplexD> selfProduct(nSlices);
  sn.resize(nSlices);

  sliceInnerProductVector(selfProduct,rhs,rhs,Orthog);

  for(int t=0;t<nSlices;t++){
    sn[t] = real(selfProduct[t]);
  }
 }
 template<class vobj>
 static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
                             int orthogdim,RealD scale=1.0) 
 {    
  // Slice-wise multiply-add: on every site of local slice t along orthogdim
  //     R = (scale * a[t]) * X + Y
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::tensor_reduced tensor_reduced;

  scalar_type zscale(scale);

  GridBase *grid  = X._grid;

  int Nsimd  =grid->Nsimd();
  int Nblock =grid->GlobalDimensions()[orthogdim];

  int fd     =grid->_fdimensions[orthogdim];
  int ld     =grid->_ldimensions[orthogdim];
  int rd     =grid->_rdimensions[orthogdim];

  int nOuter =grid->_slice_nblock[orthogdim];
  int nInner =grid->_slice_block [orthogdim];
  int jump   =grid->_slice_stride[orthogdim];

  std::vector<int> laneCoor;
  for(int plane=0;plane<rd;plane++){

    int planeBase=plane*grid->_ostride[orthogdim]; // first site of this reduced plane

    // Build one SIMD word of coefficients: each lane takes the coefficient of the
    // local slice that lane belongs to.
    vector_type  coeffVec;
    scalar_type *coeffLane =(scalar_type *)&coeffVec;
    for(int lane=0;lane<Nsimd;lane++){
      grid->iCoorFromIindex(laneCoor,lane);
      int localSlice =plane+laneCoor[orthogdim]*rd;
      coeffLane[lane] = scalar_type(a[localSlice])*zscale;
    }

    tensor_reduced coeff;
    coeff=coeffVec;

    parallel_for_nest2(int n=0;n<nOuter;n++){
      for(int b=0;b<nInner;b++){
        int site= planeBase+n*jump+b;
        R._odata[site] = coeff*X._odata[site]+Y._odata[site];
      }
    }
  }
 }
 /*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
  int nsimd = BlockSolverGrid->Nsimd();
  std::vector<int> latt_phys(0);
  std::vector<int> simd_phys(0);
  std::vector<int>  mpi_phys(0);
  for(int d=0;d<NN;d++){
    if( d!=Orthog ) { 
      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
    }
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
 */
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  // For every site of the plane orthogonal to Orthog, and every slice i:
  //     R_i = Y_i + sum_j X_j * (scale * aa(j,i))
  // Requires the SIMD layout along Orthog to be 1.
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  GridBase *FullGrid = X._grid;
  const int Nblock   = X._grid->GlobalDimensions()[Orthog];

  assert( FullGrid->_simd_layout[Orthog]==1);

  //FIXME package in a convenient iterator
  // Geometry of the plane orthogonal to direction "Orthog"
  const int planeStride = FullGrid->_slice_stride[Orthog];
  const int planeBlock  = FullGrid->_slice_block [Orthog];
  const int planeNblock = FullGrid->_slice_nblock[Orthog];
  const int sliceStride = FullGrid->_ostride[Orthog];

 #pragma omp parallel 
  {
    // Per-thread copy of the Nblock values of X along Orthog at the current site.
    std::vector<vobj> xColumn(Nblock);

 #pragma omp for collapse(2)
    for(int n=0;n<planeNblock;n++){
    for(int b=0;b<planeBlock;b++){

      const int base = n*planeStride + b;

      for(int k=0;k<Nblock;k++){
        xColumn[k] = X[base+k*sliceStride];
      }

      vobj acc;
      for(int i=0;i<Nblock;i++){
        acc = Y[base+i*sliceStride];
        for(int j=0;j<Nblock;j++){
          acc = acc + xColumn[j]*(scale*aa(j,i));
        }
        R[base+i*sliceStride]=acc;
      }
    }}
  }
 }
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  // For every site of the plane orthogonal to Orthog, and every slice i:
  //     R_i = sum_j X_j * (scale * aa(j,i))
  // Requires the SIMD layout along Orthog to be 1.
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  GridBase *FullGrid = X._grid;
  const int Nblock   = X._grid->GlobalDimensions()[Orthog];

  assert( FullGrid->_simd_layout[Orthog]==1);

  //FIXME package in a convenient iterator
  // Geometry of the plane orthogonal to direction "Orthog"
  const int planeStride = FullGrid->_slice_stride[Orthog];
  const int planeBlock  = FullGrid->_slice_block [Orthog];
  const int planeNblock = FullGrid->_slice_nblock[Orthog];
  const int sliceStride = FullGrid->_ostride[Orthog];

 #pragma omp parallel 
  {
    // Per-thread copy of the Nblock values of X along Orthog at the current site.
    std::vector<vobj> xColumn(Nblock);

 #pragma omp for collapse(2)
    for(int n=0;n<planeNblock;n++){
    for(int b=0;b<planeBlock;b++){

      const int base = n*planeStride + b;

      for(int k=0;k<Nblock;k++){
        xColumn[k] = X[base+k*sliceStride];
      }

      vobj acc;
      for(int i=0;i<Nblock;i++){
        // Seed the accumulator with the j=0 term, then add the remaining ones.
        acc = xColumn[0]*(scale*aa(0,i));
        for(int j=1;j<Nblock;j++){
          acc = acc + xColumn[j]*(scale*aa(j,i));
        }
        R[base+i*sliceStride]=acc;
      }
    }}
  }
 }
 ////////////////////////////////////////////////////////////////////////////
 // Fills mat (Nblock x Nblock, Nblock = global extent of Orthog) with
 //   mat(i,j) = sum over sites of the plane orthogonal to Orthog of
 //              innerProduct( lhs at slice i , rhs at slice j )
 // followed by a GlobalSum of every matrix element.
 // Requires the SIMD layout along Orthog to be 1 (asserted).
 ////////////////////////////////////////////////////////////////////////////
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs._grid;
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
  //  Lattice<vobj> Lslice(SliceGrid);
  //  Lattice<vobj> Rslice(SliceGrid);
  // Start from zero: the per-thread partial results are added into mat below.
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  assert( FullGrid->_simd_layout[Orthog]==1);
  // NOTE(review): nh and nl are not used anywhere below.
  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  // NOTE(review): vector_typeD is not used anywhere below.
  typedef typename vobj::vector_typeD vector_typeD;
 #pragma omp parallel 
  {
    // Everything declared in this region is private to the executing thread.
    std::vector<vobj> Left(Nblock);
    std::vector<vobj> Right(Nblock);
    // Thread-private accumulator, merged into mat under the critical section below.
    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      // Gather the Nblock values along Orthog for this site of the plane.
      for(int i=0;i<Nblock;i++){
 	Left [i] = lhs[o+i*ostride];
 	Right[i] = rhs[o+i*ostride];
      }
      for(int i=0;i<Nblock;i++){
      for(int j=0;j<Nblock;j++){
 	auto tmp = innerProduct(Left[i],Right[j]);
 	auto rtmp = TensorRemove(tmp);
 	// presumably Reduce() collapses the SIMD lanes to one scalar - TODO confirm
 	mat_thread(i,j) += Reduce(rtmp);
      }}
    }}
 #pragma omp critical
    {
      // Serialised so that concurrent threads do not update mat at the same time.
      mat += mat_thread;
    }  
  }
  // One GlobalSum call per matrix element, i.e. Nblock*Nblock calls in total.
  for(int i=0;i<Nblock;i++){
  for(int j=0;j<Nblock;j++){
    ComplexD sum = mat(i,j);
    FullGrid->GlobalSum(sum);
    mat(i,j)=sum;
  }}
  return;
 }
 } /*END NAMESPACE GRID*/
 #endif
@@ -0,0 +1,516 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_rng.h
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_RNG_H
 #define GRID_LATTICE_RNG_H
 #include <random>
 #ifdef RNG_SITMO
 #include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
 #endif 
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
 #else 
 #undef  RNG_FAST_DISCARD
 #endif
 namespace Grid {
  //////////////////////////////////////////////////////////////
  // Allow the RNG state to be less dense than the fine grid
  //////////////////////////////////////////////////////////////
  inline int RNGfillable(GridBase *coarse,GridBase *fine)
  {
    // Returns how many reduced sites of `fine` correspond to one reduced site of
    // `coarse`. The leading (fine - coarse) dimensions of `fine` must be neither
    // SIMD-decomposed nor spread over processors; the remaining dimensions must
    // have matching processor and SIMD layouts and divide cleanly.
    const int rngdims   = coarse->_ndimension;
    const int lowerdims = fine->_ndimension - rngdims;
    assert(lowerdims >= 0);

    int multiplicity=1;

    // Extra leading dimensions: checked, then taken in full.
    for(int d=0;d<lowerdims;d++){
      assert(fine->_simd_layout[d]==1);
      assert(fine->_processors[d]==1);
      multiplicity *= fine->_rdimensions[d];
    }

    // Shared dimensions: same decomposition, integer ratio of reduced extents.
    for(int d=0;d<rngdims;d++){
      const int fd           = d+lowerdims;
      const int fineExtent   = fine->_rdimensions[fd];
      const int coarseExtent = coarse->_rdimensions[d];

      assert(coarse->_processors[d]  == fine->_processors[fd]);
      assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
      assert((fineExtent % coarseExtent)==0);

      multiplicity = (multiplicity * fineExtent) / coarseExtent;
    }

    return multiplicity;
  }
 // merge of April 11 2017
  // this function is necessary for the LS vectorised field
  inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
  {
    // Returns the number of local sites of `fine` per local site of `coarse`.
    const int rngdims   = coarse->_ndimension;
    const int lowerdims = fine->_ndimension - rngdims;
    assert(lowerdims >= 0);

    // The extra leading dimensions of `fine` must not be spread over processors,
    // so that every further subdivision is local to the node.
    for(int d=0;d<lowerdims;d++){
      assert(fine->_processors[d]==1);
    }

    // The shared dimensions must use the same processor decomposition.
    for(int d=0;d<rngdims;d++){
      assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
    }

    // Equal total SIMD width means the inner (iSites) count is the same on both grids.
    assert(fine->Nsimd() == coarse->Nsimd());

    // The two local volumes must divide cleanly.
    const auto fineSites   = fine->lSites();
    const auto coarseSites = coarse->lSites();
    assert( (fineSites / coarseSites) * coarseSites == fineSites );

    return fineSites / coarseSites;
  }
  // fillScalar: store one sample of `dist`, driven by `gen`, into `s`.
  // Generic case (real scalars are one component): a single draw.
  template<class scalar,class distribution,class generator> 
  void fillScalar(scalar &s,distribution &dist,generator & gen)
  {
    s=dist(gen);
  }
  // Single precision complex: two draws, one per component.
  // NOTE(review): both draws are function arguments of the ComplexF constructor,
  // so C++ does not specify which is evaluated first; whether the first number
  // drawn becomes the real or the imaginary part may differ between compilers.
  template<class distribution,class generator> 
  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
  {
    s=ComplexF(dist(gen),dist(gen));
  }
  // Double precision complex: two draws, one per component.
  // NOTE(review): same unspecified evaluation order as the ComplexF overload.
  template<class distribution,class generator> 
  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
  {
    s=ComplexD(dist(gen),dist(gen));
  }
  class GridRNGbase {
  public:
    ////////////////////////////////////////////////////////////////////////
    // Engine selection at compile time. Each choice fixes the engine type,
    // the integer type of one word of serialised state, and the number of
    // such words read/written by GetState/SetState.
    ////////////////////////////////////////////////////////////////////////
 #ifdef RNG_RANLUX
    typedef std::ranlux48 RngEngine;
    typedef uint64_t      RngStateType;
    static const int RngStateCount = 15;
 #endif 
 #ifdef RNG_MT19937 
    typedef std::mt19937 RngEngine;
    typedef uint32_t     RngStateType;
    static const int     RngStateCount = std::mt19937::state_size;
 #endif
 #ifdef RNG_SITMO
    typedef sitmo::prng_engine 	RngEngine;
    typedef uint64_t    	RngStateType;
    static const int    	RngStateCount = 13;
 #endif

    // Generators and the distributions drawing from them, indexed identically.
    std::vector<RngEngine>                             _generators;
    std::vector<std::uniform_real_distribution<RealD> > _uniform;
    std::vector<std::normal_distribution<RealD> >       _gaussian;
    std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
    std::vector<std::uniform_int_distribution<uint32_t> > _uid;

    ///////////////////////
    // support for parallel init
    ///////////////////////
 #ifdef RNG_FAST_DISCARD
    static void Skip(RngEngine &eng,uint64_t site)
    {
      /////////////////////////////////////////////////////////////////////////////////////
      // Advance the engine by site * 2^30 draws, so that successive lattice sites
      // start 2^30 elements apart in one common stream.
      //
      // History: the spacing used to be 2^40 (about 10^12). The argument was that
      // quenched updating is unlikely to exceed 1000 sweeps per second, giving of
      // order 10^9 seconds of head room, and HMC consumes far less. It was replaced
      // with 2^30 to avoid a problem on large volumes. Sitmo could be hacked to skip
      // in the higher order words of state if more is ever necessary.
      /////////////////////////////////////////////////////////////////////////////////////
      const int      shift = 30;
      const uint64_t skip  = site<<shift;
      assert((skip >> shift)==site); // the shift must not have lost high bits
      eng.discard(skip);
    } 
 #endif

    // Build a new engine seeded from draws of `eng`, using throw-away scratch objects.
    static RngEngine Reseed(RngEngine &eng)
    {
      std::vector<uint32_t>                   scratchSeed;
      std::uniform_int_distribution<uint32_t> scratchDist;
      return Reseed(eng,scratchSeed,scratchDist);
    }

    // Build a new engine seeded from four draws of `uid` on `eng`.
    // On return `newseed` holds the four words that were used.
    static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
                            std::uniform_int_distribution<uint32_t> &uid)
    {
      const int reseeds=4;
      newseed.resize(reseeds);
      for(int w=0;w<reseeds;w++){
        newseed[w] = uid(eng);
      }
      std::seed_seq sequence(newseed.begin(),newseed.end());
      return RngEngine(sequence);
    }    

    // Serialise `eng` through its stream operator and read back the first
    // RngStateCount words into `saved`.
    void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
      saved.resize(RngStateCount);
      std::stringstream text;
      text<<eng;
      text.seekg(0,text.beg);
      for(int w=0;w<RngStateCount;w++){
        text>>saved[w];
      }
    }
    void GetState(std::vector<RngStateType> & saved,int gen) {
      GetState(saved,_generators[gen]);
    }

    // Inverse of GetState: write the words space-separated and stream them into `eng`.
    void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
      assert(saved.size()==RngStateCount);
      std::stringstream text;
      for(int w=0;w<RngStateCount;w++){
        text<< saved[w]<<" ";
      }
      text.seekg(0,text.beg);
      text>>eng;
    }
    void SetState(std::vector<RngStateType> & saved,int gen){
      SetState(saved,_generators[gen]);
    }

    // Whole-engine copy in / out of slot `gen`.
    void SetEngine(RngEngine &Eng, int gen){
      _generators[gen]=Eng;
    }
    void GetEngine(RngEngine &Eng, int gen){
      Eng=_generators[gen];
    }

    // Replace generator `gen` by a fresh engine constructed from `src`.
    template<class source> void Seed(source &src, int gen)
    {
      _generators[gen] = RngEngine(src);
    }    
  };
  class GridSerialRNG : public GridRNGbase {
  public:
    GridSerialRNG() : GridRNGbase() {
      _generators.resize(1);
      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
      _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
    }
    template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
      typedef typename sobj::scalar_type scalar_type;
      int words = sizeof(sobj)/sizeof(scalar_type);
      scalar_type *buf = (scalar_type *) & l;
      dist[0].reset();
      for(int idx=0;idx<words;idx++){
 	fillScalar(buf[idx],dist[0],_generators[0]);
      }
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    };
    template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
      dist[0].reset();
      fillScalar(l,dist[0],_generators[0]);
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){
      dist[0].reset();
      fillScalar(l,dist[0],_generators[0]);
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){
      dist[0].reset();
      fillScalar(l,dist[0],_generators[0]);
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){
      dist[0].reset();
      fillScalar(l,dist[0],_generators[0]);
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    // vector fill
    template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){
      RealF *pointer=(RealF *)&l;
      dist[0].reset();
      for(int i=0;i<2*vComplexF::Nsimd();i++){
 	fillScalar(pointer[i],dist[0],_generators[0]);
      }
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){
      RealD *pointer=(RealD *)&l;
      dist[0].reset();
      for(int i=0;i<2*vComplexD::Nsimd();i++){
 	fillScalar(pointer[i],dist[0],_generators[0]);
      }
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){
      RealF *pointer=(RealF *)&l;
      dist[0].reset();
      for(int i=0;i<vRealF::Nsimd();i++){
 	fillScalar(pointer[i],dist[0],_generators[0]);
      }
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){
      RealD *pointer=(RealD *)&l;
      dist[0].reset();
      for(int i=0;i<vRealD::Nsimd();i++){
 	fillScalar(pointer[i],dist[0],_generators[0]);
      }
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
    }
    void SeedFixedIntegers(const std::vector<int> &seeds){
      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
      std::seed_seq src(seeds.begin(),seeds.end());
      Seed(src,0);
    }
    void SeedUniqueString(const std::string &s){
      std::vector<int> seeds;
      std::stringstream sha;
      seeds = GridChecksum::sha256_seeds(s);
      for(int i=0;i<seeds.size();i++) { 
        sha << std::hex << seeds[i];
      }
      std::cout << GridLogMessage << "Intialising serial RNG with unique string '" 
                << s << "'" << std::endl;
      std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
      SeedFixedIntegers(seeds);
    }
  };
  ////////////////////////////////////////////////////////////////////////
  // One generator (and one set of distributions) per local lattice site
  // of the grid passed to the constructor.
  ////////////////////////////////////////////////////////////////////////
  class GridParallelRNG : public GridRNGbase {
    double _time_counter; // microseconds accumulated inside fill(); printed by Report()
  public:
    GridBase *_grid;
    unsigned int _vol;

    // Generator slot for outer site `os`, SIMD lane `is`: lane-major ordering.
    int generator_idx(int os,int is) {
      return is*_grid->oSites()+os;
    }

    // _time_counter is initialised explicitly: fill() accumulates into it and
    // Report() prints it, so it must not start from an indeterminate value.
    GridParallelRNG(GridBase *grid) : GridRNGbase(), _time_counter(0.0) {
      _grid = grid;
      _vol  =_grid->iSites()*_grid->oSites();
      _generators.resize(_vol);
      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
      _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
    }

    // Fill every site of `l` with samples of `dist`. The grid of `l` may be finer
    // than the RNG grid by an integer multiplicity (checked by RNGfillable_general);
    // each generator is then used for `multiplicity` consecutive sites of `l`.
    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
      typedef typename vobj::scalar_object scalar_object;
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
      double inner_time_counter = usecond();
      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
      int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
      int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
      int words  = sizeof(scalar_object) / sizeof(scalar_type);
      parallel_for(int ss=0;ss<osites;ss++){
        std::vector<scalar_object> buf(Nsimd); // one scalar object per SIMD lane
        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
          for (int si = 0; si < Nsimd; si++) {
            int gdx = generator_idx(ss, si);  // index of generator state
            scalar_type *pointer = (scalar_type *)&buf[si];
            dist[gdx].reset();
            for (int idx = 0; idx < words; idx++) 
              fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
          }
          // merge into SIMD lanes, FIXME suboptimal implementation
          merge(l._odata[sm], buf);
        }
      }
      _time_counter += usecond()- inner_time_counter;
    }

    // Seed from the integers GridChecksum::sha256_seeds derives from `s`.
    void SeedUniqueString(const std::string &s){
      std::vector<int> seeds;
      seeds = GridChecksum::sha256_seeds(s);
      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
                << s << "'" << std::endl;
      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
      SeedFixedIntegers(seeds);
    }

    // Seed every local generator starting from one master engine built from
    // `seeds` (after a BroadcastWorld from root 0).
    void SeedFixedIntegers(const std::vector<int> &seeds){
      // Everyone generates the same seed_seq based on input seeds.
      // Broadcast into a private copy: the broadcast writes into its buffer on the
      // receiving ranks, which must not be done through the caller's const vector.
      // data() is also well defined when the list is empty, unlike &seeds[0].
      std::vector<int> agreed(seeds);
      CartesianCommunicator::BroadcastWorld(0,(void *)agreed.data(),sizeof(int)*agreed.size());
      std::seed_seq source(agreed.begin(),agreed.end());
      RngEngine master_engine(source);
 #ifdef RNG_FAST_DISCARD
      ////////////////////////////////////////////////
      // Skip ahead through a single stream.
      // Applicable to SITMO and other hash based/crypto RNGs
      // Should be applicable to Mersenne Twister, but the C++11
      // MT implementation does not implement fast discard even though
      // in principle this is possible
      ////////////////////////////////////////////////
      // Everybody loops over global volume.
      parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
	// Where is it?
	int rank,o_idx,i_idx;
	std::vector<int> gcoor;
	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
	// If this is one of mine we take it
	if( rank == _grid->ThisRank() ){
	  int l_idx=generator_idx(o_idx,i_idx);
	  _generators[l_idx] = master_engine;
	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
	}
      }
 #else 
      ////////////////////////////////////////////////////////////////
      // Machine and thread decomposition dependent seeding is efficient
      // and maximally parallel; but NOT reproducible from machine to machine. 
      // Not ideal, but fastest way to reseed all nodes.
      ////////////////////////////////////////////////////////////////
      {
	// Obtain one Reseed per processor
	int Nproc = _grid->ProcessorCount();
	std::vector<RngEngine> seeders(Nproc);
	int me= _grid->ThisRank();
	for(int p=0;p<Nproc;p++){
	  seeders[p] = Reseed(master_engine);
	}
	master_engine = seeders[me];
      }
      {
	// Obtain one reseeded generator per thread
	int Nthread = GridThread::GetThreads();
	std::vector<RngEngine> seeders(Nthread);
	for(int t=0;t<Nthread;t++){
	  seeders[t] = Reseed(master_engine);
	}
	parallel_for(int t=0;t<Nthread;t++) {
	  // set up one per local site in threaded fashion
	  std::vector<uint32_t> newseeds;
	  std::uniform_int_distribution<uint32_t> uid;	
	  for(int l=0;l<_grid->lSites();l++) {
	    if ( (l%Nthread)==t ) {
	      _generators[l] = Reseed(seeders[t],newseeds,uid);
	    }
	  }
	}
      }
 #endif
    }

    // Print the time accumulated in fill(), converted from microseconds to ms.
    void Report(){
      std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////
    // Support for rigorous test of RNG's
    // Return uniform random uint32_t from requested site generator
    ////////////////////////////////////////////////////////////////////////
    uint32_t GlobalU01(int gsite){
      // Initialised so that ranks which do not own the site never hand an
      // indeterminate value to Broadcast; the owner's value overwrites it.
      uint32_t the_number = 0;
      // who
      std::vector<int> gcoor;
      int rank,o_idx,i_idx;
      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
      // draw
      int l_idx=generator_idx(o_idx,i_idx);
      if( rank == _grid->ThisRank() ){
	the_number = _uid[l_idx](_generators[l_idx]);
      }
      // share & return
      _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
      return the_number;
    }
  };
  // Lattice-wide fills: draw every site of `l` from the per-site generators of `rng`.
  template <class vobj>
  inline void random(GridParallelRNG &rng,Lattice<vobj> &l)
  {
    rng.fill(l,rng._uniform);
  }
  template <class vobj>
  inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l)
  {
    rng.fill(l,rng._gaussian);
  }
  template <class vobj>
  inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l)
  {
    rng.fill(l,rng._bernoulli);
  }
  // Single-object fills from the serial generator.
  template <class sobj>
  inline void random(GridSerialRNG &rng,sobj &l)
  {
    rng.fill(l,rng._uniform);
  }
  template <class sobj>
  inline void gaussian(GridSerialRNG &rng,sobj &l)
  {
    rng.fill(l,rng._gaussian);
  }
  template <class sobj>
  inline void bernoulli(GridSerialRNG &rng,sobj &l)
  {
    rng.fill(l,rng._bernoulli);
  }
 }
 #endif
@@ -0,0 +1,67 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_trace.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_TRACE_H
 #define GRID_LATTICE_TRACE_H
 ///////////////////////////////////////////////
 // Tracing, transposing, peeking, poking
 ///////////////////////////////////////////////
 namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Trace
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class vobj>
    inline auto trace(const Lattice<vobj> &lhs)
      -> Lattice<decltype(trace(lhs._odata[0]))>
    {
      // Site-by-site trace: the result lives on the same grid as lhs and holds
      // trace() of each site object.
      typedef decltype(trace(lhs._odata[0])) traced_site;

      Lattice<traced_site> result(lhs._grid);
      parallel_for(int site=0;site<lhs._grid->oSites();site++){
        result._odata[site] = trace(lhs._odata[site]);
      }
      return result;
    }
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Trace Index level dependent operation
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
    inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
    {
      // Site-by-site traceIndex<Index>: the result lives on the same grid as lhs.
      typedef decltype(traceIndex<Index>(lhs._odata[0])) traced_site;

      Lattice<traced_site> result(lhs._grid);
      parallel_for(int site=0;site<lhs._grid->oSites();site++){
        result._odata[site] = traceIndex<Index>(lhs._odata[site]);
      }
      return result;
    }
 }
 #endif
@@ -0,0 +1,63 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_transpose.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_TRANSPOSE_H
 #define GRID_LATTICE_TRANSPOSE_H
 ///////////////////////////////////////////////
 // Transpose
 ///////////////////////////////////////////////
 namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Transpose
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site-wise transpose: applies transpose() to every outer-site object of
  // lhs and returns the results in a new lattice on the same grid.
  template<class vobj>
  inline Lattice<vobj> transpose(const Lattice<vobj> &lhs)
  {
    Lattice<vobj> result(lhs._grid);
    parallel_for(int site=0; site<lhs._grid->oSites(); site++){
      result._odata[site] = transpose(lhs._odata[site]);
    }
    return result;
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Index level dependent transpose
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site-wise transpose of the tensor level selected by Index: applies
  // transposeIndex<Index>() to every outer-site object of lhs.
  template<int Index,class vobj>
  inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
  {
    typedef decltype(transposeIndex<Index>(lhs._odata[0])) transposed_t;
    Lattice<transposed_t> result(lhs._grid);
    parallel_for(int site=0; site<lhs._grid->oSites(); site++){
      result._odata[site] = transposeIndex<Index>(lhs._odata[site]);
    }
    return result;
  }
 }
 #endif
@@ -0,0 +1,84 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_unary.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_UNARY_H
 #define GRID_LATTICE_UNARY_H
 namespace Grid {
  // Site-wise power: returns a lattice whose every outer-site object is
  // pow(rhs site, y). The result inherits rhs's grid and checkerboard.
  template<class obj>
  Lattice<obj> pow(const Lattice<obj> &rhs,RealD y)
  {
    Lattice<obj> result(rhs._grid);
    result.checkerboard = rhs.checkerboard;
    conformable(result,rhs);
    parallel_for(int site=0; site<rhs._grid->oSites(); site++){
      result._odata[site] = pow(rhs._odata[site],y);
    }
    return result;
  }
  // Site-wise modulus: returns a lattice whose every outer-site object is
  // mod(rhs site, y). The result inherits rhs's grid and checkerboard.
  template<class obj>
  Lattice<obj> mod(const Lattice<obj> &rhs,Integer y)
  {
    Lattice<obj> result(rhs._grid);
    result.checkerboard = rhs.checkerboard;
    conformable(result,rhs);
    parallel_for(int site=0; site<rhs._grid->oSites(); site++){
      result._odata[site] = mod(rhs._odata[site],y);
    }
    return result;
  }
  // Site-wise division: returns a lattice whose every outer-site object is
  // div(rhs site, y). The result inherits rhs's grid and checkerboard.
  template<class obj>
  Lattice<obj> div(const Lattice<obj> &rhs,Integer y)
  {
    Lattice<obj> result(rhs._grid);
    result.checkerboard = rhs.checkerboard;
    conformable(result,rhs);
    parallel_for(int site=0; site<rhs._grid->oSites(); site++){
      result._odata[site] = div(rhs._odata[site],y);
    }
    return result;
  }
  // Site-wise matrix exponential: returns a lattice whose every outer-site
  // object is Exponentiate(rhs site, alpha, Nexp). The result inherits rhs's
  // grid and checkerboard.
  template<class obj>
  Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP)
  {
    Lattice<obj> result(rhs._grid);
    result.checkerboard = rhs.checkerboard;
    conformable(result,rhs);
    parallel_for(int site=0; site<rhs._grid->oSites(); site++){
      result._odata[site] = Exponentiate(rhs._odata[site],alpha, Nexp);
    }
    return result;
  }
 }
 #endif
@@ -0,0 +1,86 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_where.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_WHERE_H
 #define GRID_LATTICE_WHERE_H
 namespace Grid {
 // Must implement the predicate gating the selection between iftrue and iffalse.
 // Must be able to reduce the predicate down to a single vInteger per site.
 // Must be able to require the type be iScalar x iScalar x ....
 //                              give a GetVtype method in iScalar
 //                              and blow away the tensor structures.
 //
 // Lane-wise select between two lattices.
 //
 // For every SIMD lane of every outer site, ret receives the iftrue value
 // where the corresponding predicate lane is non-zero and the iffalse value
 // otherwise.
 //
 //   ret       - destination lattice (overwritten)
 //   predicate - lattice whose site objects reduce, via TensorRemove, to a
 //               vInteger mask
 //   iftrue    - values selected where the mask lane is non-zero
 //   iffalse   - values selected where the mask lane is zero
 //
 // iftrue must be conformable with iffalse, predicate and ret.
 template<class vobj,class iobj>
 inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
   conformable(iftrue,iffalse);
   conformable(iftrue,predicate);
   conformable(iftrue,ret);
   GridBase *grid=iftrue._grid;
   typedef typename vobj::scalar_object scalar_object;
   const int Nsimd = grid->Nsimd();
   parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
     // The scratch buffers are declared inside the loop body so that every
     // thread works on its own copies; declaring them outside the parallel
     // loop would share them between threads and race on their contents.
     std::vector<Integer> mask(Nsimd);
     std::vector<scalar_object> truevals (Nsimd);
     std::vector<scalar_object> falsevals(Nsimd);
     extract(iftrue._odata[ss]   ,truevals);
     extract(iffalse._odata[ss]  ,falsevals);
     extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
     // Overwrite the "false" lanes that the mask selects, then repack.
     for(int s=0;s<Nsimd;s++){
       if (mask[s]) falsevals[s]=truevals[s];
     }
     merge(ret._odata[ss],falsevals);
   }
 }
 // Value-returning form of the lane-wise select: builds a new lattice on
 // iftrue's grid and fills it through the four-argument whereWolf overload.
 //
 //   predicate - mask lattice
 //   iftrue    - values selected where the mask lane is non-zero
 //   iffalse   - values selected where the mask lane is zero
 //
 // Returns the selected lattice.
 template<class vobj,class iobj>
 inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
   conformable(iftrue,iffalse);
   conformable(iftrue,predicate);
   Lattice<vobj> ret(iftrue._grid);
   // Forward to the in-place overload defined in this header. The previous
   // call named `where`, which this header does not define.
   whereWolf(ret,predicate,iftrue,iffalse);
   return ret;
 }
 }
 #endif
@@ -0,0 +1,116 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/Log.cc
 Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/util/CompilerCompatible.h>
 #include <cxxabi.h>
 #include <memory>
 namespace Grid {
  // Returns the demangled form of a symbol name produced by
  // abi::__cxa_demangle, or the input unchanged when demangling reports a
  // non-zero status.
  std::string demangle(const char* name) {
    int rc = -4; // arbitrary initial value; silences an uninitialised warning
    char *raw = abi::__cxa_demangle(name, NULL, NULL, &rc);
    // The demangler returns a malloc'd buffer; release it with std::free
    // when this function returns.
    std::unique_ptr<char, void(*)(void*)> owner(raw, std::free);
    if (rc == 0) {
      return std::string(owner.get());
    }
    return std::string(name);
  }
 // Definitions of Logger's static members.
 GridStopWatch Logger::GlobalStopWatch;
 int Logger::timestamp;
 // Stream constructed with a null stream-buffer pointer; Logger's operator<<
 // returns it when a channel is inactive.
 std::ostream Logger::devnull(0);
 // Sets the Logger-wide timestamp flag by forwarding to Logger::Timestamp.
 void GridLogTimestamp(int on){
  Logger::Timestamp(on);
 }
 // Shared colour table, constructed with colours disabled.
 Colours GridLogColours(0);
 // Log channels. Each is constructed active (first argument 1) with a channel
 // name and a colour key looked up in GridLogColours.
 GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
 GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 // Configures the log channels from a list of stream names.
 //
 // All channels handled here are first reset to their defaults (only Message
 // on, colours off); each recognised name in logstreams then switches its
 // channel. "NoMessage" turns the Message channel off. Unrecognised names
 // are ignored.
 void GridLogConfigure(std::vector<std::string> &logstreams) {
   // Defaults: everything off except plain messages.
   GridLogError.Active(0);
   GridLogWarning.Active(0);
   GridLogMessage.Active(1); // at least the messages should be always on
   GridLogIterative.Active(0);
   GridLogDebug.Active(0);
   GridLogPerformance.Active(0);
   GridLogIntegrator.Active(0);
   GridLogColours.Active(0);

   for (const std::string &stream : logstreams) {
     if      (stream == "Error")       { GridLogError.Active(1); }
     else if (stream == "Warning")     { GridLogWarning.Active(1); }
     else if (stream == "NoMessage")   { GridLogMessage.Active(0); }
     else if (stream == "Iterative")   { GridLogIterative.Active(1); }
     else if (stream == "Debug")       { GridLogDebug.Active(1); }
     else if (stream == "Performance") { GridLogPerformance.Active(1); }
     else if (stream == "Integrator")  { GridLogIntegrator.Active(1); }
     else if (stream == "Colours")     { GridLogColours.Active(1); }
   }
 }
 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
 // Silences std::cout on every process whose rank is non-zero by putting the
 // stream into the bad state. The rank comes from MPI or SHMEM when the
 // corresponding comms macro is defined, and is 0 otherwise.
 void Grid_quiesce_nodes(void) {
   int rank = 0;
 #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 #endif
 #ifdef GRID_COMMS_SHMEM
   rank = shmem_my_pe();
 #endif
   if (rank == 0) {
     return; // rank 0 keeps printing
   }
   std::cout.setstate(std::ios::badbit);
 }
 // Re-enables std::cout after Grid_quiesce_nodes by clearing the stream's
 // error state.
 //
 // The clear is compiled in for every comms back end under which
 // Grid_quiesce_nodes can obtain a non-zero rank (MPI, MPI3, MPIT, SHMEM).
 // Guarding on GRID_COMMS_MPI alone left std::cout permanently muted on
 // non-zero ranks for the other back ends.
 void Grid_unquiesce_nodes(void) {
 #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) || defined(GRID_COMMS_SHMEM)
   std::cout.clear();
 #endif
 }
 }
@@ -0,0 +1,216 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Log.h
    Copyright (C) 2015
    Author: Antonin Portelli <antonin.portelli@me.com>
    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <map>
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
 #ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif
 namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Table mapping colour names to terminal escape sequences. When the table
 // is inactive every known name maps to the empty string, so channel output
 // carries no colour codes.
 class Colours{
 protected:
   bool is_active;
 public:
   std::map<std::string, std::string> colour;

   Colours(bool activate=false){
     Active(activate);
   }

   // (Re)populates the table: escape sequences when activate is true,
   // empty strings otherwise.
   void Active(bool activate){
     static const char *const palette[][2] = {
       {"BLACK" , "\033[30m"},
       {"RED"   , "\033[31m"},
       {"GREEN" , "\033[32m"},
       {"YELLOW", "\033[33m"},
       {"BLUE"  , "\033[34m"},
       {"PURPLE", "\033[35m"},
       {"CYAN"  , "\033[36m"},
       {"WHITE" , "\033[37m"},
       {"NORMAL", "\033[0;39m"},
     };
     is_active = activate;
     for (const auto &entry : palette) {
       colour[entry[0]] = is_active ? entry[1] : "";
     }
   }
 };
 class Logger {
 protected:
  Colours &Painter;
  int active;
  int timing_mode;
  int topWidth{-1}, chanWidth{-1};
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
 public:
  static GridStopWatch GlobalStopWatch;
  GridStopWatch         LocalStopWatch;
  GridStopWatch *StopWatch;
  static std::ostream devnull;
  std::string background() {return Painter.colour["NORMAL"];}
  std::string evidence() {return Painter.colour["YELLOW"];}
  std::string colour() {return Painter.colour[COLOUR];}
  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)  : active(on),
    name(nm),
    topName(topNm),
    Painter(col_class),
    timing_mode(0),
    COLOUR(col) 
    {
      StopWatch = & GlobalStopWatch;
    };
  void Active(int on) {active = on;};
  int  isActive(void) {return active;};
  static void Timestamp(int on) {timestamp = on;};
  void Reset(void) { 
    StopWatch->Reset(); 
    StopWatch->Start(); 
  }
  void TimingMode(int on) { 
    timing_mode = on; 
    if(on) { 
      StopWatch = &LocalStopWatch;
      Reset(); 
    }
  }
  void setTopWidth(const int w) {topWidth = w;}
  void setChanWidth(const int w) {chanWidth = w;}
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
    if ( log.active ) {
      stream << log.background()<<  std::left;
      if (log.topWidth > 0)
      {
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
      stream << log.colour() <<  std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
      }
      stream << log.name << log.background() << " : ";
      if ( log.timestamp ) {
 	log.StopWatch->Stop();
 	GridTime now = log.StopWatch->Elapsed();
 	if ( log.timing_mode==1 ) log.StopWatch->Reset();
 	log.StopWatch->Start();
 	stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
    } else { 
      return devnull;
    }
  }
 };
 // Logger whose top-level name is fixed to "Grid".
 class GridLogger: public Logger {
 public:
   GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL")
     : Logger("Grid", on, nm, col_class, col_key)
   {
   }
 };
 // Resets the log channels and enables those named in logstreams.
 void GridLogConfigure(std::vector<std::string> &logstreams);
 // Log channel objects and the colour table they share.
 extern GridLogger GridLogIRL;
 extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern Colours    GridLogColours;
 // Returns the demangled form of a symbol name, or the input on failure.
 std::string demangle(const char* name) ;
 // Capacity of the buffer that the backtrace macros pass to backtrace().
 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
 // Writes a backtrace of the calling context to the file "backtrace.<rank>",
 // where <rank> is CartesianCommunicator::RankWorld().
 // The file name is built with a bounded snprintf into a buffer large enough
 // for any int value, and the dump is skipped when the file cannot be opened
 // rather than handing a null FILE* to fprintf/fclose.
 #define BACKTRACEFILE() {\
 char string[32];                                                        \
 std::snprintf(string,sizeof(string),"backtrace.%d",CartesianCommunicator::RankWorld()); \
 std::FILE * fp = std::fopen(string,"w");                                \
 if (fp) {\
 BACKTRACEFP(fp)\
 std::fclose(fp);\
 }\
 }
 // BACKTRACEFP(fp): prints a backtrace of the calling context to the FILE* fp.
 // With execinfo available, up to _NBACKTRACE frames are captured into
 // Grid_backtrace_buffer and each symbol string is passed through demangle()
 // before printing.
 // NOTE(review): the array returned by backtrace_symbols is neither checked
 // for NULL nor freed here - confirm whether that is intentional.
 #ifdef HAVE_EXECINFO_H
 #define BACKTRACEFP(fp) { \
 int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
 char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
 for (int i = 0; i < symbols; i++){\
  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
 }\
 }
 #else 
 // Fallback without execinfo: prints the return addresses of the four
 // innermost frames using __builtin_return_address.
 // NOTE(review): the addresses are pointers but are printed with %lx -
 // confirm the format on the target platforms.
 #define BACKTRACEFP(fp) { \
 std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
 std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
 std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
 std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 }
 #endif
 // BACKTRACE(): prints the backtrace to stdout.
 #define BACKTRACE() BACKTRACEFP(stdout) 
 }
 #endif
@@ -0,0 +1,729 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/parallelIO/BinaryIO.h
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu<guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
 #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
 #endif
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
 #endif
 #include <arpa/inet.h>
 #include <algorithm>
 namespace Grid { 
 /////////////////////////////////////////////////////////////////////////////////
 // Byte reversal garbage
 /////////////////////////////////////////////////////////////////////////////////
 // Returns f with the order of its four bytes reversed.
 inline uint32_t byte_reverse32(uint32_t f) {
   const uint32_t byte0 = (f      ) & 0xFFu; // least significant byte
   const uint32_t byte1 = (f >>  8) & 0xFFu;
   const uint32_t byte2 = (f >> 16) & 0xFFu;
   const uint32_t byte3 = (f >> 24) & 0xFFu; // most significant byte
   return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
 }
 // Returns f with the order of its eight bytes reversed.
 inline uint64_t byte_reverse64(uint64_t f) {
   uint64_t reversed = 0;
   // Peel bytes off the low end of f and push them onto the low end of the
   // result, so the first byte taken ends up most significant.
   for (int byte = 0; byte < 8; byte++) {
     reversed = (reversed << 8) | (f & 0xFF);
     f >>= 8;
   }
   return reversed;
 }
 // 64-bit network-to-host conversion: the identity when BYTE_ORDER equals
 // BIG_ENDIAN at preprocessing time, a full byte reversal otherwise.
 inline uint64_t Grid_ntohll(uint64_t A) {
 #if BYTE_ORDER == BIG_ENDIAN
   return A;
 #else
   return byte_reverse64(A);
 #endif
 }
 // A little helper
 // Removes, in place, every character of key that isspace classifies as
 // whitespace.
 inline void removeWhitespace(std::string &key)
 {
   // isspace is only defined for values representable as unsigned char (or
   // EOF). A plain char holding a byte with the high bit set may be negative,
   // so convert explicitly before classifying.
   key.erase(std::remove_if(key.begin(), key.end(),
                            [](char c) { return ::isspace(static_cast<unsigned char>(c)) != 0; }),
             key.end());
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Static class holding the parallel IO code
 // Could just use a namespace
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
  /////////////////////////////////////////////////////////////////////////////
  // more byte manipulation helpers
  /////////////////////////////////////////////////////////////////////////////
  // Adds the NERSC 32-bit word-sum checksum of a lattice field to nersc_csum.
  // The field is first unpacked into one scalar object per local site by
  // unvectorizeToLexOrdArray, then summed by NerscChecksum.
  template<class vobj>
  static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
  {
    typedef typename vobj::scalar_object sobj;
    GridBase *grid = lat._grid;
    const uint64_t nsites = grid->lSites();
    std::vector<sobj> unpacked(nsites);
    unvectorizeToLexOrdArray(unpacked,lat);
    NerscChecksum(grid,unpacked,nersc_csum);
  }
  // Adds the sum of all 32-bit words of the site buffer to nersc_csum.
  //
  //   grid       - supplies the number of local sites to sum
  //   fbuf       - site data; a single-element buffer is summed as one site
  //   nersc_csum - in/out accumulator (the caller chooses the initial value)
  //
  // Each thread sums into a private partial, which is then added to
  // nersc_csum inside a critical section.
  template <class fobj>
  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
  {
    const uint64_t words_per_site = sizeof(fobj) / sizeof(uint32_t);
    const uint64_t grid_sites = grid->lSites();
    const uint64_t nsites = (fbuf.size() == 1) ? uint64_t(1) : grid_sites;
 PARALLEL_REGION
    {
      uint32_t partial = 0;
 PARALLEL_FOR_LOOP_INTERN
      for (uint64_t site = 0; site < nsites; site++)
      {
        const uint32_t *words = (const uint32_t *)&fbuf[site];
        for (uint64_t w = 0; w < words_per_site; w++)
        {
          partial += words[w];
        }
      }
 PARALLEL_CRITICAL
      {
        nersc_csum += partial;
      }
    }
  }
  // Accumulates the two SciDAC checksums of a buffer of per-site objects.
  //
  // For every local site the CRC32 of the raw site bytes is computed, rotated
  // left by (global lexicographic site index mod 29) and by (index mod 31),
  // and XOR-ed into scidac_csuma and scidac_csumb respectively.
  //
  //   grid         - supplies dimensionality, local/global extents and offsets
  //   fbuf         - site data; a single-element buffer is treated as one site
  //   scidac_csuma - in/out, XOR-accumulated checksum using the mod-29 rotation
  //   scidac_csumb - in/out, XOR-accumulated checksum using the mod-31 rotation
  //
  // Each thread accumulates private partial checksums which are merged inside
  // a critical section.
  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
  {
    int nd = grid->_ndimension;
    uint64_t lsites              =grid->lSites();
    if (fbuf.size()==1) {
      lsites=1;
    }
    std::vector<int> local_vol   =grid->LocalDimensions();
    std::vector<int> local_start =grid->LocalStarts();
    std::vector<int> global_vol  =grid->FullDimensions();
 PARALLEL_REGION
    { 
      std::vector<int> coor(nd);
      uint32_t scidac_csuma_thr=0;
      uint32_t scidac_csumb_thr=0;
      uint32_t site_crc=0;
 PARALLEL_FOR_LOOP_INTERN
      for(uint64_t local_site=0;local_site<lsites;local_site++){
        uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
        /* 
         * Scidac csum  is rather more heavyweight
         * FIXME -- 128^3 x 256 x 16 will overflow.
         */
        int global_site;
        // Local lexicographic index -> local coordinate -> global coordinate
        // -> global lexicographic index.
        Lexicographic::CoorFromIndex(coor,local_site,local_vol);
        for(int d=0;d<nd;d++) {
          coor[d] = coor[d]+local_start[d];
        }
        Lexicographic::IndexFromCoor(coor,global_site,global_vol);
        uint32_t gsite29   = global_site%29;
        uint32_t gsite31   = global_site%31;
        site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
        //	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
        //	std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
        // Rotate left by the site-dependent amount. The right-shift count is
        // reduced modulo 32 so that a rotation of 0 shifts by 0 rather than
        // by 32, which is undefined for a 32-bit operand.
        scidac_csuma_thr ^= (site_crc<<gsite29) | (site_crc>>((32-gsite29)&31));
        scidac_csumb_thr ^= (site_crc<<gsite31) | (site_crc>>((32-gsite31)&31));
      }
 PARALLEL_CRITICAL
      {
        scidac_csuma^= scidac_csuma_thr;
        scidac_csumb^= scidac_csumb_thr;
      }
    }
  }
  // Network is big endian
  // Host-to-file byte-order conversions, applied in place to `bytes` bytes
  // starting at file_object. Each forwards to the matching file-to-host
  // routine. The byte count is 64-bit, matching the routines forwarded to,
  // so buffers of 4 GiB or more are no longer truncated to 32 bits.
  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
  static inline void be32toh_v(void *file_object,uint64_t bytes)
  {
    uint32_t * f = (uint32_t *)file_object;
    uint64_t count = bytes/sizeof(uint32_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      f[i] = ntohl(f[i]);
    }
  }
  // LE must Swap and switch to host
  // Converts, in place, the 32-bit words of a buffer from little-endian file
  // order to host order. Trailing bytes that do not fill a whole word are
  // left untouched.
  static inline void le32toh_v(void *file_object,uint64_t bytes)
  {
    uint32_t *fp = (uint32_t *)file_object;
    uint64_t count = bytes/sizeof(uint32_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      // The temporary is declared inside the loop body so each thread has
      // its own copy; declared outside the parallel loop it would be shared
      // and raced on.
      uint32_t f = fp[i];
      // Reverse the bytes to get network order, then convert network to host.
      f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
      fp[i] = ntohl(f);
    }
  }
  // BE is same as network
  static inline void be64toh_v(void *file_object,uint64_t bytes)
  {
    uint64_t * f = (uint64_t *)file_object;
    uint64_t count = bytes/sizeof(uint64_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      f[i] = Grid_ntohll(f[i]);
    }
  }
  // LE must swap and switch;
  // Converts, in place, the 64-bit words of a buffer from little-endian file
  // order to host order. Trailing bytes that do not fill a whole word are
  // left untouched.
  static inline void le64toh_v(void *file_object,uint64_t bytes)
  {
    uint64_t *fp = (uint64_t *)file_object;
    uint64_t count = bytes/sizeof(uint64_t);
    parallel_for(uint64_t i=0;i<count;i++){  
      // Temporaries are declared inside the loop body so each thread has its
      // own copies; declared outside the parallel loop they would be shared
      // and raced on.
      uint64_t f = fp[i];
      uint64_t g;
      // Reverse the bytes to get network order: the reversed low half
      // becomes the high half and vice versa.
      g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
      g = g << 32;
      f = f >> 32;
      g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
      // Then convert network order to host order.
      fp[i] = Grid_ntohll(g);
    }
  }
  /////////////////////////////////////////////////////////////////////////////
  // Real action:
  // Read or Write distributed lexico array of ANY object to a specific location in file 
  //////////////////////////////////////////////////////////////////////////////////////
  // Bit flags combined into the `control` argument of IOobject.
  // MASTER_APPEND: IOobject requires a single-element buffer; the C++ read
  // path seeks to one object before the end of the file.
  static const int BINARYIO_MASTER_APPEND = 0x10;
  // UNORDERED: not referenced in the portion of IOobject reviewed here.
  static const int BINARYIO_UNORDERED     = 0x08;
  // LEXICOGRAPHIC: with more than one rank, selects the MPI-IO path.
  static const int BINARYIO_LEXICOGRAPHIC = 0x04;
  // READ / WRITE: select the read and write branches of IOobject.
  static const int BINARYIO_READ          = 0x02;
  static const int BINARYIO_WRITE         = 0x01;
  template<class word,class fobj>
  static inline void IOobject(word w,
 			      GridBase *grid,
 			      std::vector<fobj> &iodata,
 			      std::string file,
 			      uint64_t& offset,
 			      const std::string &format, int control,
 			      uint32_t &nersc_csum,
 			      uint32_t &scidac_csuma,
 			      uint32_t &scidac_csumb)
  {
    grid->Barrier();
    GridStopWatch timer; 
    GridStopWatch bstimer;
    nersc_csum=0;
    scidac_csuma=0;
    scidac_csumb=0;
    int ndim                 = grid->Dimensions();
    int nrank                = grid->ProcessorCount();
    int myrank               = grid->ThisRank();
    std::vector<int>  psizes = grid->ProcessorGrid(); 
    std::vector<int>  pcoor  = grid->ThisProcessorCoor();
    std::vector<int> gLattice= grid->GlobalDimensions();
    std::vector<int> lLattice= grid->LocalDimensions();
    std::vector<int> lStart(ndim);
    std::vector<int> gStart(ndim);
    // Flatten the file
    uint64_t lsites = grid->lSites();
    if ( control & BINARYIO_MASTER_APPEND )  {
      assert(iodata.size()==1);
    } else {
      assert(lsites==iodata.size());
    }
    for(int d=0;d<ndim;d++){
      gStart[d] = lLattice[d]*pcoor[d];
      lStart[d] = 0;
    }
 #ifdef USE_MPI_IO
    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
    MPI_Datatype mpiObject;
    MPI_Datatype fileArray;
    MPI_Datatype localArray;
    MPI_Datatype mpiword;
    MPI_Offset disp = offset;
    MPI_File fh ;
    MPI_Status status;
    int numword;
    if ( sizeof( word ) == sizeof(float ) ) {
      numword = sizeof(fobj)/sizeof(float);
      mpiword = MPI_FLOAT;
    } else {
      numword = sizeof(fobj)/sizeof(double);
      mpiword = MPI_DOUBLE;
    }
    //////////////////////////////////////////////////////////////////////////////
    // Sobj in MPI phrasing
    //////////////////////////////////////////////////////////////////////////////
    int ierr;
    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    assert(ierr==0);
    ierr = MPI_Type_commit(&mpiObject);
    //////////////////////////////////////////////////////////////////////////////
    // File global array data type
    //////////////////////////////////////////////////////////////////////////////
    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    assert(ierr==0);
    ierr=MPI_Type_commit(&fileArray);    assert(ierr==0);
    //////////////////////////////////////////////////////////////////////////////
    // local lattice array
    //////////////////////////////////////////////////////////////////////////////
    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
    ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
 #endif
    //////////////////////////////////////////////////////////////////////////////
    // Byte order
    //////////////////////////////////////////////////////////////////////////////
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
    int ieee64    = (format == std::string("IEEE64"));
    //////////////////////////////////////////////////////////////////////////////
    // Do the I/O
    //////////////////////////////////////////////////////////////////////////////
    if ( control & BINARYIO_READ ) { 
      timer.Start();
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
 	std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
 	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
 	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
 	MPI_File_close(&fh);
 	MPI_Type_free(&fileArray);
 	MPI_Type_free(&localArray);
 #else 
 	assert(0);
 #endif
      } else {
 	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
        std::ifstream fin;
 	fin.open(file, std::ios::binary | std::ios::in);
        if (control & BINARYIO_MASTER_APPEND)
        {
          fin.seekg(-sizeof(fobj), fin.end);
        }
        else
        {
          fin.seekg(offset + myrank * lsites * sizeof(fobj));
        }
        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
        assert(fin.fail() == 0);
        fin.close();
      }
      timer.Stop();
      grid->Barrier();
      bstimer.Start();
      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      NerscChecksum(grid,iodata,nersc_csum);
      bstimer.Stop();
    }
    if ( control & BINARYIO_WRITE ) { 
      bstimer.Start();
      NerscChecksum(grid,iodata,nersc_csum);
      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
      bstimer.Stop();
      grid->Barrier();
      timer.Start();
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
        std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 	//        std::cout << GridLogMessage << "Checking for errors" << std::endl;
        if (ierr != MPI_SUCCESS)
        {
          char error_string[BUFSIZ];
          int length_of_error_string, error_class;
          MPI_Error_class(ierr, &error_class);
          MPI_Error_string(error_class, error_string, &length_of_error_string);
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
          MPI_Error_string(ierr, error_string, &length_of_error_string);
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
        }
        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
        assert(ierr == 0);
        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
        assert(ierr == 0);
        MPI_Offset os;
        MPI_File_get_position(fh, &os);
        MPI_File_get_byte_offset(fh, os, &disp);
        offset = disp;
        MPI_File_close(&fh);
        MPI_Type_free(&fileArray);
        MPI_Type_free(&localArray);
 #else 
 	assert(0);
 #endif
      } else { 
        std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
 	std::ofstream fout; 
 	fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
 	try {
 	  if (offset) { // Must already exist and contain data
 	    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 	  } else {     // Allow create
 	    fout.open(file,std::ios::binary|std::ios::out);
 	  }
 	} catch (const std::fstream::failure& exc) {
 	  std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
 	  std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
 	  //	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
 #ifdef USE_MPI_IO
 	  MPI_Abort(MPI_COMM_WORLD,1);
 #else
 	  exit(1);
 #endif
 	}
 	if ( control & BINARYIO_MASTER_APPEND )  {
 	  try {
 	    fout.seekp(0,fout.end);
 	  } catch (const std::fstream::failure& exc) {
 	    std::cout << "Exception in seeking file end " << file << std::endl;
 	  }
 	} else {
 	  try { 
 	    fout.seekp(offset+myrank*lsites*sizeof(fobj));
 	  } catch (const std::fstream::failure& exc) {
 	    std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
 	  }
 	}
 	try {
 	  fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
 	}
 	catch (const std::fstream::failure& exc) {
 	  std::cout << "Exception in writing file " << file << std::endl;
 	  std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
 #ifdef USE_MPI_IO
 	  MPI_Abort(MPI_COMM_WORLD,1);
 #else
 	  exit(1);
 #endif
 	}
  offset  = fout.tellp();
 	fout.close();
      }
      timer.Stop();
    }
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
    //////////////////////////////////////////////////////////////////////////////
    // Safety check
    //////////////////////////////////////////////////////////////////////////////
    // if the data size is 1 we do not want to sum over the MPI ranks
    if (iodata.size() != 1){
      grid->Barrier();
      grid->GlobalSum(nersc_csum);
      grid->GlobalXOR(scidac_csuma);
      grid->GlobalXOR(scidac_csumb);
      grid->Barrier();
    }
  }
  /////////////////////////////////////////////////////////////////////////////
  // Read a Lattice of object
  //////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class fobj,class munger>
  static inline void readLatticeObject(Lattice<vobj> &Umu,
 				       std::string file,
 				       munger munge,
 				       uint64_t offset,
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
 				       uint32_t &scidac_csumb)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
    uint64_t lsites = grid->lSites();
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);
    GridStopWatch timer; 
    timer.Start();
    parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
    vectorizeFromLexOrdArray(scalardata,Umu);    
    grid->Barrier();
    timer.Stop();
    std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<timer.Elapsed()  <<std::endl;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Write a Lattice of object
  //////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class fobj,class munger>
    static inline void writeLatticeObject(Lattice<vobj> &Umu,
 					  std::string file,
 					  munger munge,
 					  uint64_t offset,
 					  const std::string &format,
 					  uint32_t &nersc_csum,
 					  uint32_t &scidac_csuma,
 					  uint32_t &scidac_csumb)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
    uint64_t lsites = grid->lSites();
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
    //////////////////////////////////////////////////////////////////////////////
    // Munge [ .e.g 3rd row recon ]
    //////////////////////////////////////////////////////////////////////////////
    GridStopWatch timer; timer.Start();
    unvectorizeToLexOrdArray(scalardata,Umu);    
    parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
    grid->Barrier();
    timer.Stop();
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);
    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Read a RNG;  use IOobject and lexico map to an array of state 
  //////////////////////////////////////////////////////////////////////////////////////
  static inline void readRNG(GridSerialRNG &serial,
 			     GridParallelRNG &parallel,
 			     std::string file,
 			     uint64_t offset,
 			     uint32_t &nersc_csum,
 			     uint32_t &scidac_csuma,
 			     uint32_t &scidac_csumb)
  {
    typedef typename GridSerialRNG::RngStateType RngStateType;
    const int RngStateCount = GridSerialRNG::RngStateCount;
    typedef std::array<RngStateType,RngStateCount> RNGstate;
    typedef RngStateType word;    word w=0;
    std::string format = "IEEE32BIG";
    GridBase *grid = parallel._grid;
    uint64_t gsites = grid->gSites();
    uint64_t lsites = grid->lSites();
    uint32_t nersc_csum_tmp   = 0;
    uint32_t scidac_csuma_tmp = 0;
    uint32_t scidac_csumb_tmp = 0;
    GridStopWatch timer;
    std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
    std::vector<RNGstate> iodata(lsites);
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);
    timer.Start();
    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
      parallel.SetState(tmp,lidx);
    }
    timer.Stop();
    iodata.resize(1);
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
 	     nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
    {
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
      serial.SetState(tmp,0);
    }
    nersc_csum   = nersc_csum   + nersc_csum_tmp;
    scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
    scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
  }
  /////////////////////////////////////////////////////////////////////////////
  // Write a RNG; lexico map to an array of state and use IOobject
  //////////////////////////////////////////////////////////////////////////////////////
  static inline void writeRNG(GridSerialRNG &serial,
 			      GridParallelRNG &parallel,
 			      std::string file,
 			      uint64_t offset,
 			      uint32_t &nersc_csum,
 			      uint32_t &scidac_csuma,
 			      uint32_t &scidac_csumb)
  {
    typedef typename GridSerialRNG::RngStateType RngStateType;
    typedef RngStateType word; word w=0;
    const int RngStateCount = GridSerialRNG::RngStateCount;
    typedef std::array<RngStateType,RngStateCount> RNGstate;
    GridBase *grid = parallel._grid;
    uint64_t gsites = grid->gSites();
    uint64_t lsites = grid->lSites();
    uint32_t nersc_csum_tmp;
    uint32_t scidac_csuma_tmp;
    uint32_t scidac_csumb_tmp;
    GridStopWatch timer;
    std::string format = "IEEE32BIG";
    std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
    timer.Start();
    std::vector<RNGstate> iodata(lsites);
    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
      std::vector<RngStateType> tmp(RngStateCount);
      parallel.GetState(tmp,lidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
    }
    timer.Stop();
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);
    iodata.resize(1);
    {
      std::vector<RngStateType> tmp(RngStateCount);
      serial.GetState(tmp,0);
      std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
    }
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
 	     nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
    nersc_csum   = nersc_csum   + nersc_csum_tmp;
    scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
    scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
    std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum    << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
  }
 };
 }
 #endif
@@ -0,0 +1,875 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/parallelIO/IldgIO.h
 Copyright (C) 2015
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_ILDG_IO_H
 #define GRID_ILDG_IO_H
 #ifdef HAVE_LIME
 #include <algorithm>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <map>
 #include <pwd.h>
 #include <sys/utsname.h>
 #include <unistd.h>
 //C-Lime is a must have for this functionality
 extern "C" {  
 #include "lime.h"
 }
 namespace Grid {
 namespace QCD {
  /////////////////////////////////
  // Encode word types as strings
  /////////////////////////////////
 // Short tag naming a scalar word type; the primary template is the
 // fallback for any type without a specialisation below.
 template<class word> inline std::string ScidacWordMnemonic(void) { return "unknown"; }
 // Floating-point words.
 template<> inline std::string ScidacWordMnemonic<double>(void)   { return "D"; }
 template<> inline std::string ScidacWordMnemonic<float>(void)    { return "F"; }
 // Fixed-width integer words, 32 bit then 64 bit.
 template<> inline std::string ScidacWordMnemonic<int32_t>(void)  { return "I32_t"; }
 template<> inline std::string ScidacWordMnemonic<uint32_t>(void) { return "U32_t"; }
 template<> inline std::string ScidacWordMnemonic<int64_t>(void)  { return "I64_t"; }
 template<> inline std::string ScidacWordMnemonic<uint64_t>(void) { return "U64_t"; }
  /////////////////////////////////////////
  // Encode a generic tensor as a string
  /////////////////////////////////////////
 // Compose the datatype string for a tensor type vobj and report its layout.
 //
 // The string is "GRID_" + word mnemonic, followed by a "_<Index>Vector<N>"
 // or "_<Index>Matrix<N>" tag for each of Lorentz, Spin and Colour that is a
 // vector or matrix, or "_Complex" when all three indices are scalar.
 // Outputs: colors / spins / datacount receive the Colour / Spin / Lorentz
 // index ranks; typesize receives sizeof(scalar_type) scaled by the colour
 // and spin extents (squared for matrices).
 template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
   typedef typename getPrecision<vobj>::real_scalar_type stype;
   // Index-structure queries, one group per index kind.
   const int nColour         = indexRank<ColourIndex,vobj>();
   const int colourIsScalar  =  isScalar<ColourIndex,vobj>();
   const int colourIsVector  =  isVector<ColourIndex,vobj>();
   const int colourIsMatrix  =  isMatrix<ColourIndex,vobj>();
   const int nSpin           = indexRank<SpinIndex,vobj>();
   const int spinIsScalar    =  isScalar<SpinIndex,vobj>();
   const int spinIsVector    =  isVector<SpinIndex,vobj>();
   const int spinIsMatrix    =  isMatrix<SpinIndex,vobj>();
   const int nLorentz        = indexRank<LorentzIndex,vobj>();
   const int lorentzIsScalar =  isScalar<LorentzIndex,vobj>();
   const int lorentzIsVector =  isVector<LorentzIndex,vobj>();
   const int lorentzIsMatrix =  isMatrix<LorentzIndex,vobj>();
   // Assemble the name: prefix, word tag, then Lorentz / Spin / Colour tags.
   std::stringstream name;
   name << "GRID_" << ScidacWordMnemonic<stype>();
   if ( lorentzIsVector ) { name << "_LorentzVector" << nLorentz; }
   if ( lorentzIsMatrix ) { name << "_LorentzMatrix" << nLorentz; }
   if ( spinIsVector )    { name << "_SpinVector"    << nSpin;    }
   if ( spinIsMatrix )    { name << "_SpinMatrix"    << nSpin;    }
   if ( colourIsVector )  { name << "_ColourVector"  << nColour;  }
   if ( colourIsMatrix )  { name << "_ColourMatrix"  << nColour;  }
   const bool allScalar = colourIsScalar && lorentzIsScalar && spinIsScalar;
   if ( allScalar )       { name << "_Complex"; }
   // Byte count: one scalar, times the colour extent, times the spin extent.
   int bytes = sizeof(typename vobj::scalar_type);
   bytes *= colourIsMatrix ? nColour*nColour : nColour;
   bytes *= spinIsMatrix   ? nSpin*nSpin     : nSpin;
   typesize  = bytes;
   colors    = nColour;
   spins     = nSpin;
   datacount = nLorentz;
   return name.str();
 }
 // Convenience overload: `lat` is used only to deduce vobj; all work is
 // forwarded to the explicit-template version.
 template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) { 
   return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
 }
 ////////////////////////////////////////////////////////////
 // Helper to fill out metadata
 ////////////////////////////////////////////////////////////
 // Fill the three metadata structures that accompany a field record:
 //   header        <- PrepareMetaData(field, header)
 //   _scidacFile   <- scidacFile built from the field's grid
 //   _scidacRecord <- datatype string, layout counts, creation date taken
 //                    from header, precision mnemonic and GRID_IO_FIELD tag
 template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
                                          FieldMetaData &header,
                                          scidacRecord & _scidacRecord,
                                          scidacFile   & _scidacFile) 
 {
   typedef typename getPrecision<vobj>::real_scalar_type stype;
   // Grid's own metadata; must come first because the record date is read from it.
   PrepareMetaData(field,header);
   // SciDAC private file structure.
   _scidacFile = scidacFile(field._grid);
   // SciDAC private record structure, assembled locally and then copied out.
   scidacRecord record;
   record.datatype   = ScidacRecordTypeString(field,record.colors,record.spins,record.typesize,record.datacount);
   record.date       = header.creation_date;
   record.precision  = ScidacWordMnemonic<stype>();
   record.recordtype = GRID_IO_FIELD;
   _scidacRecord = record;
 }
 ///////////////////////////////////////////////////////
 // Scidac checksum
 ///////////////////////////////////////////////////////
 // Compare computed SciDAC checksums against the ones stored in a checksum
 // record. The stored values are parsed as base-16 text and narrowed to
 // 32 bits. Returns 1 when both pairs agree, 0 otherwise.
 static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
 {
   const uint32_t storedA = stoull(scidacChecksum_.suma,0,16);
   const uint32_t storedB = stoull(scidacChecksum_.sumb,0,16);
   const bool bothMatch = (scidac_csuma == storedA) && (scidac_csumb == storedB);
   return bothMatch ? 1 : 0;
 }
 ////////////////////////////////////////////////////////////////////////////////////
 // Lime, ILDG and Scidac I/O classes
 ////////////////////////////////////////////////////////////////////////////////////
 // Sequential reader for a LIME-framed file. Records are visited strictly in
 // file order through limeReaderNextRecord, so every read*/ call below
 // consumes records up to and including the one it matches.
 class GridLimeReader : public BinaryIO {
 public:
   ///////////////////////////////////////////////////
   // FIXME: format for RNG? Now just binary out instead
   ///////////////////////////////////////////////////
   // stdio handle and LIME reader created by open(); name of the open file.
   FILE       *File;
   LimeReader *LimeR;
   std::string filename;
   /////////////////////////////////////////////
   // Open the file
   // Opens _filename with fopen mode "r"; prints a message to std::cerr and
   // aborts if that fails, otherwise creates the LIME reader on the handle.
   /////////////////////////////////////////////
   void open(const std::string &_filename) 
   {
     filename= _filename;
     File = fopen(filename.c_str(), "r");
     if (File == nullptr)
     {
       std::cerr << "cannot open file '" << filename << "'" << std::endl;
       abort();
     }
     LimeR = limeCreateReader(File);
   }
   /////////////////////////////////////////////
   // Close the file
   // Only the stdio handle is closed; the limeDestroyReader call is
   // commented out, so LimeR is not released here.
   /////////////////////////////////////////////
   void close(void){
     fclose(File);
     //     limeDestroyReader(LimeR);
   }
  ////////////////////////////////////////////
  // Read a generic lattice field and verify checksum
  //
  // Advances through the records until one whose type starts with
  // record_name (prefix comparison over strlen(record_name) characters).
  // For that record: asserts its byte count equals sizeof(sobj) * global
  // sites, reads the payload with BinaryIO::readLatticeObject from the
  // current FILE offset, then reads a following scidacChecksum record and
  // asserts it agrees with the checksums computed during the read.
  // If no record matches, the loop ends and the function returns without
  // touching field.
  ////////////////////////////////////////////
  template<class vobj>
  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    std::string format = getFormatString<vobj>();
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      uint64_t file_bytes =limeReaderBytes(LimeR);
      //      std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
      //      std::cout << GridLogMessage<< " readLimeObject seeking "<<  record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 	//	std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
 	uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
 	//	std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
 	//	std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
 	//	std::cout << "R Payload expected " <<PayloadSize<<std::endl;
 	//	std::cout << "R file size " <<file_bytes <<std::endl;
 	assert(PayloadSize == file_bytes);// Must match or user error
 	// The payload is read by readLatticeObject, which is given the file
 	// name and this offset rather than the FILE handle.
 	uint64_t offset= ftello(File);
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
 	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
 	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
 	/////////////////////////////////////////////
 	// Verify checksums
 	// NOTE(review): the verification sits inside assert(), so it is not
 	// performed when assertions are compiled out.
 	/////////////////////////////////////////////
 	assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
 	return;
      }
    }
  }
  ////////////////////////////////////////////
  // Read a generic serialisable object
  //
  // This overload returns the raw text of the next record whose type starts
  // with record_name in xmlstring. Reaching the end of the records without a
  // match hits assert(0).
  ////////////////////////////////////////////
  void readLimeObject(std::string &xmlstring,std::string record_name)
  {
    // should this be a do while; can we miss a first record??
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      //      std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 	//	std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
 	// One extra zero byte so the buffer is always NUL-terminated.
 	std::vector<char> xmlc(nbytes+1,'\0');
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
   xmlstring = std::string(&xmlc[0]);
 	return;
      }
    }  
    assert(0);
  }
  // Typed overload: fetches the record text with the overload above, then
  // deserialises `object` from it under the name object_name via XmlReader.
  template<class serialisable_object>
  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
  {
    std::string xmlstring;
    readLimeObject(xmlstring, record_name);
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,object_name,object);
  }
 };
 class GridLimeWriter : public BinaryIO 
 {
 public:
   ///////////////////////////////////////////////////
   // FIXME: format for RNG? Now just binary out instead
   // FIXME: collective calls or not ?
   //      : must know if I am the I/O boss
   ///////////////////////////////////////////////////
   FILE       *File;
   LimeWriter *LimeW;
   std::string filename;
   bool        boss_node;
   GridLimeWriter( bool isboss = true) {
     boss_node = isboss;
   }
   void open(const std::string &_filename) { 
     filename= _filename;
     if ( boss_node ) {
       File = fopen(filename.c_str(), "w");
       LimeW = limeCreateWriter(File); assert(LimeW != NULL );
     }
   }
   /////////////////////////////////////////////
   // Close the file
   /////////////////////////////////////////////
   void close(void) {
     if ( boss_node ) {
       fclose(File);
     }
     //  limeDestroyWriter(LimeW);
   }
  ///////////////////////////////////////////////////////
  // Lime utility functions
  ///////////////////////////////////////////////////////
  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
  {
    if ( boss_node ) {
      LimeRecordHeader *h;
      h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
      assert(limeWriteRecordHeader(h, LimeW) >= 0);
      limeDestroyHeader(h);
    }
    return LIME_SUCCESS;
  }
  ////////////////////////////////////////////
  // Write a generic serialisable object
  ////////////////////////////////////////////
  void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
  {
    if ( boss_node ) {
      std::string xmlstring = writer.docString();
      //    std::cout << "WriteLimeObject" << record_name <<std::endl;
      uint64_t nbytes = xmlstring.size();
      //    std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
      int err;
      LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
      assert(h!= NULL);
      err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
      err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
      err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
      limeDestroyHeader(h);
    }
  }
  template<class serialisable_object>
  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
  {
    XmlWriter WR("","");
    if (scientificPrec)
    {
      WR.scientificFormat(true);
      WR.setPrecision(scientificPrec);
    }
    write(WR,object_name,object);
    writeLimeObject(MB, ME, WR, object_name, record_name);
  }
  ////////////////////////////////////////////////////
  // Write a generic lattice field and csum
  // This routine is Collectively called by all nodes
  // in communicator used by the field._grid
  ////////////////////////////////////////////////////
  template<class vobj>
  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in the
    // the same file through different file handles (integer units).
    // 
    // These are both buffered, so why I think this code is right is as follows.
    //
    // i)  write record header to FILE *File, telegraphing the size; flush
    // ii) ftello reads the offset from FILE *File . 
    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
    //      Closes iostream and flushes.
    // iv) fseek on FILE * to end of this disjoint section.
    //  v) Continue writing scidac record.
    ////////////////////////////////////////////////////////////////////
    GridBase *grid = field._grid;
    assert(boss_node == field._grid->IsBoss() );
    ////////////////////////////////////////////
    // Create record header
    ////////////////////////////////////////////
    typedef typename vobj::scalar_object sobj;
    int err;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
    if ( boss_node ) {
      createLimeRecordHeader(record_name, 0, 0, PayloadSize);
      fflush(File);
    }
    //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl;
    //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl;
    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;
    ////////////////////////////////////////////////
    // Check all nodes agree on file position
    ////////////////////////////////////////////////
    uint64_t offset1;
    if ( boss_node ) {
      offset1 = ftello(File);    
    }
    grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
    ///////////////////////////////////////////
    // The above is collective. Write by other means into the binary record
    ///////////////////////////////////////////
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
    ///////////////////////////////////////////
    // Wind forward and close the record
    ///////////////////////////////////////////
    if ( boss_node ) {
      fseek(File,0,SEEK_END);             
      uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl;
      assert( (offset2-offset1) == PayloadSize);
    }
    /////////////////////////////////////////////////////////////
    // Check MPI-2 I/O did what we expect to file
    /////////////////////////////////////////////////////////////
    if ( boss_node ) { 
      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
    }
    ////////////////////////////////////////
    // Write checksum element, propagaing forward from the BinaryIO
    // Always pair a checksum with a binary object, and close message
    ////////////////////////////////////////
    scidacChecksum checksum;
    std::stringstream streama; streama << std::hex << scidac_csuma;
    std::stringstream streamb; streamb << std::hex << scidac_csumb;
    checksum.suma= streama.str();
    checksum.sumb= streamb.str();
    if ( boss_node ) { 
      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
    }
  }
 };
 class ScidacWriter : public GridLimeWriter {
 public:
  ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss)  { };
  template<class SerialisableUserFile>
  void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
  {
    scidacFile    _scidacFile(grid);
    if ( this->boss_node ) {
      writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
      writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
    }
  }
  ////////////////////////////////////////////////
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
                              const unsigned int recordScientificPrec = 0) 
  {
    GridBase * grid = field._grid;
    ////////////////////////////////////////
    // fill the Grid header
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;
    ScidacMetaData(field,header,_scidacRecord,_scidacFile);
    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    if ( this->boss_node ) {
      writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
      writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    }
    // Collective call
    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
  }
 };
 // Reader for the record layout produced by the Scidac writer; all LIME access
 // goes through members inherited from GridLimeReader (LimeR, readLimeObject, ...).
 class ScidacReader : public GridLimeReader {
 public:
   // Reads the two file-level XML records, in this order:
   // SCIDAC_PRIVATE_FILE_XML (into a local scidacFile that is then discarded)
   // and SCIDAC_FILE_XML (into _userFile).
   template<class SerialisableUserFile>
   void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
   {
     scidacFile    _scidacFile(grid);
     readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
     readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
   }
  ////////////////////////////////////////////////
  // Read generic lattice field in scidac format
  ////////////////////////////////////////////////
  // Reads, in order: the Grid FieldMetaData record, the user record (into
  // _userRecord), the scidac private record, and the binary payload (into field).
  // The header and private record are read into locals and not returned.
  template <class vobj, class userRecord>
  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
  {
    typedef typename vobj::scalar_object sobj; // NOTE(review): unused below
    GridBase * grid = field._grid;             // NOTE(review): unused below
    ////////////////////////////////////////
    // Holders for the metadata records
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;                 // NOTE(review): unused below
    //////////////////////////////////////////////
    // Read the Lime file record by record
    //////////////////////////////////////////////
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
  }
  // Advances the reader until a record whose type starts with ILDG_BINARY_DATA
  // is reached, then continues past the following SCIDAC_CHECKSUM record.
  // Returns silently if the reader runs out of records first.
  void skipPastBinaryRecord(void) {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      // strncmp over strlen(rec_name): this is a prefix match on the record type
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
    }    
  }
  // Advances the reader record by record until one whose type starts with
  // rec_name is reached (prefix match), or no further record is available.
  void skipPastObjectRecord(std::string rec_name) {
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 	return;
      }
    }
  }
  // Skips one complete field record, using the same record order that
  // readScidacFieldRecord reads: GRID_FORMAT, SCIDAC_RECORD_XML,
  // SCIDAC_PRIVATE_RECORD_XML, then binary data plus checksum.
  void skipScidacFieldRecord() {
    skipPastObjectRecord(std::string(GRID_FORMAT));
    skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
    skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
    skipPastBinaryRecord();
  }
 };
 class IldgWriter : public ScidacWriter {
 public:
  IldgWriter(bool isboss) : ScidacWriter(isboss) {};
  ///////////////////////////////////
  // A little helper
  ///////////////////////////////////
  void writeLimeIldgLFN(std::string &LFN)
  {
    uint64_t PayloadSize = LFN.size();
    int err;
    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
    err=limeWriterCloseRecord(LimeW); assert(err>=0);
  }
  ////////////////////////////////////////////////////////////////
  // Special ILDG operations ; gauge configs only.
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  ////////////////////////////////////////////////////////////////
  template <class vsimd>
  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
  {
    GridBase * grid = Umu._grid;
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef iLorentzColourMatrix<vsimd> vobj;
    typedef typename vobj::scalar_object sobj;
    ////////////////////////////////////////
    // fill the Grid header
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;
    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
    std::string format = header.floating_point;
    header.ensemble_id    = description;
    header.ensemble_label = description;
    header.sequence_number = sequence;
    header.ildg_lfn = LFN;
    assert ( (format == std::string("IEEE32BIG"))  
           ||(format == std::string("IEEE64BIG")) );
    //////////////////////////////////////////////////////
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
    ildgfmt.field     = std::string("su3gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
    } else { 
      ildgfmt.precision = 64;
    }
    ildgfmt.version = 1.0;
    ildgfmt.lx = header.dimension[0];
    ildgfmt.ly = header.dimension[1];
    ildgfmt.lz = header.dimension[2];
    ildgfmt.lt = header.dimension[3];
    assert(header.nd==4);
    assert(header.nd==header.dimension.size());
    //////////////////////////////////////////////////////////////////////////////
    // Fill the USQCD info field
    //////////////////////////////////////////////////////////////////////////////
    usqcdInfo info;
    info.version=1.0;
    info.plaq   = header.plaquette;
    info.linktr = header.link_trace;
    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT)); // rec
    writeLimeIldgLFN(header.ildg_lfn);                                                 // rec
    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
    //    limeDestroyWriter(LimeW);
  }
 };
 class IldgReader : public GridLimeReader {
 public:
  ////////////////////////////////////////////////////////////////
  // Read either Grid/SciDAC/ILDG configuration
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  // Else use ILDG MetaData object if present.
  // Else use SciDAC MetaData object if present.
  ////////////////////////////////////////////////////////////////
  template <class vsimd>
  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef typename GaugeField::vector_object  vobj;
    typedef typename vobj::scalar_object sobj;
    typedef LorentzColourMatrixF fobj;
    typedef LorentzColourMatrixD dobj;
    GridBase *grid = Umu._grid;
    std::vector<int> dims = Umu._grid->FullDimensions();
    assert(dims.size()==4);
    // Metadata holders
    ildgFormat     ildgFormat_    ;
    std::string    ildgLFN_       ;
    scidacChecksum scidacChecksum_; 
    usqcdInfo      usqcdInfo_     ;
    // track what we read from file
    int found_ildgFormat    =0;
    int found_ildgLFN       =0;
    int found_scidacChecksum=0;
    int found_usqcdInfo     =0;
    int found_ildgBinary =0;
    int found_FieldMetaData =0;
    uint32_t nersc_csum;
    uint32_t scidac_csuma;
    uint32_t scidac_csumb;
    // Binary format
    std::string format;
    //////////////////////////////////////////////////////////////////////////
    // Loop over all records
    // -- Order is poorly guaranteed except ILDG header preceeds binary section.
    // -- Run like an event loop.
    // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
    //    that Scidac. 
    // -- Insist on Scidac checksum record.
    //////////////////////////////////////////////////////////////////////////
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
      //////////////////////////////////////////////////////////////////
      // If not BINARY_DATA read a string and parse
      //////////////////////////////////////////////////////////////////
      if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) )  ) {
 	// Copy out the string
 	std::vector<char> xmlc(nbytes+1,'\0');
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
 	//////////////////////////////////
 	// ILDG format record
  std::string xmlstring(&xmlc[0]);
 	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"ildgFormat",ildgFormat_);
 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
 	  if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
 	  assert( ildgFormat_.lx == dims[0]);
 	  assert( ildgFormat_.ly == dims[1]);
 	  assert( ildgFormat_.lz == dims[2]);
 	  assert( ildgFormat_.lt == dims[3]);
 	  found_ildgFormat = 1;
 	}
 	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
 	  FieldMetaData_.ildg_lfn = xmlstring;
 	  found_ildgLFN = 1;
 	}
 	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) { 
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"FieldMetaData",FieldMetaData_);
 	  format = FieldMetaData_.floating_point;
 	  assert(FieldMetaData_.dimension[0] == dims[0]);
 	  assert(FieldMetaData_.dimension[1] == dims[1]);
 	  assert(FieldMetaData_.dimension[2] == dims[2]);
 	  assert(FieldMetaData_.dimension[3] == dims[3]);
 	  found_FieldMetaData = 1;
 	}
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
 	  // is it a USQCD info field
 	  if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) { 
 	    //	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
 	    XmlReader RD(xmlstring, true, "");
 	    read(RD,"usqcdInfo",usqcdInfo_);
 	    found_usqcdInfo = 1;
 	  }
 	}
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"scidacChecksum",scidacChecksum_);
 	  found_scidacChecksum = 1;
 	}
      } else {  
 	/////////////////////////////////
 	// Binary data
 	/////////////////////////////////
 	std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
 	uint64_t offset= ftello(File);
 	if ( format == std::string("IEEE64BIG") ) {
 	  GaugeSimpleMunger<dobj, sobj> munge;
 	  BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	} else { 
 	  GaugeSimpleMunger<fobj, sobj> munge;
 	  BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	}
 	found_ildgBinary = 1;
      }
    }
    //////////////////////////////////////////////////////
    // Minimally must find binary segment and checksum
    // Since this is an ILDG reader require ILDG format
    //////////////////////////////////////////////////////
    assert(found_ildgBinary);
    assert(found_ildgFormat);
    assert(found_scidacChecksum);
    // Must find something with the lattice dimensions
    assert(found_FieldMetaData||found_ildgFormat);
    if ( found_FieldMetaData ) {
      std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
    } else { 
      assert(found_ildgFormat);
      assert ( ildgFormat_.field == std::string("su3gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
      ///////////////////////////////////////////////////////////////////////////////////////
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
      FieldMetaData_.dimension[0] = ildgFormat_.lx ;
      FieldMetaData_.dimension[1] = ildgFormat_.ly ;
      FieldMetaData_.dimension[2] = ildgFormat_.lz ;
      FieldMetaData_.dimension[3] = ildgFormat_.lt ;
      if ( found_usqcdInfo ) { 
 	FieldMetaData_.plaquette = usqcdInfo_.plaq;
 	FieldMetaData_.link_trace= usqcdInfo_.linktr;
 	std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
 	std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
 	std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
      } else { 
 	FieldMetaData_.plaquette = 0.0;
 	FieldMetaData_.link_trace= 0.0;
 	std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
      }
    }
    ////////////////////////////////////////////////////////////
    // Really really want to mandate a scidac checksum
    ////////////////////////////////////////////////////////////
    if ( found_scidacChecksum ) {
      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
      scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
      std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
    } else { 
      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
      assert(0); // Can I insist always checksum ?
    }
    if ( found_FieldMetaData || found_usqcdInfo ) {
      FieldMetaData checker;
      GaugeStatistics(Umu,checker);
      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
    }
  }
 };
 }}
 //HAVE_LIME
 #endif
 #endif
@@ -0,0 +1,237 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/parallelIO/IldgIO.h
 Copyright (C) 2015
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_ILDGTYPES_IO_H
 #define GRID_ILDGTYPES_IO_H
 #ifdef HAVE_LIME
 extern "C" { // for linkage
 #include "lime.h"
 }
 namespace Grid {
 /////////////////////////////////////////////////////////////////////////////////
 // Data representation of records that enter ILDG and SciDac formats
 /////////////////////////////////////////////////////////////////////////////////
 // Record type name strings for the Grid, ILDG and SciDAC records.
 #define GRID_FORMAT      "grid-format"
 #define ILDG_FORMAT      "ildg-format"
 #define ILDG_BINARY_DATA "ildg-binary-data"
 #define ILDG_DATA_LFN    "ildg-data-lfn"
 #define SCIDAC_CHECKSUM           "scidac-checksum"
 #define SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
 #define SCIDAC_FILE_XML           "scidac-file-xml"
 #define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
 #define SCIDAC_RECORD_XML         "scidac-record-xml"
 #define SCIDAC_BINARY_DATA        "scidac-binary-data"
 // Unused SCIDAC records names; could move to support this functionality
 #define SCIDAC_SITELIST           "scidac-sitelist"
  ////////////////////////////////////////////////////////////
  // Integer codes taken from QIO; GRID_IO_SINGLEFILE is the volfmt value set
  // by scidacFile(GridBase *).
  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
  ////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////
 // QIO uses mandatory "private" records fixed format
 // Private is in principle "opaque"; however, it can't be changed now because that would break existing
 // file compatibility, so it should be correct to assume the undocumented but de facto file structure.
 /////////////////////////////////////////////////////////////////////////////////
 struct emptyUserRecord : Serializable { 
  GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
  emptyUserRecord() { dummy=0; };
 };
 ////////////////////////
 // Scidac private file xml
 // <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
 ////////////////////////
 struct scidacFile : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
                                  double, version,
                                  int, spacetime,
 				  std::string, dims, // must convert to int
                                  int, volfmt);
  std::vector<int> getDimensions(void) { 
    std::stringstream stream(dims);
    std::vector<int> dimensions;
    int n;
    while(stream >> n){
      dimensions.push_back(n);
    }
    return dimensions;
  }
  void setDimensions(std::vector<int> dimensions) { 
    char delimiter = ' ';
    std::stringstream stream;
    for(int i=0;i<dimensions.size();i++){ 
      stream << dimensions[i];
      if ( i != dimensions.size()-1) { 
 	stream << delimiter <<std::endl;
      }
    }
    dims = stream.str();
  }
  // Constructor provides Grid
  scidacFile() =default; // default constructor
  scidacFile(GridBase * grid){
    version      = 1.0;
    spacetime    = grid->_ndimension;
    setDimensions(grid->FullDimensions()); 
    volfmt       = GRID_IO_SINGLEFILE;
  }
 };
 ///////////////////////////////////////////////////////////////////////
 // scidac-private-record-xml : example
 // <scidacRecord>
 // <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
 // <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
 // <typesize>144</typesize><datacount>4</datacount>
 // </scidacRecord>
 ///////////////////////////////////////////////////////////////////////
 // Serialisable form of the scidac private record (see the example XML above).
 struct scidacRecord : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
                                  double, version,
                                  std::string, date,
 				  int, recordtype,
 				  std::string, datatype,
 				  std::string, precision,
 				  int, colors,
 				  int, spins,
 				  int, typesize,
 				  int, datacount);
  // All arithmetic members get a defined starting value; the std::string
  // members (date, datatype, precision) start empty.
  scidacRecord()
  : version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
  {}
 };
 ////////////////////////
 // ILDG format
 ////////////////////////
 struct ildgFormat : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
 				  double, version,
 				  std::string, field,
 				  int, precision,
 				  int, lx,
 				  int, ly,
 				  int, lz,
 				  int, lt);
  ildgFormat() { version=1.0; };
 };
 ////////////////////////
 // USQCD info
 ////////////////////////
 struct usqcdInfo : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
 				  double, version,
 				  double, plaq,
 				  double, linktr,
 				  std::string, info);
  usqcdInfo() { 
    version=1.0; 
  };
 };
 ////////////////////////
 // Scidac Checksum
 ////////////////////////
 struct scidacChecksum : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
 				  double, version,
 				  std::string, suma,
 				  std::string, sumb);
  scidacChecksum() { 
    version=1.0; 
  };
 };
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Type:           
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////
 // Scidac private file xml 
 // <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile> 
 ////////////////////////                                                                                                                                                                              
 #if 0
 ////////////////////////////////////////////////////////////////////////////////////////
 // From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
 ////////////////////////////////////////////////////////////////////////////////////////
 struct usqcdPropFile : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
 				  double, version,
 				  std::string, type,
 				  std::string, info);
  usqcdPropFile() { 
    version=1.0; 
  };
 };
 struct usqcdSourceInfo : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
 				  double, version,
 				  std::string, info);
  usqcdSourceInfo() { 
    version=1.0; 
  };
 };
 struct usqcdPropInfo : Serializable { 
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
 				  double, version,
 				  int, spin,
 				  int, color,
 				  std::string, info);
  usqcdPropInfo() { 
    version=1.0; 
  };
 };
 #endif
 }
 #endif
 #endif
@@ -0,0 +1,327 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/parallelIO/NerscIO.h
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <algorithm>
 #include <ctime>
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <map>
 #include <unistd.h>
 #include <sys/utsname.h>
 #include <pwd.h>
 namespace Grid {
  ///////////////////////////////////////////////////////
  // Precision mapping
  ///////////////////////////////////////////////////////
  // Maps the real scalar type underlying vobj to a format name:
  // "IEEE64BIG" when it is the size of double, "IEEE32BIG" when it is the size
  // of float, and an empty string otherwise.
  template<class vobj> static std::string getFormatString (void)
  {
    typedef typename getPrecision<vobj>::real_scalar_type stype;
    const size_t width = sizeof(stype);

    // double is tested first so that it wins should both sizes ever coincide,
    // matching the original "last assignment wins" ordering.
    if ( width == sizeof(double) ) {
      return std::string("IEEE64BIG");
    }
    if ( width == sizeof(float) ) {
      return std::string("IEEE32BIG");
    }
    return std::string();
  }
  ////////////////////////////////////////////////////////////////////////////////
  // header specification/interpretation
  ////////////////////////////////////////////////////////////////////////////////
    // Header / metadata describing one stored lattice field. The members are
    // declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
    // NOTE(review): declared with `class`, so the Serializable base is inherited
    // privately here, unlike the `struct ... : Serializable` records elsewhere.
    class FieldMetaData : Serializable {
    public:
      GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
 				      int, nd,
 				      std::vector<int>, dimension,
 				      std::vector<std::string>, boundary,
 				      int, data_start,
 				      std::string, hdr_version,
 				      std::string, storage_format,
 				      double, link_trace,
 				      double, plaquette,
 				      uint32_t, checksum,
 				      uint32_t, scidac_checksuma,
 				      uint32_t, scidac_checksumb,
 				      unsigned int, sequence_number,
 				      std::string, data_type,
 				      std::string, ensemble_id,
 				      std::string, ensemble_label,
 				      std::string, ildg_lfn,
 				      std::string, creator,
 				      std::string, creator_hardware,
 				      std::string, creation_date,
 				      std::string, archive_date,
 				      std::string, floating_point);
      // WARNING: non-initialised values might lead to twisted parallel IO
      // issues; std::string members are fine because they initialise to size 0
      // as per the C++ standard.
      // Defaults: nd = 4, four zero dimensions, four empty boundary strings,
      // and zero for every other arithmetic member.
      FieldMetaData(void) 
      : nd(4), dimension(4,0), boundary(4, ""), data_start(0),
      link_trace(0.), plaquette(0.), checksum(0),
      scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
      {}
    };
  namespace QCD {
    using namespace Grid;
    //////////////////////////////////////////////////////////////////////
    // Bit and Physical Checksumming and QA of data
    //////////////////////////////////////////////////////////////////////
    // Fills the geometry part of `header` from `grid`: number of dimensions,
    // the extents taken from grid->_fdimensions, a "PERIODIC" boundary label in
    // every direction, and data_start = 0.
    inline void GridMetaData(GridBase *grid,FieldMetaData &header)
    {
      const int ndim = grid->_ndimension;

      header.nd         = ndim;
      header.data_start = 0;
      header.dimension.resize(ndim);
      header.boundary.resize(ndim);

      for(int mu=0;mu<ndim;mu++) {
        header.dimension[mu] = grid->_fdimensions[mu];
        header.boundary[mu]  = std::string("PERIODIC");
      }
    }
    // Fills the provenance fields of `header`:
    //   creator           - login name of the calling user (left untouched if
    //                       the password database lookup fails)
    //   creation_date     - current local time formatted with "%c %Z"
    //   archive_date      - copy of creation_date
    //   creator_hardware  - "<nodename>-<machine>-<sysname>-<release>" from
    //                       uname (left untouched if uname fails)
    inline void MachineCharacteristics(FieldMetaData &header)
    {
      // Who
      struct passwd *pw = getpwuid (getuid());
      if (pw) header.creator = std::string(pw->pw_name); 

      // When
      // std::strftime is used instead of std::put_time (the previously
      // commented-out formatting line), so the date is no longer left empty.
      // The format string is the one from that commented-out line.
      std::time_t t = std::time(nullptr);
      std::tm *tm_ = std::localtime(&t);
      char date_buf[128];
      date_buf[0] = '\0';
      if ( tm_ != nullptr ) {
        // strftime returns 0 when the result does not fit; the buffer content
        // is then unspecified, so fall back to an empty date.
        if ( std::strftime(date_buf, sizeof(date_buf), "%c %Z", tm_) == 0 ) {
          date_buf[0] = '\0';
        }
      }
      header.creation_date = std::string(date_buf);
      header.archive_date  = header.creation_date;

      // What
      struct utsname name;
      if ( uname(&name) == 0 ) {
        header.creator_hardware = std::string(name.nodename)+"-";
        header.creator_hardware+= std::string(name.machine)+"-";
        header.creator_hardware+= std::string(name.sysname)+"-";
        header.creator_hardware+= std::string(name.release);
      }
    }
 /* dump_meta_data(field, s): streams the members of `field` to the output
  * stream `s` as "KEY = value" lines between BEGIN_HEADER and END_HEADER.
  * - The DIMENSION_ and BOUNDARY_ loops are hard-coded to 4 entries, so
  *   field.dimension and field.boundary must hold at least 4 elements.
  * - The expansion is a sequence of statements, not a single statement; do not
  *   use it as the unbraced body of an if / for.
  * - std::setprecision(10) is applied to `s` and is not reset afterwards.
  */
 #define dump_meta_data(field, s)					\
      s << "BEGIN_HEADER"      << std::endl;				\
      s << "HDR_VERSION = "    << field.hdr_version    << std::endl;	\
      s << "DATATYPE = "       << field.data_type      << std::endl;	\
      s << "STORAGE_FORMAT = " << field.storage_format << std::endl;	\
      for(int i=0;i<4;i++){						\
 	s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
      }									\
      s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
      s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl; \
      for(int i=0;i<4;i++){						\
 	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
      }									\
 									\
      s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
      s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
      s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
      s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
      s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;	\
      s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;	\
      s << "CREATOR = "         << field.creator          << std::endl;	\
      s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;	\
      s << "CREATION_DATE = "   << field.creation_date    << std::endl;	\
      s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;	\
      s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
      s << "END_HEADER"         << std::endl;
 // Generic metadata preparation for an arbitrary lattice field: records the
 // floating-point format of vobj, zeroes the checksum, then fills in the grid
 // geometry and the machine/user provenance.
 template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
 {
   header.floating_point = getFormatString<vobj>();
   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac

   GridMetaData(field._grid,header);
   MachineCharacteristics(header);
 }
 // Single-precision overload: stores the link trace and the average plaquette
 // of `data`, as computed by WilsonLoops<PeriodicGimplF>, into `header`.
 inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
 {
   typedef Grid::QCD::WilsonLoops<PeriodicGimplF> Loops;

   // Evaluation order (link trace first, then plaquette) is kept as in the original.
   header.link_trace = Loops::linkTrace(data);
   header.plaquette  = Loops::avgPlaquette(data);
 }
 // Double-precision overload: stores the link trace and the average plaquette
 // of `data`, as computed by WilsonLoops<PeriodicGimplD>, into `header`.
 inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
 {
   typedef Grid::QCD::WilsonLoops<PeriodicGimplD> Loops;

   // Evaluation order (link trace first, then plaquette) is kept as in the original.
   header.link_trace = Loops::linkTrace(data);
   header.plaquette  = Loops::avgPlaquette(data);
 }
 // Single-precision gauge-field specialisation: same as the generic version,
 // but additionally records the gauge statistics (link trace and plaquette)
 // between the geometry and the provenance steps.
 template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
 {
   header.floating_point = getFormatString<vLorentzColourMatrixF>();
   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac

   GridMetaData(field._grid,header);
   GaugeStatistics(field,header);
   MachineCharacteristics(header);
 }
 // Double-precision gauge-field specialisation: same as the generic version,
 // but additionally records the gauge statistics (link trace and plaquette)
 // between the geometry and the provenance steps.
 template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
 {
   header.floating_point = getFormatString<vLorentzColourMatrixD>();
   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac

   GridMetaData(field._grid,header);
   GaugeStatistics(field,header);
   MachineCharacteristics(header);
 }
    //////////////////////////////////////////////////////////////////////
    // Utilities ; these are QCD aware
    //////////////////////////////////////////////////////////////////////
    // For each of the Nd directions, overwrites row 2 of the colour matrix with
    // adj() of the cross-product-like combination of rows 0 and 1 (see the
    // per-line formulas). Rows 0 and 1 are read only.
    // NOTE(review): presumably this rebuilds the third row of an SU(3) matrix
    // from the first two -- confirm against the callers.
    inline void reconstruct3(LorentzColourMatrix & cm)
    {
      // Column indices, named so the expressions below read like a cross product
      const int x=0;
      const int y=1;
      const int z=2;
      for(int mu=0;mu<Nd;mu++){
 	cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
 	cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
 	cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
      }
    }
    ////////////////////////////////////////////////////////////////////////////////
    // Some data types for intermediate storage
    ////////////////////////////////////////////////////////////////////////////////
    // Nested vector type: Nd entries, each holding 2 vectors of Nc elements,
    // i.e. only two rows of each colour matrix are stored.
    template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
    // Instantiations for the default, single- and double-precision complex types
    typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
    typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
    typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
 /////////////////////////////////////////////////////////////////////////////////
 // Simple classes for precision conversion
 /////////////////////////////////////////////////////////////////////////////////
 template <class fobj, class sobj>
 struct BinarySimpleUnmunger {
  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
  void operator()(sobj &in, fobj &out) {
    // take word by word and transform accoding to the status
    fobj_stype *out_buffer = (fobj_stype *)&out;
    sobj_stype *in_buffer = (sobj_stype *)&in;
    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
    assert(fobj_words == sobj_words);
    for (unsigned int word = 0; word < sobj_words; word++)
      out_buffer[word] = in_buffer[word];  // type conversion on the fly
  }
 };
 template <class fobj, class sobj>
 struct BinarySimpleMunger {
  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
  void operator()(fobj &in, sobj &out) {
    // take word by word and transform accoding to the status
    fobj_stype *in_buffer = (fobj_stype *)&in;
    sobj_stype *out_buffer = (sobj_stype *)&out;
    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
    assert(fobj_words == sobj_words);
    for (unsigned int word = 0; word < sobj_words; word++)
      out_buffer[word] = in_buffer[word];  // type conversion on the fly
  }
 };
    // Functor copying a full Nc x Nc colour matrix per direction from an fobj
    // into an sobj, element by element.
    template<class fobj,class sobj>
    struct GaugeSimpleMunger{
      void operator()(fobj &in, sobj &out) {
        for (int dir = 0; dir < Nd; ++dir) {
          for (int row = 0; row < Nc; ++row) {
            for (int col = 0; col < Nc; ++col) {
              out(dir)()(row, col) = in(dir)()(row, col);
            }
          }
        }
      }
    };
    // Functor copying a full Nc x Nc colour matrix per direction from an sobj
    // into an fobj, element by element.
    template <class fobj, class sobj>
    struct GaugeSimpleUnmunger {
      void operator()(sobj &in, fobj &out) {
        for (int dir = 0; dir < Nd; ++dir) {
          for (int row = 0; row < Nc; ++row) {
            for (int col = 0; col < Nc; ++col) {
              out(dir)()(row, col) = in(dir)()(row, col);
            }
          }
        }
      }
    };
    // Functor expanding a two-row (2 x 3) representation into a full matrix:
    // rows 0 and 1 are copied for every direction, then reconstruct3() fills
    // in row 2.
    template<class fobj,class sobj>
    struct Gauge3x2munger{
      void operator() (fobj &in,sobj &out){
        for(int dir=0;dir<Nd;++dir){
          for(int row=0;row<2;++row){
            for(int col=0;col<3;++col){
              out(dir)()(row,col) = in(dir)(row)(col);
            }
          }
        }
        reconstruct3(out);
      }
    };
    // Functor compressing a full matrix into the two-row (2 x 3)
    // representation: only rows 0 and 1 are copied for every direction.
    template<class fobj,class sobj>
    struct Gauge3x2unmunger{
      void operator() (sobj &in,fobj &out){
        for(int dir=0;dir<Nd;++dir){
          for(int row=0;row<2;++row){
            for(int col=0;col<3;++col){
              out(dir)(row)(col) = in(dir)()(row,col);
            }
          }
        }
      }
    };
  }
 }
@@ -0,0 +1,363 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/parallelIO/NerscIO.h
    Copyright (C) 2015
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 namespace Grid {
  namespace QCD {
    using namespace Grid;
    ////////////////////////////////////////////////////////////////////////////////
    // Write and read from fstream; compute header offset for payload
    ////////////////////////////////////////////////////////////////////////////////
    class NerscIO : public BinaryIO { 
    public:
      // Opens `file` as an output-only stream and lets it close again at scope
      // exit; an output-only open discards any contents the file already had.
      static inline void truncate(std::string file){
        std::ofstream discard(file, std::ios::out);
      }
      // Writes the metadata of `field` at the very beginning of `file` and
      // records the stream position that follows it in field.data_start.
      // Returns field.data_start (converted to unsigned int).
      static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
      {
        // in|out opens the existing file without discarding its contents, so
        // a header can be rewritten in place after the payload was written.
        std::ofstream stream(file, std::ios::in | std::ios::out);
        stream.seekp(0, std::ios::beg);
        dump_meta_data(field, stream);
        field.data_start = stream.tellp();
        return field.data_start;
      }
      // for the header-reader
      // Parses the NERSC text header of `file` into `field`.
      //
      // The header must start with a BEGIN_HEADER line and is read line by line
      // up to and including the line containing END_HEADER; every "key = value"
      // line is stored with whitespace removed from both parts.  The stream
      // position after the END_HEADER line is stored in field.data_start and
      // returned.
      //
      // file  : path of the file to read
      // grid  : must be four dimensional with full dimensions equal to
      //         DIMENSION_1..4 of the header (asserted)
      // field : receives the parsed metadata
      //
      // std::stol / std::stod / std::stoul throw if a numeric key is missing or
      // malformed.
      static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
      {
        std::map<std::string,std::string> header;
        std::string line;

        //////////////////////////////////////////////////
        // read the header
        //////////////////////////////////////////////////
        std::ifstream fin(file);

        std::getline(fin,line); // first line must be the BEGIN_HEADER marker
        removeWhitespace(line);
        std::cout << GridLogMessage << "* " << line << std::endl;
        assert(line==std::string("BEGIN_HEADER"));

        // Stop on END_HEADER *or* on a failed read: a failed getline leaves
        // `line` unchanged, so testing only for END_HEADER would spin forever
        // on a truncated or unreadable file.
        bool found_end = false;
        while ( !found_end && std::getline(fin,line) ) {
          std::cout << GridLogMessage << "* "<<line<< std::endl;

          const std::string::size_type eq = line.find("=");
          if ( eq != std::string::npos && eq > 0 ) {
            std::string key=line.substr(0,eq);
            std::string val=line.substr(eq+1);
            removeWhitespace(key);
            removeWhitespace(val);
            header[key] = val;
          }

          found_end = ( line.find("END_HEADER") != std::string::npos );
        }
        assert(found_end); // header ended without an END_HEADER line

        field.data_start = fin.tellg();

        //////////////////////////////////////////////////
        // chomp the values
        //////////////////////////////////////////////////
        field.hdr_version    = header["HDR_VERSION"];
        field.data_type      = header["DATATYPE"];
        field.storage_format = header["STORAGE_FORMAT"];

        field.dimension[0] = std::stol(header["DIMENSION_1"]);
        field.dimension[1] = std::stol(header["DIMENSION_2"]);
        field.dimension[2] = std::stol(header["DIMENSION_3"]);
        field.dimension[3] = std::stol(header["DIMENSION_4"]);

        assert(grid->_ndimension == 4);
        for(int d=0;d<4;d++){
          assert(grid->_fdimensions[d]==field.dimension[d]);
        }

        field.link_trace = std::stod(header["LINK_TRACE"]);
        field.plaquette  = std::stod(header["PLAQUETTE"]);

        field.boundary[0] = header["BOUNDARY_1"];
        field.boundary[1] = header["BOUNDARY_2"];
        field.boundary[2] = header["BOUNDARY_3"];
        field.boundary[3] = header["BOUNDARY_4"];

        field.checksum = std::stoul(header["CHECKSUM"],0,16); // hexadecimal in the file

        field.ensemble_id      = header["ENSEMBLE_ID"];
        field.ensemble_label   = header["ENSEMBLE_LABEL"];
        field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
        field.creator          = header["CREATOR"];
        field.creator_hardware = header["CREATOR_HARDWARE"];
        field.creation_date    = header["CREATION_DATE"];
        field.archive_date     = header["ARCHIVE_DATE"];
        field.floating_point   = header["FLOATING_POINT"];

        return field.data_start;
      }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Now the meat: the object readers
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Reads a NERSC gauge configuration from `file` into `Umu`.
    //
    // The header is parsed with readHeader (filling `header`), the payload is
    // read with the munger selected by DATATYPE and FLOATING_POINT, and the
    // plaquette, link trace and checksum recomputed from the data are compared
    // with the header values.
    //
    // Umu    : destination lattice; its grid must match the header dimensions
    // header : receives the parsed file header
    // file   : path of the configuration file
    //
    // A checksum mismatch terminates the program with a failure status; an
    // unknown DATATYPE or a plaquette / link-trace mismatch trips an assert.
    template<class vsimd>
    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
                                         FieldMetaData& header,
                                         std::string file)
    {
      uint64_t offset = readHeader(file,Umu._grid,header);

      // Copy of the header that GaugeStatistics overwrites with the values
      // measured on the data actually read.
      FieldMetaData clone(header);

      std::string format(header.floating_point);

      int ieee32big = (format == std::string("IEEE32BIG"));
      int ieee32    = (format == std::string("IEEE32"));
      int ieee64big = (format == std::string("IEEE64BIG"));
      int ieee64    = (format == std::string("IEEE64"));

      // Zero-initialised so that an unrecognised FLOATING_POINT string (no
      // reader is invoked below) yields a defined value in the comparison
      // instead of reading indeterminate memory.
      uint32_t nersc_csum   = 0;
      uint32_t scidac_csuma = 0;
      uint32_t scidac_csumb = 0;

      // depending on datatype, set up munger;
      // munger is a function of <floating point, Real, data_type>
      if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
        if ( ieee32 || ieee32big ) {
          BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
            (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
             nersc_csum,scidac_csuma,scidac_csumb);
        }
        if ( ieee64 || ieee64big ) {
          BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
            (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
             nersc_csum,scidac_csuma,scidac_csumb);
        }
      } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
        if ( ieee32 || ieee32big ) {
          BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
            (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
             nersc_csum,scidac_csuma,scidac_csumb);
        }
        if ( ieee64 || ieee64big ) {
          BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
            (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
             nersc_csum,scidac_csuma,scidac_csumb);
        }
      } else {
        assert(0); // unsupported DATATYPE
      }

      GaugeStatistics(Umu,clone);

      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
               <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
               <<" header    "<<header.plaquette<<std::endl;
      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
               <<" header    "<<header.link_trace<<std::endl;

      if ( fabs(clone.plaquette -header.plaquette ) >=  1.0e-5 ) {
        // Dump the first two sites to help diagnose the mismatch.
        std::cout << " Plaquette mismatch "<<std::endl;
        std::cout << Umu[0]<<std::endl;
        std::cout << Umu[1]<<std::endl;
      }
      if ( nersc_csum != header.checksum ) {
        std::cerr << " checksum mismatch " << std::endl;
        std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
        std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
        std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
        // Corrupt input is an error: do not report success to the caller's shell.
        exit(EXIT_FAILURE);
      }
      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
      assert(nersc_csum == header.checksum );

      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
    }
      // Writes `Umu` to `file` as a NERSC configuration.
      //
      // The payload is always written as 3x3 matrices in IEEE64BIG; `two_row`
      // and `bits32` are accepted but not consulted by this implementation.
      // The boss rank truncates the file and writes a first header to learn the
      // payload offset, which is broadcast to all ranks; after the payload has
      // been written the header is rewritten with the final checksum.
      template<class vsimd>
      static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
                                            std::string file,
                                            int two_row,
                                            int bits32)
      {
        typedef iLorentzColourMatrix<vsimd>  vobj;
        typedef typename vobj::scalar_object sobj;
        typedef LorentzColourMatrixD         fobj3D;

        FieldMetaData header;

        ///////////////////////////////////////////
        // Following should become arguments
        ///////////////////////////////////////////
        header.sequence_number = 1;
        header.ensemble_id     = "UKQCD";
        header.ensemble_label  = "DWF";

        GridBase *grid = Umu._grid;

        GridMetaData(grid,header);
        assert(header.nd==4);
        GaugeStatistics(Umu,header);
        MachineCharacteristics(header);

        // Sod it -- always write 3x3 double
        header.floating_point = std::string("IEEE64BIG");
        header.data_type      = std::string("4D_SU3_GAUGE_3x3");
        GaugeSimpleUnmunger<fobj3D,sobj> unmunger;

        // Only the boss rank touches the header; everyone else receives the
        // payload offset through the broadcast.
        uint64_t payload_offset;
        if ( grid->IsBoss() ) {
          truncate(file);
          payload_offset = writeHeader(header,file);
        }
        grid->Broadcast(0,(void *)&payload_offset,sizeof(payload_offset));

        uint32_t nersc_csum,scidac_csuma,scidac_csumb;
        BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,unmunger,payload_offset,header.floating_point,
                                                  nersc_csum,scidac_csuma,scidac_csumb);

        // Second pass over the header now that the checksum is known.
        header.checksum = nersc_csum;
        if ( grid->IsBoss() ) {
          writeHeader(header,file);
        }

        std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
                 <<std::hex<<header.checksum
                 <<std::dec<<" plaq "<< header.plaquette <<std::endl;
      }
      ///////////////////////////////
      // RNG state
      ///////////////////////////////
      // Writes the state of the serial and parallel RNGs to `file` behind a
      // NERSC-style header.  FLOATING_POINT and DATATYPE are set according to
      // the RNG selected at compile time (RNG_RANLUX, RNG_MT19937, RNG_SITMO).
      // The boss rank writes the header twice: once to learn the payload
      // offset, and once more after the checksum is known.
      static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
      {
        GridBase *grid = parallel._grid;

        // Following should become arguments
        FieldMetaData header;
        header.sequence_number = 1;
        header.ensemble_id     = "UKQCD";
        header.ensemble_label  = "DWF";

        GridMetaData(grid,header);
        assert(header.nd==4);

        // No gauge observables are associated with an RNG file.
        header.link_trace=0.0;
        header.plaquette=0.0;
        MachineCharacteristics(header);

 #ifdef RNG_RANLUX
        header.floating_point = std::string("UINT64");
        header.data_type      = std::string("RANLUX48");
 #endif
 #ifdef RNG_MT19937
        header.floating_point = std::string("UINT32");
        header.data_type      = std::string("MT19937");
 #endif
 #ifdef RNG_SITMO
        header.floating_point = std::string("UINT64");
        header.data_type      = std::string("SITMO");
 #endif

        uint64_t payload_offset;
        if ( grid->IsBoss() ) {
          truncate(file);
          payload_offset = writeHeader(header,file);
        }
        grid->Broadcast(0,(void *)&payload_offset,sizeof(payload_offset));

        uint32_t nersc_csum,scidac_csuma,scidac_csumb;
        BinaryIO::writeRNG(serial,parallel,file,payload_offset,nersc_csum,scidac_csuma,scidac_csumb);

        header.checksum = nersc_csum;
        if ( grid->IsBoss() ) {
          payload_offset = writeHeader(header,file);
        }

        std::cout<<GridLogMessage
                 <<"Written NERSC RNG STATE "<<file<< " checksum "
                 <<std::hex<<header.checksum
                 <<std::dec<<std::endl;
      }
      // Restores the serial and parallel RNG state from `file`.
      //
      // The header is parsed with readHeader (filling `header`); its
      // FLOATING_POINT and DATATYPE entries must match the RNG selected at
      // compile time (asserted).  The checksum computed while reading is
      // compared with the header value; a mismatch terminates the program
      // with a failure status.
      static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
      {
        GridBase *grid = parallel._grid;

        uint64_t offset = readHeader(file,grid,header);

        std::string format(header.floating_point);
        std::string data_type(header.data_type);

 #ifdef RNG_RANLUX
        assert(format == std::string("UINT64"));
        assert(data_type == std::string("RANLUX48"));
 #endif
 #ifdef RNG_MT19937
        assert(format == std::string("UINT32"));
        assert(data_type == std::string("MT19937"));
 #endif
 #ifdef RNG_SITMO
        assert(format == std::string("UINT64"));
        assert(data_type == std::string("SITMO"));
 #endif

        uint32_t nersc_csum,scidac_csuma,scidac_csumb;
        BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);

        if ( nersc_csum != header.checksum ) {
          std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
          // Corrupt input is an error: do not report success to the caller's shell.
          exit(EXIT_FAILURE);
        }
        assert(nersc_csum == header.checksum );

        std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
      }
    };
  }}
 #endif
@@ -0,0 +1,75 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/PerfCount.cc
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/perfmon/PerfCount.h>
 namespace Grid {
 // CacheControl(L,O,R): builds a PERF_TYPE_HW_CACHE config word by OR-ing the
 // cache id PERF_COUNT_HW_CACHE_<L>, the operation id PERF_COUNT_HW_CACHE_OP_<O>
 // shifted left by 8 and the result id PERF_COUNT_HW_CACHE_RESULT_<R> shifted
 // left by 16.
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 // RawConfig(A,B): builds a PERF_TYPE_RAW config word as (A<<8)|B.  The
 // arguments are not parenthesised, so pass plain literals only.
 #define RawConfig(A,B) (A<<8|B)
 // Table of the counters PerformanceCounter can open.  Each row is
 //   { perf event type, perf event config, printable name, normalisation }
 // where `normalisation` is the index (a PerformanceCounterType value) of the
 // row in this same table whose counter is opened alongside and used as the
 // denominator of the reported rate.  The table is empty on non-Linux builds.
 // The "// N" markers give the index of the next row.
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 #ifdef __linux__
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." , INSTRUCTIONS},
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." , CACHE_REFERENCES},
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
    // 4
    // Both branches below contribute seven rows (indices 4..10).
    // NOTE(review): each branch lists one row twice (ALL_LOADS /
    // L1D_PREFETCH_ACCESS), presumably as padding so the later indices line up
    // in both builds -- confirm before removing either duplicate.
 #ifdef KNL
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
    { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS  },
    { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS  },
    { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
    { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
    // 11
 #else
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS....",INSTRUCTIONS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
    // 11
 #endif
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS.......",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS.....",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
    //15
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......",INSTRUCTIONS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS....",INSTRUCTIONS}
    //19
  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 #endif
 };
 }
@@ -0,0 +1,245 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/PerfCount.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <peterboyle@MacBook-Pro.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PERFCOUNT_H
 #define GRID_PERFCOUNT_H
 #include <sys/time.h>
 #include <ctime>
 #include <chrono>
 #include <string.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #ifdef __linux__
 #include <syscall.h>
 #include <linux/perf_event.h>
 #else
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
 #include <x86intrin.h>
 #endif
 namespace Grid {
 #ifdef __linux__
 // Thin wrapper that issues the perf_event_open system call through syscall().
 // The raw result is narrowed to int before being returned as long; it is the
 // new descriptor on success and -1 on failure.
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                             int cpu, int group_fd, unsigned long flags)
 {
  const int descriptor = syscall(__NR_perf_event_open, hw_event, pid, cpu,
                                 group_fd, flags);
  return descriptor;
 }
 #endif
 #ifdef TIMERS_OFF
 // Build with TIMERS_OFF: the cycle counter is a stub returning zero and the
 // __SSC_* markers expand to the inline-assembly marker sequence.
 inline uint64_t cyclecount(void){ return 0; }
 #define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 #define __SSC_STOP  __SSC_MARK(0x110)
 #define __SSC_START __SSC_MARK(0x111)
 #else
 // Default build: the __SSC_* markers expand to nothing.
 #define __SSC_MARK(mark)
 #define __SSC_STOP
 #define __SSC_START
 /*
  * Architecture dependent cycle counters.
  */
 #ifdef __bgq__
 // Reads special purpose register 0x10C with mfspr.
 inline uint64_t cyclecount(void){
   uint64_t ticks;
   asm volatile ("mfspr %0,0x10C" : "=&r" (ticks)  );
   return ticks;
 }
 #elif defined __x86_64__
 // Reads the time stamp counter with __rdtsc() (__rdtscp is not used).
 inline uint64_t cyclecount(void){
  return __rdtsc();
 }
 #else
 // No cycle counter available on this architecture.
 inline uint64_t cyclecount(void){ return 0; }
 #endif
 #endif
 class PerformanceCounter {
 private:
  // One row of the counter table: describes a single perf event.
  typedef struct { 
  public:
    uint32_t type;      // copied into perf_event_attr.type by Open()
    uint64_t config;    // copied into perf_event_attr.config by Open()
    const char *name;   // label used in error messages and in Report()
    int normalisation;  // index of the table row opened as the reference counter
  } PerformanceCounterConfig; 
  // Table of available counters, indexed by PerformanceCounterType; defined
  // out of line.
  static const PerformanceCounterConfig PerformanceCounterConfigs [];
 public:
  // Indices into PerformanceCounterConfigs.  Only the entries that are used
  // by name are enumerated; any value in [0, PERFORMANCE_COUNTER_NUM_TYPES)
  // is accepted by the constructor.
  enum PerformanceCounterType {
    CACHE_REFERENCES=0,
    CACHE_MISSES=1,
    CPUCYCLES=2,
    INSTRUCTIONS=3,
    L1D_READ_ACCESS=4,
    PERFORMANCE_COUNTER_NUM_TYPES=19  // upper bound checked by the constructor; returned by NumTypes()
  };
 public:
  int PCT;                     // index of the measured counter in PerformanceCounterConfigs
  long long count;             // value of the measured counter, read in Stop()
  long long cycles;            // value of the normalisation counter, read in Stop()
  int fd;                      // descriptor of the measured counter, -1 if it could not be opened
  int cyclefd;                 // descriptor of the normalisation counter, -1 if it could not be opened
  unsigned long long elapsed;  // cyclecount() difference between Start() and Stop()
  uint64_t begin;              // cyclecount() value taken in Start()
  // Number of counter types, i.e. PERFORMANCE_COUNTER_NUM_TYPES.
  static int NumTypes(void)
  {
    const int n_types = PERFORMANCE_COUNTER_NUM_TYPES;
    return n_types;
  }
  PerformanceCounter(int _pct) {
 #ifdef __linux__
    assert(_pct>=0);
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
    fd=-1;
    cyclefd=-1;
    count=0;
    cycles=0;
    PCT =_pct;
    Open();
 #endif
  }
  void Open(void) 
  {
 #ifdef __linux__
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(struct perf_event_attr));
    pe.size = sizeof(struct perf_event_attr);
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;
    pe.inherit    = 1;
    pe.type  = PerformanceCounterConfigs[PCT].type;
    pe.config= PerformanceCounterConfigs[PCT].config;
    const char * name = PerformanceCounterConfigs[PCT].name;
    fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
    if (fd == -1) {
      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
      perror("Error is");
    }
    int norm = PerformanceCounterConfigs[PCT].normalisation;
    pe.type  = PerformanceCounterConfigs[norm].type;
    pe.config= PerformanceCounterConfigs[norm].config;
    name = PerformanceCounterConfigs[norm].name;
    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
    if (cyclefd == -1) {
      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
      perror("Error is");
    }
 #endif
  }
  // Begins a measurement: when the measured counter is open, both counters
  // are reset and enabled (measured counter first), then the cycle count at
  // start is recorded.  On non-Linux builds only `begin` is cleared.
  void Start(void)
  {
 #ifdef __linux__
    if ( fd != -1 ) {
      const int events[2] = { fd, cyclefd };
      for (int e = 0; e < 2; e++) {
        ::ioctl(events[e], PERF_EVENT_IOC_RESET, 0);
        ::ioctl(events[e], PERF_EVENT_IOC_ENABLE, 0);
      }
    }
    begin = cyclecount();
 #else
    begin = 0;
 #endif
  }
  // Ends a measurement: disables the counters, reads their values into
  // `count` and `cycles`, and stores the cycle-count difference since Start()
  // in `elapsed`.  `count` and `cycles` are zero when the corresponding
  // counter is not open.
  void Stop(void) {
    count=0;
    cycles=0;
 #ifdef __linux__
    if ( fd!= -1) {
      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ssize_t got      = ::read(fd, &count, sizeof(count));
      ssize_t expected = sizeof(count);

      // The normalisation counter may have failed to open (Open() already
      // reported that); only read it, and only expect its bytes, when it
      // exists.
      if ( cyclefd != -1 ) {
        ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
        got      += ::read(cyclefd, &cycles, sizeof(cycles));
        expected += sizeof(cycles);
      }

      // The original test was `assert(ign=...)`, an assignment that is always
      // true; compare the byte counts instead.
      assert(got == expected);
      (void)got; (void)expected;  // unused when assertions are compiled out
    }
    elapsed = cyclecount() - begin;
 #else
    elapsed = 0;
 #endif
  }
  void Report(void) {
 #ifdef __linux__
    int N = PerformanceCounterConfigs[PCT].normalisation;
    const char * sn = PerformanceCounterConfigs[N].name ;
    const char * sc = PerformanceCounterConfigs[PCT].name;
      std::printf("tsc = %llu %s = %llu  %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, 
 		  sc, count, sc,sn, (double)count/(double)cycles);
 #else
    std::printf("%llu cycles \n", elapsed );
 #endif
  }
  // Closes both perf event descriptors on Linux (measured counter first).
  ~PerformanceCounter()
  {
 #ifdef __linux__
    ::close(fd);
    ::close(cyclefd);
 #endif
  }
 };
 }
 #endif
@@ -0,0 +1,245 @@
 #include <Grid/GridCore.h>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/perfmon/Stat.h>
 namespace Grid { 
 bool PmuStat::pmu_initialized=false;
 // Names this statistics region and clears its counters.  The first call on
 // any PmuStat object also performs the one-off pmu_init().  Does nothing on
 // non-x86_64 builds.
 void PmuStat::init(const char *regname)
 {
 #ifdef __x86_64__
  name = regname;

  const bool first_use = !pmu_initialized;
  if (first_use) {
    std::cout<<"initialising pmu"<<std::endl;
    pmu_initialized = true;  // set before pmu_init(), as pmu_init() is static
    pmu_init();
  }

  clear();
 #endif
 }
 // Resets every cumulative counter of this region to zero (x86_64 only).
 void PmuStat::clear(void)
 {
 #ifdef __x86_64__
  // invocation count and timing
  count   = 0;
  tregion = 0;
  tcycles = 0;
  // programmable and fixed counters
  pmc0 = pmc1 = 0;
  inst = cyc = ref = 0;
  // memory traffic
  reads = writes = 0;
 #endif
 }
 // Writes the region name followed by every cumulative counter, one per
 // line, to std::cout (x86_64 only).
 void PmuStat::print(void)
 {
 #ifdef __x86_64__
  std::cout <<"Reg "<<std::string(name)<<":\n"
            <<"  region "<<tregion<<std::endl
            <<"  cycles "<<tcycles<<std::endl
            <<"  inst   "<<inst   <<std::endl
            <<"  cyc    "<<cyc    <<std::endl
            <<"  ref    "<<ref    <<std::endl
            <<"  pmc0   "<<pmc0   <<std::endl
            <<"  pmc1   "<<pmc1   <<std::endl
            <<"  count  "<<count  <<std::endl
            <<"  reads  "<<reads  <<std::endl
            <<"  writes "<<writes <<std::endl;
 #endif
 }
 // Opens a measured region (x86_64 only): calls the pmu_start() hook, counts
 // the invocation, samples the memory read/write counters into
 // mrstart/mwstart and finally records the time stamp counter in tstart.
 // The matching end-of-region sampling is done in accum().
 void PmuStat::start(void)
 {
 #ifdef __x86_64__
  pmu_start();
  ++count;
  xmemctrs(&mrstart, &mwstart);
  // taken last so the bookkeeping above is not included in the region time
  tstart = __rdtsc();
 #endif
 }
 // Records the starting values of the six per-thread counters for thread
 // slot `t` (x86_64 only).  Rows 0-1 come from __rdpmc(0) and __rdpmc(1),
 // rows 2-4 from __rdpmc with bit 30 set and index 0..2 (accumulated later as
 // inst, cyc and ref), and row 5 from the time stamp counter.
 // No bounds check is made on `t`.
 void PmuStat::enter(int t)
 {
 #ifdef __x86_64__
  const int bit30 = (1<<30);
  counters[0][t] = __rdpmc(0);
  counters[1][t] = __rdpmc(1);
  counters[2][t] = __rdpmc(bit30|0);
  counters[3][t] = __rdpmc(bit30|1);
  counters[4][t] = __rdpmc(bit30|2);
  counters[5][t] = __rdtsc();
 #endif
 }
 // Replaces the starting values stored by enter(t) with the difference
 // between the current counter values and those starting values, for the
 // same six counters and in the same order (x86_64 only).
 // No bounds check is made on `t`.
 void PmuStat::exit(int t)
 {
 #ifdef __x86_64__
  const int bit30 = (1<<30);
  counters[0][t] = __rdpmc(0)       - counters[0][t];
  counters[1][t] = __rdpmc(1)       - counters[1][t];
  counters[2][t] = __rdpmc(bit30|0) - counters[2][t];
  counters[3][t] = __rdpmc(bit30|1) - counters[3][t];
  counters[4][t] = __rdpmc(bit30|2) - counters[4][t];
  counters[5][t] = __rdtsc()        - counters[5][t];
 #endif
 }
 void PmuStat::accum(int nthreads)
 {
 #ifdef __x86_64__
  tend = __rdtsc();
  xmemctrs(&mrend, &mwend);
  pmu_stop();
  for (int t = 0; t < nthreads; ++t) {
    pmc0 += counters[0][t];
    pmc1 += counters[1][t];
    inst += counters[2][t];
    cyc += counters[3][t];
    ref += counters[4][t];
    tcycles += counters[5][t];
  }
  uint64_t region = tend - tstart;
  tregion += region;
  uint64_t mreads = mrend - mrstart;
  reads += mreads;
  uint64_t mwrites = mwend - mwstart;
  writes += mwrites;
 #endif
 }
 // Hook points called around measured regions; all three are intentionally
 // empty in this implementation.
 void PmuStat::pmu_fini(void)
 {
 }
 void PmuStat::pmu_start(void)
 {
 }
 void PmuStat::pmu_stop(void)
 {
 }
 // One-off PMU initialisation, invoked from init() the first time any region
 // is initialised.  Only does work when _KNIGHTS_LANDING_ is defined, in
 // which case the uncore memory counters are opened by KNLsetup().
 void PmuStat::pmu_init(void)
 {
 #ifdef _KNIGHTS_LANDING_
  KNLsetup();
 #endif
 }
 // Samples the memory traffic counters.  With _KNIGHTS_LANDING_ defined, *mr
 // and *mw receive the sums of the EDC read and write counters over all NEDC
 // channels (the MC counters are read but not included); otherwise both are
 // set to zero.
 void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
 {
 #ifdef _KNIGHTS_LANDING_
  ctrs snapshot;
  KNLreadctrs(snapshot);

  uint64_t read_total  = 0;
  uint64_t write_total = 0;
  for (int ch = 0; ch < NEDC; ++ch) {
    read_total  += snapshot.edcrd[ch];
    write_total += snapshot.edcwr[ch];
  }

  *mr = read_total;
  *mw = write_total;
 #else
  *mw = 0;
  *mr = 0;
 #endif
 }
 #ifdef _KNIGHTS_LANDING_
 struct knl_gbl_ PmuStat::gbl;
 #define PMU_MEM
 // Opens one uncore perf event.
 //
 // ename : sysfs directory of the PMU (its "type" file supplies the perf type)
 // fd    : receives the descriptor of the opened event
 // event : event number, placed in the low byte of the config word
 // umask : unit mask, placed in bits 8 and up of the config word
 //
 // The event is opened on CPU 0 for all processes (pid -1), ungrouped.
 // Failure to read the PMU type or to open the event prints a message and
 // terminates the program with a failure status.
 void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
 {
  char fname[1024];
  snprintf(fname, sizeof(fname), "%s/type", ename);
  FILE *fp = fopen(fname, "r");
  if (fp == 0) {
    ::printf("open %s", fname);
    ::exit(EXIT_FAILURE);  // was exit(0): an error must not report success
  }
  int type;
  int ret = fscanf(fp, "%d", &type);
  assert(ret == 1);
  fclose(fp);

  struct perf_event_attr hw = {};
  hw.size = sizeof(hw);
  hw.type = type;
  // see /sys/devices/uncore_*/format/*
  // All of the events we are interested in are configured the same way, but
  // that isn't always true. Proper code would parse the format files
  hw.config = event | (umask << 8);
  // PERF_FORMAT_GROUP is not used: it only works within a single PMU, so
  // the counters are simply read one at a time.
  int cpu = 0;
  fd = perf_event_open(&hw, -1, cpu, -1, 0);
  if (fd == -1) {
    // hw.config is a 64-bit kernel type; cast it to match the %lx conversion.
    ::printf("CPU %d, box %s, event 0x%lx", cpu, ename, (unsigned long)hw.config);
    ::exit(EXIT_FAILURE);  // was exit(0): an error must not report success
  }
 }
 // Opens every uncore event used by the memory statistics and stores the
 // descriptors in `gbl`: read/write queue inserts for each of the NMC memory
 // controllers and each of the NEDC EDC (eclk) units, plus the four hit/miss
 // events of each EDC (uclk) unit.
 void PmuStat::KNLsetup(void){
   char path[1024];

   // MC RPQ inserts and WPQ inserts (reads & writes)
   for (int unit = 0; unit < NMC; ++unit) {
     ::snprintf(path, sizeof(path), "/sys/devices/uncore_imc_%d",unit);
     KNLevsetup(path, gbl.mc_rd[unit], 0x1, 0x1);  // RPQ inserts
     KNLevsetup(path, gbl.mc_wr[unit], 0x2, 0x1);  // WPQ inserts
   }

   // EDC RPQ inserts and WPQ inserts
   for (int unit = 0; unit < NEDC; ++unit) {
     ::snprintf(path, sizeof(path), "/sys/devices/uncore_edc_eclk_%d",unit);
     KNLevsetup(path, gbl.edc_rd[unit], 0x1, 0x1);  // RPQ inserts
     KNLevsetup(path, gbl.edc_wr[unit], 0x2, 0x1);  // WPQ inserts
   }

   // EDC HitE, HitM, MissE, MissM: same event, one unit-mask bit each
   for (int unit = 0; unit < NEDC; ++unit) {
     ::snprintf(path, sizeof(path), "/sys/devices/uncore_edc_uclk_%d", unit);
     KNLevsetup(path, gbl.edc_hite[unit],  0x2, 0x1);
     KNLevsetup(path, gbl.edc_hitm[unit],  0x2, 0x2);
     KNLevsetup(path, gbl.edc_misse[unit], 0x2, 0x4);
     KNLevsetup(path, gbl.edc_missm[unit], 0x2, 0x8);
   }
 }
 // Reads the current 64-bit value of the perf event behind `fd`.
 // A short or failed read prints the byte count returned by read() and
 // terminates the program with a failure status.
 uint64_t PmuStat::KNLreadctr(int fd)
 {
  uint64_t data;
  size_t s = ::read(fd, &data, sizeof(data));
  if (s != sizeof(uint64_t)){
    ::printf("read counter %lu", s);
    ::exit(EXIT_FAILURE);  // was exit(0): an error must not report success
  }
  return data;
 }
 // Fills `c` with the current value of every uncore counter opened by
 // KNLsetup(), reading them in the order: MC read/write pairs, EDC read/write
 // pairs, then the four EDC hit/miss counters per unit.
 void PmuStat::KNLreadctrs(ctrs &c)
 {
  // memory controllers
  for (int unit = 0; unit < NMC; ++unit) {
    c.mcrd[unit] = KNLreadctr(gbl.mc_rd[unit]);
    c.mcwr[unit] = KNLreadctr(gbl.mc_wr[unit]);
  }
  // EDC read/write queue inserts
  for (int unit = 0; unit < NEDC; ++unit) {
    c.edcrd[unit] = KNLreadctr(gbl.edc_rd[unit]);
    c.edcwr[unit] = KNLreadctr(gbl.edc_wr[unit]);
  }
  // EDC hit/miss events
  for (int unit = 0; unit < NEDC; ++unit) {
    c.edchite[unit]  = KNLreadctr(gbl.edc_hite[unit]);
    c.edchitm[unit]  = KNLreadctr(gbl.edc_hitm[unit]);
    c.edcmisse[unit] = KNLreadctr(gbl.edc_misse[unit]);
    c.edcmissm[unit] = KNLreadctr(gbl.edc_missm[unit]);
  }
 }
 #endif
 }
@@ -0,0 +1,104 @@
 #ifndef _GRID_STAT_H
 #define _GRID_STAT_H
 #ifdef AVX512
 #define _KNIGHTS_LANDING_ROOTONLY
 #endif
 namespace Grid { 
 ///////////////////////////////////////////////////////////////////////////////
 // Extra KNL counters from MCDRAM
 ///////////////////////////////////////////////////////////////////////////////
 #ifdef _KNIGHTS_LANDING_
 #define NMC 6
 #define NEDC 8
 // Snapshot of the uncore counter values, one array element per hardware
 // unit; filled by PmuStat::KNLreadctrs from the descriptors in knl_gbl_.
 struct ctrs
 {
    uint64_t mcrd[NMC];       // memory controller read counters
    uint64_t mcwr[NMC];       // memory controller write counters
    uint64_t edcrd[NEDC];     // EDC read counters
    uint64_t edcwr[NEDC];     // EDC write counters
    uint64_t edchite[NEDC];   // EDC HitE counters
    uint64_t edchitm[NEDC];   // EDC HitM counters
    uint64_t edcmisse[NEDC];  // EDC MissE counters
    uint64_t edcmissm[NEDC];  // EDC MissM counters
 };
 // Peter/Azusa:
 // Our modification of a code provided by Larry Meadows from Intel
 // Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
 // so is already public and in the linux kernel for KNL.
 // File descriptors of the opened uncore perf events, one per hardware unit;
 // the members mirror the counter arrays of `ctrs`.  Filled by
 // PmuStat::KNLsetup and read by PmuStat::KNLreadctrs.
 struct knl_gbl_
 {
  int mc_rd[NMC];       // memory controller read events
  int mc_wr[NMC];       // memory controller write events
  int edc_rd[NEDC];     // EDC read events
  int edc_wr[NEDC];     // EDC write events
  int edc_hite[NEDC];   // EDC HitE events
  int edc_hitm[NEDC];   // EDC HitM events
  int edc_misse[NEDC];  // EDC MissE events
  int edc_missm[NEDC];  // EDC MissM events
 };
 #endif
 ///////////////////////////////////////////////////////////////////////////////
 // Accumulates hardware counter statistics for one named code region.
 // Usage, as implemented by the member functions: init() once, then for each
 // measured region start(), enter(t)/exit(t) for every thread slot t, and
 // accum(nthreads); print() reports the cumulative totals.
 class PmuStat
 {
    // Per-thread scratch: counters[c][t] holds counter c for thread slot t.
    // enter() stores the start value, exit() replaces it with the delta.
    uint64_t counters[8][256];
 #ifdef _KNIGHTS_LANDING_
    static struct knl_gbl_ gbl;   // descriptors of the uncore events, shared by all regions
 #endif
    const char *name;   // region name given to init(); the pointer is stored, not copied
    uint64_t reads;     // memory reads
    uint64_t writes;    // memory writes
    uint64_t mrstart;   // memory read counter at start of parallel region
    uint64_t mrend;     // memory read counter at end of parallel region
    uint64_t mwstart;   // memory write counter at start of parallel region
    uint64_t mwend;     // memory write counter at end of parallel region
    // cumulative counters
    uint64_t count;     // number of invocations
    uint64_t tregion;   // total time in parallel region (from thread 0)
    uint64_t tcycles;   // total cycles inside parallel region
    uint64_t inst, ref, cyc;   // fixed counters
    uint64_t pmc0, pmc1;// pmu
    // add memory counters here
    // temp variables
    uint64_t tstart;    // tsc at start of parallel region
    uint64_t tend;      // tsc at end of parallel region
    // map for the first index of counters[][], as used by enter/exit/accum
    // 0 pmc0
    // 1 pmc1
    // 2 inst (fixed counter 0)
    // 3 cyc  (fixed counter 1)
    // 4 ref  (fixed counter 2)
    // 5 tsc, accumulated into tcycles
    // rows 6 and 7 are unused
    static bool pmu_initialized;   // set by the first init() call
 public:
    static bool is_init(void){ return pmu_initialized;}
    static void pmu_init(void);    // one-off setup; opens the KNL uncore events when enabled
    static void pmu_fini(void);    // empty hook
    static void pmu_start(void);   // empty hook, called by start()
    static void pmu_stop(void);    // empty hook, called by accum()
    void accum(int nthreads);      // end of region: fold per-thread deltas into the totals
    static void xmemctrs(uint64_t *mr, uint64_t *mw);  // sample memory read/write counters
    void start(void);              // start of region
    void enter(int t);             // record start values for thread slot t
    void exit(int t);              // convert start values of thread slot t into deltas
    void print(void);              // write the cumulative totals to std::cout
    void init(const char *regname);// name the region and clear the totals
    void clear(void);              // zero the cumulative totals
 #ifdef _KNIGHTS_LANDING_
    static void     KNLsetup(void);
    static uint64_t KNLreadctr(int fd);
    static void     KNLreadctrs(ctrs &c);
    static void     KNLevsetup(const char *ename, int &fd, int event, int umask);
 #endif
  };
 }
 #endif
@@ -0,0 +1,111 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Timer.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_TIME_H
 #define GRID_TIME_H
 #include <sys/time.h>
 #include <ctime>
 #include <chrono>
 namespace Grid {
  // Timing helpers; formatted output below is built on std::chrono durations.
 // Open question: would the C++11 time facilities be a better fit than gettimeofday()?
 // Returns a wall-clock timestamp in microseconds as a double.
 // With TIMERS_ON defined the value is taken from gettimeofday(); without it
 // the clock is never sampled and the function returns 0.0.
 inline double usecond(void) {
   // Zero-initialise so a build without TIMERS_ON does not read an
   // indeterminate timeval (previously undefined behaviour / garbage result).
   struct timeval tv = {};
 #ifdef TIMERS_ON
   gettimeofday(&tv,NULL);
 #endif
   return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
 // Clock, time-point and duration aliases shared by the timing utilities.
 using GridClock     = std::chrono::system_clock;
 using GridTimePoint = std::chrono::time_point<GridClock>;
 using GridMillisecs = std::chrono::milliseconds;
 using GridTime      = std::chrono::microseconds;
 using GridUsecs     = std::chrono::microseconds;
 // Streams a millisecond duration as "<count> ms" and returns the stream.
 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 {
   return stream << time.count() << " ms";
 }
 // Streams a microsecond duration as "<count> usec" and returns the stream.
 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
 {
   return stream << time.count() << " usec";
 }
 class GridStopWatch {
 private:
  bool running;
  GridTimePoint start;
  GridUsecs accumulator;
 public:
  GridStopWatch () { 
    Reset();
  }
  void     Start(void) { 
    assert(running == false);
 #ifdef TIMERS_ON
    start = GridClock::now(); 
 #endif
    running = true;
  }
  void     Stop(void)  { 
    assert(running == true);
 #ifdef TIMERS_ON
    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start); 
 #endif
    running = false; 
  };
  void     Reset(void){
    running = false;
 #ifdef TIMERS_ON
    start = GridClock::now();
 #endif
    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
  }
  GridTime Elapsed(void) {
    assert(running == false);
    return std::chrono::duration_cast<GridTime>( accumulator );
  }
  uint64_t useconds(void){
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
  bool isRunning(void){
    return running;
  }
 };
 }
 #endif
@@ -0,0 +1,74 @@
 /**
 * pugixml parser - version 1.9
 * --------------------------------------------------------
 * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at http://pugixml.org/
 *
 * This library is distributed under the MIT License. See notice at the end
 * of this file.
 *
 * This work is based on the pugxml parser, which is:
 * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
 */
 #ifndef HEADER_PUGICONFIG_HPP
 #define HEADER_PUGICONFIG_HPP
 // Uncomment this to enable wchar_t mode
 // #define PUGIXML_WCHAR_MODE
 // Uncomment this to enable compact mode
 // #define PUGIXML_COMPACT
 // Uncomment this to disable XPath
 // #define PUGIXML_NO_XPATH
 // Uncomment this to disable STL
 // #define PUGIXML_NO_STL
 // Uncomment this to disable exceptions
 // #define PUGIXML_NO_EXCEPTIONS
 // Set this to control attributes for public classes/functions, i.e.:
 // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
 // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
 // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
 // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
 // Tune these constants to adjust memory-related behavior
 // #define PUGIXML_MEMORY_PAGE_SIZE 32768
 // #define PUGIXML_MEMORY_OUTPUT_STACK 10240
 // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
 // Uncomment this to switch to header-only version
 // #define PUGIXML_HEADER_ONLY
 // Uncomment this to enable long long support
 // #define PUGIXML_HAS_LONG_LONG
 #endif
 /**
 * Copyright (c) 2006-2018 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
--- a/Show More
+++ b/Show More