Hadrons: support for twisted boundary conditions

Optional twisted BCs added in "DoubleStore" for WilsonImpl.
Untested, but does not affect answers when all twists are zero. Zero twist is the default behaviour for ImplParams.
2026-07-21 19:13:28 +01:00 · 2018-11-12 17:16:18 +00:00 · 2018-11-08 12:55:25 +00:00 · 2018-11-07 23:40:15 +00:00 · 2018-11-07 23:39:18 +00:00 · 2018-11-07 23:38:46 +00:00
938 changed files with 170790 additions and 37880 deletions
@@ -5,11 +5,11 @@
 *.o
 *.obj

-
 # Editor files #
 ################
 *~
 *#
+*.sublime-*

 # Precompiled Headers #
 #######################
@@ -48,6 +48,9 @@ Config.h.in
 config.log
 config.status
 .deps
+Make.inc
+eigen.inc
+Eigen.inc

 # http://www.gnu.org/software/autoconf #
 ########################################
@@ -62,19 +65,8 @@ stamp-h1
 config.sub
 config.guess
 INSTALL
-
-# Packages #
-############
-# it's better to unpack these files and commit the raw source
-# git has its own built in compression methods
-*.7z
-*.dmg
-*.gz
-*.iso
-*.jar
-*.rar
-*.tar
-*.zip
+.dirstamp
+ltmain.sh
 
 # Logs and databases #
 ######################
@@ -91,12 +83,34 @@ INSTALL
 .Trashes
 ehthumbs.db
 Thumbs.db
+.dirstamp

 # build directory #
 ###################
-build/*
+build*/*

 # IDE related files #
 #####################
 *.xcodeproj/*
 build.sh
+.vscode
+*.code-workspace
+
+# Eigen source #
+################
+Grid/Eigen
+Eigen/*
+
+# libtool macros #
+##################
+m4/lt*
+m4/libtool.m4
+
+# github pages #
+################
+gh-pages/
+
+# generated sources #
+#####################
+Grid/qcd/spin/gamma-gen/*.h
+Grid/qcd/spin/gamma-gen/*.cc
@@ -0,0 +1,61 @@
+language: cpp
+
+cache:
+  directories:
+    - clang
+
+matrix:
+  include:
+    - os:        osx
+      osx_image: xcode8.3
+      compiler: clang
+      env: PREC=single
+    - os:        osx
+      osx_image: xcode8.3
+      compiler: clang
+      env: PREC=double
+      
+before_install:
+    - export GRIDDIR=`pwd`
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
+    
+install:
+    - export CWD=`pwd`
+    - echo $CWD
+    - export CC=$CC$VERSION
+    - export CXX=$CXX$VERSION
+    - echo $PATH
+    - which autoconf
+    - autoconf  --version
+    - which automake
+    - automake  --version
+    - which $CC
+    - $CC  --version
+    - which $CXX
+    - $CXX --version
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
+    
+script:
+    - ./bootstrap.sh
+    - mkdir build
+    - cd build
+    - mkdir lime
+    - cd lime
+    - mkdir build
+    - cd build
+    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
+    - tar xf lime-1.3.2.tar.gz
+    - cd lime-1.3.2
+    - ./configure --prefix=$CWD/build/lime/install
+    - make -j4
+    - make install
+    - cd $CWD/build
+    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
+    - make -j4 
+    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
+    - make check
@@ -1,5 +1,4 @@
-Peter Boyle
-Azusa Yamaguchi
-Intel Parallel Computing Centre @ Higgs Centre for Theoretical Physics
-University of Edinburgh
-Scotland, UK
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@MacBook-Pro.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -1,622 +1,281 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
+                   GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991

- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
 your programs, too.

  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.

-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.

  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.

-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.

-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.

-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.

  The precise terms and conditions for copying, distribution and
 modification follow.

-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
 this License.

-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.

-  13. Use with the GNU Affero General Public License.
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.

-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.

-  14. Revised Versions of this License.
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.

-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
 be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.

-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.

-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.

-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
+                            NO WARRANTY

-  15. Disclaimer of Warranty.
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.

-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.

                     END OF TERMS AND CONDITIONS

@@ -628,15 +287,15 @@ free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
+convey the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

-    This program is free software: you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
@@ -644,31 +303,38 @@ the "copyright" line and a pointer to where the full notice is found.
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

 Also add information on how to contact you by electronic and paper mail.

-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:

-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

 The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.

-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.

-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/DisableWarnings.h
+
+Copyright (C) 2016
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef DISABLE_WARNINGS_H
+#define DISABLE_WARNINGS_H
+
+ // Disables an Intel-compiler-specific warning (in json.hpp)
+#pragma warning disable 488  
+
+
+#endif
@@ -0,0 +1,49 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Grid.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+//
+//  Grid.h
+//  simd
+//
+//  Created by Peter Boyle on 09/05/2014.
+//  Copyright (c) 2014 University of Edinburgh. All rights reserved.
+//
+
+#ifndef GRID_H
+#define GRID_H
+
+#include <Grid/GridCore.h>
+#include <Grid/GridQCDcore.h>
+#include <Grid/qcd/action/Action.h>
+#include <Grid/qcd/utils/GaugeFix.h>
+#include <Grid/qcd/smearing/Smearing.h>
+#include <Grid/parallelIO/MetaData.h>
+#include <Grid/qcd/hmc/HMC_aggregate.h>
+
+#endif
@@ -0,0 +1,61 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/GridCore.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+//
+//  Grid.h
+//  simd
+//
+//  Created by Peter Boyle on 09/05/2014.
+//  Copyright (c) 2014 University of Edinburgh. All rights reserved.
+//
+
+#ifndef GRID_BASE_H
+#define GRID_BASE_H
+
+#include <Grid/GridStd.h>
+
+#include <Grid/perfmon/Timer.h>
+#include <Grid/perfmon/PerfCount.h>
+#include <Grid/log/Log.h>
+#include <Grid/allocator/AlignedAllocator.h>
+#include <Grid/simd/Simd.h>
+#include <Grid/serialisation/Serialisation.h>
+#include <Grid/threads/Threads.h>
+#include <Grid/util/Util.h>
+#include <Grid/util/Sha.h>
+#include <Grid/communicator/Communicator.h> 
+#include <Grid/cartesian/Cartesian.h>    
+#include <Grid/tensors/Tensors.h>      
+#include <Grid/lattice/Lattice.h>      
+#include <Grid/cshift/Cshift.h>       
+#include <Grid/stencil/Stencil.h>      
+#include <Grid/parallelIO/BinaryIO.h>
+#include <Grid/algorithms/Algorithms.h>   
+
+#endif
@@ -0,0 +1,42 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/GridQCDcore.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_QCD_CORE_H
+#define GRID_QCD_CORE_H
+
+/////////////////////////
+// Core Grid QCD headers
+/////////////////////////
+#include <Grid/GridCore.h>
+#include <Grid/qcd/QCD.h>
+#include <Grid/qcd/spin/Spin.h>
+#include <Grid/qcd/utils/Utils.h>
+#include <Grid/qcd/representations/Representations.h>
+
+#endif
@@ -0,0 +1,29 @@
+#ifndef GRID_STD_H
+#define GRID_STD_H
+
+///////////////////
+// Std C++ dependencies
+///////////////////
+#include <cassert>
+#include <complex>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <functional>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <ctime>
+#include <sys/time.h>
+#include <chrono>
+#include <zlib.h>
+
+///////////////////
+// Grid config
+///////////////////
+#include "Config.h"
+
+#endif /* GRID_STD_H */
@@ -0,0 +1,14 @@
+#pragma once
+// Force Eigen to use MKL if Grid has been configured with --enable-mkl
+#ifdef USE_MKL
+#define EIGEN_USE_MKL_ALL
+#endif
+
+#if defined __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+#include <Grid/Eigen/Dense>
+#if defined __GNUC__
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,63 @@
+extra_sources=
+extra_headers=
+
+if BUILD_COMMS_MPI3
+  extra_sources+=communicator/Communicator_mpi3.cc
+  extra_sources+=communicator/Communicator_base.cc
+  extra_sources+=communicator/SharedMemoryMPI.cc
+  extra_sources+=communicator/SharedMemory.cc
+endif
+
+if BUILD_COMMS_NONE
+  extra_sources+=communicator/Communicator_none.cc
+  extra_sources+=communicator/Communicator_base.cc
+  extra_sources+=communicator/SharedMemoryNone.cc
+  extra_sources+=communicator/SharedMemory.cc
+endif
+
+if BUILD_HDF5
+  extra_sources+=serialisation/Hdf5IO.cc 
+  extra_headers+=serialisation/Hdf5IO.h
+  extra_headers+=serialisation/Hdf5Type.h
+endif
+
+all: version-cache
+
+# Regenerate the cached "#define GITHASH ..." line on every build (the target
+# is declared .PHONY).  A candidate line is written to vertmp from the HEAD
+# commit hash and ref names, tagged "clean" or "uncommited changes" depending
+# on whether `git status --porcelain` lists anything other than untracked
+# ('??') entries.  version-cache is replaced, and Version.h removed so that it
+# is copied afresh from version-cache, only when version-cache is missing or
+# its content differs from the candidate.  vertmp is always cleaned up.
+version-cache:
+	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
+		a="uncommited changes";\
+	else\
+		a="clean";\
+	fi;\
+	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
+	if [ -e version-cache ]; then\
+		d=`diff vertmp version-cache`;\
+		if [ "$${d}" != "" ]; then\
+			mv vertmp version-cache;\
+			rm -f Version.h;\
+		fi;\
+	else\
+		mv vertmp version-cache;\
+		rm -f Version.h;\
+	fi;\
+	rm -f vertmp
+
+Version.h:
+	cp version-cache Version.h
+
+.PHONY: version-cache
+
+#
+# Libraries
+#
+include Make.inc
+include Eigen.inc
+
+lib_LIBRARIES = libGrid.a
+
+CCFILES += $(extra_sources)
+HFILES  += $(extra_headers) Config.h Version.h
+
+libGrid_a_SOURCES              = $(CCFILES)
+libGrid_adir                   = $(includedir)/Grid
+nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)
@@ -0,0 +1,61 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/Algorithms.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ALGORITHMS_H
+#define GRID_ALGORITHMS_H
+
+#include <Grid/algorithms/SparseMatrix.h>
+#include <Grid/algorithms/LinearOperator.h>
+#include <Grid/algorithms/Preconditioner.h>
+
+#include <Grid/algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/Forecast.h>
+
+#include <Grid/algorithms/iterative/Deflation.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/FFT.h>
+
+// EigCg
+// Pcg
+// Hdcg
+// GCR
+// etc..
+
+#endif
@@ -1,7 +1,36 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/CoarsenedMatrix.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H

-#include <Grid.h>

 namespace Grid {

@@ -74,29 +103,32 @@ namespace Grid {
    GridBase *CoarseGrid;
    GridBase *FineGrid;
    std::vector<Lattice<Fobj> > subspace;
+    int checkerboard;

-    Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) : 
-      CoarseGrid(_CoarseGrid),
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
+    CoarseGrid(_CoarseGrid),
      FineGrid(_FineGrid),
-      subspace(nbasis,_FineGrid)
+      subspace(nbasis,_FineGrid),
+      checkerboard(_checkerboard)
 	{
 	};
  
    void Orthogonalise(void){
      CoarseScalar InnerProd(CoarseGrid); 
+      std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
      blockOrthogonalise(InnerProd,subspace);
+      std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+      blockOrthogonalise(InnerProd,subspace);
+      //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
+      //      CheckOrthogonal();
    } 
    void CheckOrthogonal(void){
      CoarseVector iProj(CoarseGrid); 
      CoarseVector eProj(CoarseGrid); 
-      Lattice<CComplex> pokey(CoarseGrid);
-
-      
      for(int i=0;i<nbasis;i++){
 	blockProject(iProj,subspace[i],subspace);
-
 	eProj=zero; 
-	for(int ss=0;ss<CoarseGrid->oSites();ss++){
+	parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
 	  eProj._odata[ss](i)=CComplex(1.0);
 	}
 	eProj=eProj - iProj;
@@ -108,6 +140,7 @@ namespace Grid {
      blockProject(CoarseVec,FineVec,subspace);
    }
    void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+      FineVec.checkerboard = subspace[0].checkerboard;
      blockPromote(CoarseVec,FineVec,subspace);
    }
    void CreateSubspaceRandom(GridParallelRNG &RNG){
@@ -117,6 +150,57 @@ namespace Grid {
      }
      Orthogonalise();
    }
+
+    /*
+    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
+    {
+      // Run a Lanczos with sloppy convergence
+	const int Nstop = nn;
+	const int Nk = nn+20;
+	const int Np = nn+20;
+	const int Nm = Nk+Np;
+	const int MaxIt= 10000;
+	RealD resid = 1.0e-3;
+
+	Chebyshev<FineField> Cheb(0.5,64.0,21);
+	ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
+	//	IRL.lock = 1;
+
+	FineField noise(FineGrid); gaussian(RNG,noise);
+	FineField tmp(FineGrid); 
+	std::vector<RealD>     eval(Nm);
+	std::vector<FineField> evec(Nm,FineGrid);
+
+	int Nconv;
+	IRL.calc(eval,evec,
+		 noise,
+		 Nconv);
+
+    	// pull back nn vectors
+	for(int b=0;b<nn;b++){
+
+	  subspace[b]   = evec[b];
+
+	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
+
+	  hermop.Op(subspace[b],tmp); 
+	  std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
+
+	  noise = tmp -  sqrt(eval[b])*subspace[b] ;
+
+	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
+
+	  noise = tmp +  eval[b]*subspace[b] ;
+
+	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
+
+	}
+	Orthogonalise();
+	for(int b=0;b<nn;b++){
+	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
+	}
+    }
+    */
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {

      RealD scale;
@@ -170,11 +254,10 @@ namespace Grid {
    ////////////////////
    Geometry         geom;
    GridBase *       _grid; 
-    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil; 
+    CartesianStencil<siteVector,siteVector> Stencil; 

    std::vector<CoarseMatrix> A;

-    std::vector<siteVector,alignedAllocator<siteVector> >   comm_buf;
      
    ///////////////////////
    // Interface
@@ -187,10 +270,9 @@ namespace Grid {
      conformable(in._grid,out._grid);

      SimpleCompressor<siteVector> compressor;
-      Stencil.HaloExchange(in,comm_buf,compressor);
+      Stencil.HaloExchange(in,compressor);

-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<Grid()->oSites();ss++){
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
 	siteVector nbr;
 	int ptype;
@@ -204,7 +286,7 @@ PARALLEL_FOR_LOOP
 	  } else if(SE->_is_local) { 
 	    nbr = in._odata[SE->_offset];
 	  } else {
-	    nbr = comm_buf[SE->_offset];
+	    nbr = Stencil.CommBuf()[SE->_offset];
 	  }
 	  res = res + A[point]._odata[ss]*nbr;
 	}
@@ -228,7 +310,6 @@ PARALLEL_FOR_LOOP
      Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
      A(geom.npoint,&CoarseGrid)
    {
-      comm_buf.resize(Stencil._unified_buffer_size);
    };

    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -303,8 +384,7 @@ PARALLEL_FOR_LOOP
 	  Subspace.ProjectToSubspace(oProj,oblock);
 	  //	  blockProject(iProj,iblock,Subspace.subspace);
 	  //	  blockProject(oProj,oblock,Subspace.subspace);
-PARALLEL_FOR_LOOP
-	  for(int ss=0;ss<Grid()->oSites();ss++){
+	  parallel_for(int ss=0;ss<Grid()->oSites();ss++){
 	    for(int j=0;j<nbasis;j++){
 	      if( disp!= 0 ) {
 		A[p]._odata[ss](j,i) = oProj._odata[ss](j);
@@ -350,7 +430,7 @@ PARALLEL_FOR_LOOP
 	A[p]=zero;
      }

-      GridParallelRNG  RNG(Grid()); RNG.SeedRandomDevice();
+      GridParallelRNG  RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
      Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);

      Complex one(1.0);
@@ -0,0 +1,306 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/FFT.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#ifdef HAVE_FFTW
+#ifdef USE_MKL
+#include <fftw/fftw3.h>
+#else
+#include <fftw3.h>
+#endif
+#endif
+
+
+namespace Grid {
+
+  // Primary template is empty: only the ComplexD and ComplexF specialisations
+  // (compiled when HAVE_FFTW is defined) provide any members.
+  template<class scalar> struct FFTW { };
+
+#ifdef HAVE_FFTW	
+  // ComplexD binding: exposes the precision-neutral FFTW<> interface and
+  // forwards every call unchanged to the global fftw_* functions.
+  template<> struct FFTW<ComplexD> {
+    using FFTW_scalar = fftw_complex;
+    using FFTW_plan   = fftw_plan;
+
+    // Create a plan for `howmany` strided transforms (see ::fftw_plan_many_dft).
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                                        FFTW_scalar *in,  const int *inembed,
+                                        int istride, int idist,
+                                        FFTW_scalar *out, const int *onembed,
+                                        int ostride, int odist,
+                                        int sign, unsigned flags)
+    {
+      return ::fftw_plan_many_dft(rank, n, howmany,
+                                  in,  inembed, istride, idist,
+                                  out, onembed, ostride, odist,
+                                  sign, flags);
+    }
+
+    // Query the operation counts of plan p into *add, *mul and *fmas.
+    static void fftw_flops(const FFTW_plan p, double *add, double *mul, double *fmas)
+    {
+      ::fftw_flops(p, add, mul, fmas);
+    }
+
+    // Execute plan p on the given input/output arrays.
+    static inline void fftw_execute_dft(const FFTW_plan p, FFTW_scalar *in, FFTW_scalar *out)
+    {
+      ::fftw_execute_dft(p, in, out);
+    }
+
+    // Release plan p.
+    static inline void fftw_destroy_plan(const FFTW_plan p)
+    {
+      ::fftw_destroy_plan(p);
+    }
+  };
+
+  // ComplexF binding: same interface as the ComplexD specialisation, but
+  // forwarding to the global fftwf_* functions.
+  template<> struct FFTW<ComplexF> {
+    using FFTW_scalar = fftwf_complex;
+    using FFTW_plan   = fftwf_plan;
+
+    // Create a plan for `howmany` strided transforms (see ::fftwf_plan_many_dft).
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                                        FFTW_scalar *in,  const int *inembed,
+                                        int istride, int idist,
+                                        FFTW_scalar *out, const int *onembed,
+                                        int ostride, int odist,
+                                        int sign, unsigned flags)
+    {
+      return ::fftwf_plan_many_dft(rank, n, howmany,
+                                   in,  inembed, istride, idist,
+                                   out, onembed, ostride, odist,
+                                   sign, flags);
+    }
+
+    // Query the operation counts of plan p into *add, *mul and *fmas.
+    static void fftw_flops(const FFTW_plan p, double *add, double *mul, double *fmas)
+    {
+      ::fftwf_flops(p, add, mul, fmas);
+    }
+
+    // Execute plan p on the given input/output arrays.
+    static inline void fftw_execute_dft(const FFTW_plan p, FFTW_scalar *in, FFTW_scalar *out)
+    {
+      ::fftwf_execute_dft(p, in, out);
+    }
+
+    // Release plan p.
+    static inline void fftw_destroy_plan(const FFTW_plan p)
+    {
+      ::fftwf_destroy_plan(p);
+    }
+  };
+
+#endif
+
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#endif
+
+  // FFT of Lattice fields, performed one dimension at a time.
+  //
+  // FFT_dim gathers the field into a "pencil" lattice (extent along the
+  // transformed dimension enlarged to G*processors[dim]), runs a 1-d FFTW
+  // transform on every line of that pencil, and copies this rank's slab back
+  // into the result.  FFT_dim_mask / FFT_all_dim chain FFT_dim over several
+  // dimensions.  Flops(), MFlops() and USec() report counters accumulated over
+  // all transforms run through the same object.
+  class FFT {
+  private:
+
+    GridCartesian *vgrid;  // grid of the fields being transformed (not owned)
+    GridCartesian *sgrid;  // same dimensions, layout vector of all ones (owned)
+
+    int Nd;
+    double flops;          // accumulated flop estimate
+    double flops_call;     // flop count reported by FFTW for the last plan used
+    uint64_t usec;         // accumulated GridStopWatch::useconds() over plan execution
+
+    std::vector<int> dimensions;
+    std::vector<int> processors;
+    std::vector<int> processor_coor;
+
+  public:
+
+    static const int forward=FFTW_FORWARD;
+    static const int backward=FFTW_BACKWARD;
+
+    double Flops(void) {return flops;}
+    // flops per usec; reports 0 rather than dividing by zero before any transform has run
+    double MFlops(void) {return (usec==0) ? 0.0 : flops/usec;}
+    double USec(void)   {return (double)usec;}
+
+    FFT ( GridCartesian * grid ) :
+    vgrid(grid),
+    sgrid(nullptr),
+    Nd(grid->_ndimension),
+    flops(0),
+    flops_call(0),
+    usec(0),
+    dimensions(grid->_fdimensions),
+    processors(grid->_processors),
+    processor_coor(grid->_processor_coor)
+    {
+      // sgrid is declared before dimensions/processors, so it must be built
+      // here, after those members have been initialised.
+      std::vector<int> layout(Nd,1);
+      sgrid = new GridCartesian(dimensions,layout,processors);
+    };
+
+    // sgrid is owned through a raw pointer: a copy would delete it twice.
+    FFT(const FFT &) = delete;
+    FFT & operator=(const FFT &) = delete;
+
+    ~FFT ( void)  {
+      delete sgrid;
+    }
+
+    // Transform `source` along every dimension d with mask[d] != 0, in
+    // increasing d, writing the final field to `result`.  With no dimension
+    // selected, `result` is a plain copy of `source`.
+    template<class vobj>
+    void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
+
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+      assert(mask.size() >= (std::size_t)Nd); // mask[d] is read for every d < Nd
+      Lattice<vobj> tmp(vgrid);
+      tmp = source;
+      bool transformed = false;
+      for(int d=0;d<Nd;d++){
+        if( mask[d] ) {
+          FFT_dim(result,tmp,d,sign);
+          tmp=result;
+          transformed = true;
+        }
+      }
+      // Nothing selected: still hand back a defined result
+      if ( !transformed ) result = tmp;
+    }
+
+    // Transform along all Nd dimensions.
+    template<class vobj>
+    void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
+      std::vector<int> mask(Nd,1);
+      FFT_dim_mask(result,source,mask,sign);
+    }
+
+
+    // Transform along the single dimension `dim`.  `sign` must be `forward`
+    // or `backward`; the backward transform is scaled by 1/G.  Aborts via
+    // assert(0) when the library was built without HAVE_FFTW.
+    template<class vobj>
+    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
+#ifndef HAVE_FFTW
+      assert(0);
+#else
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+
+      int L = vgrid->_ldimensions[dim];
+      int G = vgrid->_fdimensions[dim];
+
+      std::vector<int> layout(Nd,1);
+      std::vector<int> pencil_gd(vgrid->_fdimensions);
+
+      pencil_gd[dim] = G*processors[dim];
+
+      // Pencil global vol LxLxGxLxL per node
+      GridCartesian pencil_g(pencil_gd,layout,processors);
+
+      // Construct pencils
+      typedef typename vobj::scalar_object sobj;
+      typedef typename sobj::scalar_type   scalar;
+
+      Lattice<sobj> pgbuf(&pencil_g);
+
+      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
+      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
+
+      int Ncomp = sizeof(sobj)/sizeof(scalar);
+      int Nlow  = 1;
+      for(int d=0;d<dim;d++){
+        Nlow*=vgrid->_ldimensions[d];
+      }
+
+      int rank = 1;  /* 1d transforms */
+      int n[] = {G}; /* 1d transforms of length G */
+      int howmany = Ncomp;
+      int odist,idist,istride,ostride;
+      idist   = odist   = 1;          /* Distance between consecutive FT's */
+      istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+      int *inembed = n, *onembed = n;
+
+      scalar div;
+      if      ( sign == backward ) div = 1.0/G;
+      else if ( sign == forward  ) div = 1.0;
+      else assert(0);
+
+      // One in-place plan, created on the start of the pencil buffer and
+      // re-used for every line through the execute_dft interface below.
+      FFTW_plan plan;
+      {
+        FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
+        FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
+        plan = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
+                                                in,inembed,
+                                                istride,idist,
+                                                out,onembed,
+                                                ostride, odist,
+                                                sign,FFTW_ESTIMATE);
+      }
+
+      // Barrel shift and collect global pencil: on pass `hop` the slab held
+      // in `result` is stored at offset ((pc+hop) % processors[dim])*L along
+      // dim, then `result` is shifted by L for the next pass.
+      result = source;
+      int pc = processor_coor[dim];
+      for(int hop=0;hop<processors[dim];hop++) {
+        PARALLEL_REGION
+        {
+          std::vector<int> cbuf(Nd);
+          sobj s;
+
+          PARALLEL_FOR_LOOP_INTERN
+          for(int idx=0;idx<sgrid->lSites();idx++) {
+            sgrid->LocalIndexToLocalCoor(idx,cbuf);
+            peekLocalSite(s,result,cbuf);
+            cbuf[dim]+=((pc+hop) % processors[dim])*L;
+            pokeLocalSite(s,pgbuf,cbuf);
+          }
+        }
+        if (hop != processors[dim] - 1)
+        {
+          result = Cshift(result,dim,L);
+        }
+      }
+
+      // Loop over orthog coords
+      int NN=pencil_g.lSites();
+      GridStopWatch timer;
+      timer.Start();
+      PARALLEL_REGION
+      {
+        std::vector<int> cbuf(Nd);
+
+        PARALLEL_FOR_LOOP_INTERN
+        for(int idx=0;idx<NN;idx++) {
+          pencil_g.LocalIndexToLocalCoor(idx, cbuf);
+          if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+            FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
+            FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
+            FFTW<scalar>::fftw_execute_dft(plan,in,out);
+          }
+        }
+      }
+      timer.Stop();
+
+      // performance counting
+      double add,mul,fma;
+      FFTW<scalar>::fftw_flops(plan,&add,&mul,&fma);
+      flops_call = add+mul+2.0*fma;
+      usec += timer.useconds();
+      flops+= flops_call*NN;
+
+      // writing out result: copy back the slab at offset L*pc of the pencil
+      PARALLEL_REGION
+      {
+        std::vector<int> clbuf(Nd), cgbuf(Nd);
+        sobj s;
+
+        PARALLEL_FOR_LOOP_INTERN
+        for(int idx=0;idx<sgrid->lSites();idx++) {
+          sgrid->LocalIndexToLocalCoor(idx,clbuf);
+          cgbuf = clbuf;
+          cgbuf[dim] = clbuf[dim]+L*pc;
+          peekLocalSite(s,pgbuf,cgbuf);
+          pokeLocalSite(s,result,clbuf);
+        }
+      }
+      result = result*div;
+
+      // destroying plan
+      FFTW<scalar>::fftw_destroy_plan(plan);
+#endif
+    }
+  };
+}
+
+#endif
@@ -1,3 +1,31 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/LinearOperator.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_LINEAR_OP_H
 #define  GRID_ALGORITHM_LINEAR_OP_H

@@ -23,7 +51,7 @@ namespace Grid {

      virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
      virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
-      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
      virtual void HermOp(const Field &in, Field &out)=0;
    };

@@ -134,15 +162,10 @@ namespace Grid {
 	_Mat.M(in,out);
      }
      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-	ComplexD dot;
-
 	_Mat.M(in,out);
 	
-	dot= innerProduct(in,out);
-	n1=real(dot);
-
-	dot = innerProduct(out,out);
-	n2=real(dot);
+	ComplexD dot= innerProduct(in,out); n1=real(dot);
+	n2=norm2(out);
      }
      void HermOp(const Field &in, Field &out){
 	_Mat.M(in,out);
@@ -160,14 +183,16 @@ namespace Grid {
      virtual  RealD Mpc      (const Field &in, Field &out) =0;
      virtual  RealD MpcDag   (const Field &in, Field &out) =0;
      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
-	Field tmp(in._grid);
+      Field tmp(in._grid);
+      tmp.checkerboard = in.checkerboard;
 	ni=Mpc(in,tmp);
 	no=MpcDag(tmp,out);
      }
-      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+      out.checkerboard = in.checkerboard;
 	MpcDagMpc(in,out,n1,n2);
      }
-      void HermOp(const Field &in, Field &out){
+      virtual void HermOp(const Field &in, Field &out){
 	RealD n1,n2;
 	HermOpAndNorm(in,out,n1,n2);
      }
@@ -184,7 +209,6 @@ namespace Grid {
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	assert(0);
      }
-
    };
    template<class Matrix,class Field>
      class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
@@ -193,12 +217,15 @@ namespace Grid {
    public:
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in._grid);
+      Field tmp(in._grid);
+      tmp.checkerboard = !in.checkerboard;
+	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);

+      //std::cout << "cb in " << in.checkerboard << "  cb out " << out.checkerboard << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
      }
@@ -206,7 +233,7 @@ namespace Grid {
 	Field tmp(in._grid);

 	_Mat.MeooeDag(in,tmp);
-	_Mat.MooeeInvDag(tmp,out);
+        _Mat.MooeeInvDag(tmp,out);
 	_Mat.MeooeDag(out,tmp);

 	_Mat.MooeeDag(in,out);
@@ -223,10 +250,10 @@ namespace Grid {
      virtual  RealD Mpc      (const Field &in, Field &out) {
 	Field tmp(in._grid);

-	_Mat.Meooe(in,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-	_Mat.MooeeInv(tmp,out);
+	_Mat.Meooe(in,out);
+	_Mat.MooeeInv(out,tmp);
+	_Mat.Meooe(tmp,out);
+	_Mat.MooeeInv(out,tmp);

 	return axpy_norm(out,-1.0,tmp,in);
      }
@@ -241,6 +268,110 @@ namespace Grid {
 	return axpy_norm(out,-1.0,tmp,in);
      }
    };
+    // Schur operator assembled from the wrapped matrix's Meooe / MooeeInv
+    // primitives (and their daggered forms); the matrix is held by reference.
+    template<class Matrix,class Field>
+      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
+    protected:
+      Matrix &_Mat;
+    public:
+      SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
+
+      // Applies MooeeInv, Meooe, MooeeInv, Meooe in that order, then combines
+      // the product with `in` through axpy_norm(out,-1.0,.,in)
+      // (presumably out = in - product, returning a norm of out -- confirm).
+      virtual  RealD Mpc      (const Field &in, Field &out) {
+        Field scratch(in._grid);
+
+        _Mat.MooeeInv(in,out);
+        _Mat.Meooe(out,scratch);
+        _Mat.MooeeInv(scratch,out);
+        _Mat.Meooe(out,scratch);
+
+        const RealD nrm = axpy_norm(out,-1.0,scratch,in);
+        return nrm;
+      }
+
+      // Daggered counterpart: MeooeDag, MooeeInvDag, MeooeDag, MooeeInvDag,
+      // followed by the same axpy_norm combination with `in`.
+      virtual  RealD MpcDag   (const Field &in, Field &out){
+        Field scratch(in._grid);
+
+        _Mat.MeooeDag(in,out);
+        _Mat.MooeeInvDag(out,scratch);
+        _Mat.MeooeDag(scratch,out);
+        _Mat.MooeeInvDag(out,scratch);
+
+        const RealD nrm = axpy_norm(out,-1.0,scratch,in);
+        return nrm;
+      }
+    };
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
+    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
+    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    //  Staggered use
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    // Schur operator for matrices with a trivial even-even block (asserted at
+    // construction).  Mpc applies Meooe twice and combines the product with
+    // `in` via axpby_norm(out,-1.0,mass*mass,.,in) -- presumably
+    // out = mass^2*in - Meooe(Meooe(in)); confirm against axpby_norm.
+    // Time spent in each stage is accumulated with usecond() and printed by
+    // Report().
+    template<class Matrix,class Field>
+      class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
+    protected:
+      Matrix &_Mat;
+      Field tmp;            // workspace on the matrix's red-black grid
+      RealD mass;
+      double tMpc;          // time in Mpc (HermOpAndNorm) / Meooe pair (HermOp)
+      double tIP;           // time in the inner product of HermOpAndNorm
+      double tMeo;          // time in the Meooe pair inside Mpc
+      double taxpby_norm;   // time in axpby / axpby_norm
+      uint64_t ncall;       // number of HermOpAndNorm + HermOp calls
+    public:
+      // Print the accumulated timers averaged over the number of calls.
+      void Report(void)
+      {
+	// Guard the average: with no calls yet the timers are all zero and
+	// dividing by ncall would print NaN.
+	const double calls = (ncall > 0) ? (double)ncall : 1.0;
+	std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/calls<<" usec "<<std::endl;
+	std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /calls<<" usec "<<std::endl;
+	std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/calls<<" usec "<<std::endl;
+	std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/calls<<" usec "<<std::endl;
+      }
+      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
+      { 
+	assert( _Mat.isTrivialEE() );
+	mass = _Mat.Mass();
+	tMpc=0;
+	tIP =0;
+        tMeo=0;
+        taxpby_norm=0;
+	ncall=0;
+      }
+      // out = Mpc(in); n1 = real(<in|out>), n2 = value returned by Mpc.
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+	ncall++;
+	tMpc-=usecond();
+	n2 = Mpc(in,out);
+	tMpc+=usecond();
+	tIP-=usecond();
+	ComplexD dot= innerProduct(in,out);
+	tIP+=usecond();
+	n1 = real(dot);
+      }
+      // Same operator as Mpc but through axpby, i.e. without returning a norm.
+      virtual void HermOp(const Field &in, Field &out){
+	ncall++;
+	tMpc-=usecond();
+	_Mat.Meooe(in,out);
+	_Mat.Meooe(out,tmp);
+	tMpc+=usecond();
+	taxpby_norm-=usecond();
+	axpby(out,-1.0,mass*mass,tmp,in);
+	taxpby_norm+=usecond();
+      }
+      virtual  RealD Mpc      (const Field &in, Field &out) {
+	tMeo-=usecond();
+	_Mat.Meooe(in,out);
+	_Mat.Meooe(out,tmp);
+	tMeo+=usecond();
+	taxpby_norm-=usecond();
+	RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
+	taxpby_norm+=usecond();
+	return nn;
+      }
+      // MpcDag simply forwards to Mpc.
+      virtual  RealD MpcDag   (const Field &in, Field &out){
+	return Mpc(in,out);
+      }
+      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+	assert(0);// Never need with staggered
+      }
+    };
+    template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;


    /////////////////////////////////////////////////////////////
@@ -249,6 +380,12 @@ namespace Grid {
    template<class Field> class OperatorFunction {
    public:
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+      // Batched form: applies the single-field operator() to each pair
+      // in[k], out[k].  The two vectors must have the same length.
+      virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
+	assert(in.size()==out.size());
+	// index with the container's unsigned size type (no signed/unsigned compare)
+	for(std::size_t k=0;k<in.size();k++){
+	  (*this)(Linop,in[k],out[k]);
+	}
+      };
    };

    template<class Field> class LinearFunction {
@@ -256,6 +393,14 @@ namespace Grid {
      virtual void operator() (const Field &in, Field &out) = 0;
    };

+    // LinearFunction that copies its input to its output unchanged.
+    template<class Field>
+    class IdentityLinearFunction : public LinearFunction<Field> {
+    public:
+      void operator() (const Field &in, Field &out)
+      {
+        out = in;
+      }
+    };
+
+
    /////////////////////////////////////////////////////////////
    // Base classes for Multishift solvers for operators
    /////////////////////////////////////////////////////////////
@@ -278,6 +423,64 @@ namespace Grid {
     };
    */

+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Hermitian operator Linear function and operator function
+  ////////////////////////////////////////////////////////////////////////////////////////////
+    // OperatorFunction that applies the operator's HermOp once: out = HermOp(in).
+    template<class Field>
+    class HermOpOperatorFunction : public OperatorFunction<Field> {
+    public:
+      // Public so the functor is callable on the concrete type as well as
+      // through an OperatorFunction reference (class members default to private).
+      void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+	Linop.HermOp(in,out);
+      };
+    };
+
+    // LinearFunction adaptor: applying it calls HermOp on the referenced operator.
+    template<typename Field>
+    class PlainHermOp : public LinearFunction<Field> {
+    public:
+      LinearOperatorBase<Field> &_Linop;   // held by reference, not copied
+
+      PlainHermOp(LinearOperatorBase<Field>& linop)
+        : _Linop(linop)
+      { }
+
+      void operator()(const Field& in, Field& out)
+      {
+        _Linop.HermOp(in,out);
+      }
+    };
+
+    // LinearFunction adaptor: applying it evaluates the stored OperatorFunction
+    // on the stored linear operator, i.e. out = _poly(_Linop, in).
+    template<typename Field>
+    class FunctionHermOp : public LinearFunction<Field> {
+    public:
+      OperatorFunction<Field>   & _poly;    // held by reference
+      LinearOperatorBase<Field> &_Linop;    // held by reference
+
+      FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
+        : _poly(poly),
+          _Linop(linop)
+      { }
+
+      void operator()(const Field& in, Field& out)
+      {
+        _poly(_Linop,in,out);
+      }
+    };
+
+  // OperatorFunction evaluating a polynomial in the operator's HermOp:
+  //   out = Coeffs[0]*in + Coeffs[1]*HermOp(in) + Coeffs[2]*HermOp(HermOp(in)) + ...
+  template<class Field>
+  class Polynomial : public OperatorFunction<Field> {
+  private:
+    std::vector<RealD> Coeffs;
+  public:
+    // The coefficients are copied; taking them by const reference also
+    // accepts temporaries and const vectors.
+    Polynomial(const std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
+
+    // Implement the required interface
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+      assert(!Coeffs.empty()); // Coeffs[0] is read unconditionally below
+
+      Field AtoN(in._grid);
+      Field Mtmp(in._grid);
+      AtoN = in;
+      out = AtoN*Coeffs[0];
+      for(std::size_t n=1;n<Coeffs.size();n++){
+	Mtmp = AtoN;
+	Linop.HermOp(Mtmp,AtoN);  // AtoN now holds HermOp applied n times to in
+	out=out+AtoN*Coeffs[n];
+      }
+    };
+  };

 }

@@ -0,0 +1,46 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/Preconditioner.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_PRECONDITIONER_H
+#define GRID_PRECONDITIONER_H
+
+namespace Grid {
+
+  // Abstract preconditioner: a LinearFunction whose call operator maps a
+  // source field src to a result field psi.
+  template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+  public: // was implicitly private, which blocked calls made through a Preconditioner reference
+    virtual void operator()(const Field &src, Field & psi)=0;
+  };
+
+  // Identity preconditioner: the result is a plain copy of the source.
+  template<class Field> class TrivialPrecon :  public Preconditioner<Field> {
+  public:
+    TrivialPrecon(void) {}
+
+    // psi <- src
+    void operator()(const Field &src, Field & psi)
+    {
+      psi = src;
+    }
+  };
+
+}
+#endif
@@ -1,7 +1,33 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/SparseMatrix.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H

-#include <Grid.h>

 namespace Grid {

@@ -29,6 +55,14 @@ namespace Grid {
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
    public:
      virtual GridBase *RedBlackGrid(void)=0;
+
+      //////////////////////////////////////////////////////////////////////
+      // Query the even even properties to make algorithmic decisions
+      //
+      // All three are virtual with conservative defaults (0.0 / 0 / 0);
+      // a derived operator may override them to report what it knows.
+      // NOTE(review): the exact meaning of ConstEE / isTrivialEE is not
+      // visible here - presumably properties of the even-even block; confirm
+      // against the overriding classes.
+      //////////////////////////////////////////////////////////////////////
+      virtual RealD  Mass(void)        { return 0.0; };
+      virtual int    ConstEE(void)     { return 0; }; // Disable assumptions unless overridden
+      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+
      // half checkerboard operaions
      virtual  void Meooe    (const Field &in, Field &out)=0;
      virtual  void Mooee    (const Field &in, Field &out)=0;
@@ -1,45 +1,45 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/Chebyshev.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <clehner@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H

-#include<Grid.h>
-#include<algorithms/LinearOperator.h>
+#include <Grid/algorithms/LinearOperator.h>

 namespace Grid {

-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Simple general polynomial with user supplied coefficients
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  template<class Field>
-  class HermOpOperatorFunction : public OperatorFunction<Field> {
-    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
-      Linop.HermOp(in,out);
-    };
-  };
-
-  template<class Field>
-  class Polynomial : public OperatorFunction<Field> {
-  private:
-    std::vector<RealD> Coeffs;
-  public:
-    Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
-
-    // Implement the required interface
-    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
-
-      Field AtoN(in._grid);
-      Field Mtmp(in._grid);
-      AtoN = in;
-      out = AtoN*Coeffs[0];
-      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
-      //      std::cout <<"0 " <<norm2(out)<<std::endl;
-      for(int n=1;n<Coeffs.size();n++){
-	Mtmp = AtoN;
-	Linop.HermOp(Mtmp,AtoN);
-	out=out+AtoN*Coeffs[n];
-	//	std::cout << n<<" " <<norm2(out)<<std::endl;
-      }
-    };
-  };
+// Serializable parameter bundle for a Chebyshev object.
+// The Chebyshev(ChebyParams) constructor forwards the members to
+// Init(_lo,_hi,_order) in the order (alpha, beta, Npoly), so alpha is used
+// as the lower bound, beta as the upper bound and Npoly as the order.
+struct ChebyParams : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
+				  RealD, alpha,  
+				  RealD, beta,   
+				  int, Npoly);
+};

  ////////////////////////////////////////////////////////////////////////////////////////////
  // Generic Chebyshev approximations
@@ -54,7 +54,10 @@ namespace Grid {

  public:
    void csv(std::ostream &out){
-      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+      RealD diff = hi-lo;
+      RealD delta = (hi-lo)*1.0e-9;
+      for (RealD x=lo; x<hi; x+=delta) {
+	delta*=1.1;
 	RealD f = approx(x);
 	out<< x<<" "<<f<<std::endl;
      }
@@ -70,11 +73,26 @@ namespace Grid {
    };

    Chebyshev(){};
+    // Construct from a parameter struct: alpha -> lo, beta -> hi, Npoly -> order.
+    Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
-    
+    // Construct through the three-argument Init(_lo,_hi,_order), i.e. without a function to fit.
+    Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
+
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
    ////////////////////////////////////////////////////////////////////////////////////////////////////
+// CJ: the one we need for Lanczos
+    // Initialise on the interval [_lo,_hi] with `_order` coefficients that are
+    // all zero except the last one, Coeffs[order-1], which is set to 1.
+    // Terminates the program via exit(-1) when _order < 2.
+    void Init(RealD _lo,RealD _hi,int _order)
+    {
+      lo=_lo;
+      hi=_hi;
+      order=_order;
+      
+      if(order < 2) exit(-1);
+      // std::vector-style assign takes (count, value). The previous call
+      // assign(0.,order) requested zero elements, leaving Coeffs empty so the
+      // write below landed outside the vector's size.
+      Coeffs.assign(order,0.);
+      Coeffs[order-1] = 1.;
+    };
+
    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
    {
      lo=_lo;
@@ -150,11 +168,55 @@ namespace Grid {
      return sum;
    };

+    // Evaluates the derivative series used by approxInv's Newton step at x.
+    // x is first mapped from [lo,hi] onto y in [-1,1]; the terms are built from
+    // the recurrence U_{n+1} = 2 y U_n - U_{n-1} with U_0 = 1, U_1 = 2y, and
+    // term n carries the weight Coeffs[n+1]*(n+1). The sum is divided by the
+    // half-width 0.5*(hi-lo) to undo the change of variable.
+    // Reads Coeffs[1] and Coeffs[2] unconditionally.
+    RealD approxD(RealD x)
+    {
+      const RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+
+      RealD Uprev=1;     // U_0
+      RealD Ucur =2*y;   // U_1
+
+      RealD sum = Coeffs[1]*Uprev;
+      sum+= Coeffs[2]*Ucur*2.0;
+
+      for(int i=2;i<order-1;i++){
+        const RealD Unext=2*y*Ucur-Uprev;
+        Uprev=Ucur;
+        Ucur =Unext;
+        sum+= Ucur*Coeffs[i+1]*(i+1.0);
+      }
+      return sum/(0.5*(hi-lo));
+    };
+    
+    // Newton iteration for x such that approx(x) == z.
+    //   z       : target value
+    //   x0      : starting point
+    //   maxiter : maximum number of Newton steps
+    //   resid   : stop once |approx(x)-z| / |z| drops below this
+    // Returns the converged x, or a quiet NaN if maxiter steps were not enough.
+    RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
+      RealD x = x0;
+
+      for (int iter=0; iter<maxiter; ++iter) {
+        const RealD eps = approx(x) - z;
+        if (fabs(eps / z) < resid) {
+          return x;
+        }
+        x = x - eps / approxD(x);
+      }
+
+      return std::numeric_limits<double>::quiet_NaN();
+    }
+    
    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

      GridBase *grid=in._grid;

+      // std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
+      //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
+
      int vol=grid->gSites();

      Field T0(grid); T0 = in;  
@@ -0,0 +1,152 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/approx/Forecast.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef INCLUDED_FORECAST_H
+#define INCLUDED_FORECAST_H
+
+namespace Grid {
+
+  // Abstract base class.
+  // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
+  // and returns a forecasted solution to the system D*psi = phi (psi).
+  template<class Matrix, class Field>
+  class Forecast
+  {
+    public:
+      // Polymorphic base: give it a virtual destructor so that derived
+      // forecasters can be destroyed safely through a Forecast pointer.
+      virtual ~Forecast() {}
+
+      // Mat : matrix defining the system
+      // phi : source
+      // chi : vector of Fields the forecast is built from
+      // Returns the forecasted solution.
+      virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
+  };
+
+  // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
+  // used to forecast solutions across poles of the EOFA heatbath.
+  //
+  // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
+  template<class Matrix, class Field>
+  class ChronoForecast : public Forecast<Matrix,Field>
+  {
+    public:
+      // Forecast a solution from previous solutions.
+      //
+      // The previous solutions are orthonormalised into a basis v_i, and the
+      // small dense system  G a = b  with
+      //     G_ij = <v_i, Mdag M v_j>,   b_i = <v_i, phi>
+      // is solved by Gaussian elimination. The returned guess is
+      //     chi = sum_i a_i v_i
+      // and the relative residual |phi - Mdag M chi| / |phi| is logged.
+      //
+      // Mat        : must provide M(in,out) and Mdag(in,out)
+      // phi        : source
+      // prev_solns : previous solutions; an empty vector yields chi = zero and
+      //              a single entry is returned unchanged
+      Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
+      {
+        int degree = prev_solns.size();
+        Field chi(phi); // forecasted solution
+
+        // Trivial cases
+        if(degree == 0){ chi = zero; return chi; }
+        else if(degree == 1){ return prev_solns[0]; }
+
+        ComplexD xp;
+        Field r(phi); // residual
+        Field Mv(phi);
+        std::vector<Field> v(prev_solns); // orthonormalized previous solutions
+        std::vector<Field> MdagMv(degree,phi);
+
+        // Array to hold the matrix elements
+        std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
+
+        // Solution and source vectors
+        std::vector<ComplexD> a(degree);
+        std::vector<ComplexD> b(degree);
+
+        // Orthonormalize the vector basis (normalise v[i], then remove its
+        // component from every later vector)
+        for(int i=0; i<degree; i++){
+          v[i] *= 1.0/std::sqrt(norm2(v[i]));
+          for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
+        }
+
+        // Perform sparse matrix multiplication and construct rhs
+        for(int i=0; i<degree; i++){
+          b[i] = innerProduct(v[i],phi);
+          Mat.M(v[i],Mv);
+          Mat.Mdag(Mv,MdagMv[i]);
+          G[i][i] = innerProduct(v[i],MdagMv[i]);
+        }
+
+        // Construct the matrix; the lower triangle is filled by conjugation
+        for(int j=0; j<degree; j++){
+        for(int k=j+1; k<degree; k++){
+          G[j][k] = innerProduct(v[j],MdagMv[k]);
+          G[k][j] = std::conj(G[j][k]);
+        }}
+
+        // Gaussian elimination with partial pivoting
+        for(int i=0; i<degree; i++){
+
+          // Perform partial pivoting: choose the row whose entry in the current
+          // column i is largest. (Comparing diagonal entries G[j][j], as was
+          // done before, does not control the element that ends up on the
+          // pivot after the row swap.)
+          int k = i;
+          for(int j=i+1; j<degree; j++){ if(std::abs(G[j][i]) > std::abs(G[k][i])){ k = j; } }
+          if(k != i){
+            xp = b[k];
+            b[k] = b[i];
+            b[i] = xp;
+            for(int j=0; j<degree; j++){
+              xp = G[k][j];
+              G[k][j] = G[i][j];
+              G[i][j] = xp;
+            }
+          }
+
+          // Convert matrix to upper triangular form
+          for(int j=i+1; j<degree; j++){
+            xp = G[j][i]/G[i][i];
+            b[j] -= xp * b[i];
+            for(int c=0; c<degree; c++){ G[j][c] -= xp*G[i][c]; }
+          }
+        }
+
+        // Back substitution: solve for a and accumulate the initial guess and
+        // its residual as each coefficient becomes available
+        chi = zero;
+        r = phi;
+        for(int i=degree-1; i>=0; i--){
+          a[i] = 0.0;
+          for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
+          a[i] = (b[i]-a[i])/G[i][i];
+          chi += a[i]*v[i];
+          r -= a[i]*MdagMv[i];
+        }
+
+        RealD error = std::sqrt(norm2(r)/norm2(phi));
+        std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
+
+        return chi;
+      };
+  };
+
+}
+
+#endif
@@ -0,0 +1,56 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/MultiShiftFunction.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+
+namespace Grid {
+// Evaluate the partial-fraction form at x:
+//   norm + sum_n residues[n] / (x + poles[n])
+double MultiShiftFunction::approx(double x)
+{
+  double result = norm;
+  const int npoles = poles.size();
+  for(int n=0;n<npoles;n++){
+    result += residues[n]/(x+poles[n]);
+  }
+  return result;
+}
+// Write the partial-fraction form as a single gnuplot function definition
+// "f(x) = norm+(r0/(x+p0))+(r1/(x+p1))...;" followed by a newline.
+void MultiShiftFunction::gnuplot(std::ostream &out)
+{
+  out<<"f(x) = "<<norm<<"";
+  const int npoles = poles.size();
+  for(int n=0;n<npoles;n++){
+    out<<"+("<<residues[n]<<"/(x+"<<poles[n]<<"))";
+  }
+  out<<";"<<std::endl;
+}
+// Tabulate the approximation against sqrt(x) on [lo,hi).
+// One line per sample: x, sqrt(x), approx(x), sqrt(x)-approx(x).
+// Samples are spaced geometrically (each x is 1.05 times the previous one).
+void MultiShiftFunction::csv(std::ostream &out)
+{
+  // The geometric step only moves towards hi when x is positive: with
+  // lo == 0 x stays at 0 and with lo < 0 it moves away from hi, so the loop
+  // would never terminate. Emit nothing in that case.
+  if ( !(lo > 0.0) ) return;
+
+  for (double x=lo; x<hi; x*=1.05) {
+    double f = approx(x);
+    double r = sqrt(x);
+    out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
+  }
+  return;
+}
+}
@@ -0,0 +1,67 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/MultiShiftFunction.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef MULTI_SHIFT_FUNCTION
+#define MULTI_SHIFT_FUNCTION
+
+namespace Grid {
+
+// Rational function in partial-fraction form,
+//   norm + sum_n residues[n] / (x + poles[n]),
+// together with per-pole tolerances and the interval [lo,hi] it refers to.
+class MultiShiftFunction {
+public:
+  // Scalars carry default initialisers so that an object built with the
+  // deferred-initialisation constructor never exposes indeterminate values.
+  int order = 0;                  // number of poles
+  std::vector<RealD> poles;
+  std::vector<RealD> residues;
+  std::vector<RealD> tolerances;
+  RealD norm = 0.0;               // constant term
+  RealD lo = 0.0, hi = 0.0;       // interval bounds
+
+  // Allocate n poles/residues on [_lo,_hi]; the values (and norm, tolerances)
+  // are left for the caller to fill in.
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): order(n), poles(n), residues(n), lo(_lo), hi(_hi) {;};
+  RealD approx(RealD x);
+  void csv(std::ostream &out);
+  void gnuplot(std::ostream &out);
+
+  // Fill this object from a Remez approximation.
+  //   remez   : supplies degree, bounds and the partial-fraction expansion
+  //   tol     : tolerance assigned to every pole
+  //   inverse : selects getIPFE instead of getPFE
+  void Init(AlgRemez & remez,double tol,bool inverse) 
+  {
+    order=remez.getDegree();
+    tolerances.resize(remez.getDegree(),tol);
+    poles.resize(remez.getDegree());
+    residues.resize(remez.getDegree());
+    remez.getBounds(lo,hi);
+    // data() is well defined even when the degree is zero, unlike &v[0]
+    if ( inverse ) remez.getIPFE (residues.data(),poles.data(),&norm);
+    else           remez.getPFE (residues.data(),poles.data(),&norm);
+  }
+  // Allow deferred initialisation
+  MultiShiftFunction(void){};
+  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse)
+  {
+    Init(remez,tol,inverse);
+  }
+
+};
+}
+#endif
@@ -20,7 +20,7 @@
 #include<iomanip>
 #include<cassert>

-#include<algorithms/approx/Remez.h>
+#include<Grid/algorithms/approx/Remez.h>

 // Constructor
 AlgRemez::AlgRemez(double lower, double upper, long precision) 
@@ -16,9 +16,13 @@
 #define INCLUDED_ALG_REMEZ_H

 #include <stddef.h>
+#include <Grid/GridStd.h>

-//#include <algorithms/approx/bigfloat.h>
-#include <algorithms/approx/bigfloat_double.h>
+#ifdef HAVE_LIBGMP
+#include "bigfloat.h"
+#else
+#include "bigfloat_double.h"
+#endif

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
@@ -1,3 +1,30 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/bigfloat_double.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #include <math.h>

 typedef double mfloat; 
@@ -1,3 +1,30 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/AdefGeneric.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
 #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG

@@ -0,0 +1,698 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
+
+Copyright (C) 2017
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
+#define GRID_BLOCK_CONJUGATE_GRADIENT_H
+
+
+namespace Grid {
+
+// Selects the solver variant run by BlockConjugateGradient::operator():
+//   BlockCGrQ    -> BlockCGrQsolve     (single-Field interface)
+//   CGmultiRHS   -> CGmultiRHSsolve    (single-Field interface)
+//   BlockCGrQVec -> BlockCGrQsolveVec  (std::vector<Field> interface)
+// Any other combination of value and interface, including BlockCG and
+// BlockCGVec, ends in assert(0) in those dispatchers.
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
+
+//////////////////////////////////////////////////////////////////////////
+// Block conjugate gradient. Dimension zero should be the block direction
+//////////////////////////////////////////////////////////////////////////
+template <class Field>
+class BlockConjugateGradient : public OperatorFunction<Field> {
+ public:
+
+  typedef typename Field::scalar_type scomplex;
+
+  int blockDim ;
+  int Nblock;
+
+  BlockCGtype CGtype;
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+                           // Defaults true.
+  RealD Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer PrintInterval; //GridLogMessages or Iterative
+  
+  // cgtype         : which solver variant operator() dispatches to
+  // _Orthog        : lattice dimension used as the block direction
+  // tol            : convergence tolerance
+  // maxit          : maximum number of iterations
+  // err_on_no_conv : assert when the solver fails to converge (default true)
+  //
+  // Initialisers are listed in member declaration order (the order in which
+  // they actually run); Nblock and IterationsToComplete start at zero
+  // rather than being left indeterminate until the first solve.
+  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
+    : blockDim(_Orthog), Nblock(0), CGtype(cgtype), ErrorOnNoConverge(err_on_no_conv),
+      Tolerance(tol), MaxIterations(maxit), IterationsToComplete(0), PrintInterval(100)
+  {};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation (google it)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  //Dimensions
+  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
+  //
+  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
+  //
+  //   Q  C = R => Q = R C^{-1}
+  //
+  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
+  //
+  // Set C = L^{dag}, and then Q^dag Q = ident 
+  //
+  // Checks:
+  // Cdag C = Rdag R ; passes.
+  // QdagQ  = 1      ; passes
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 Field & Q,
+		 const Field & R)
+{
+  // Slice-wise inner product matrix R^dag R along the block dimension
+  int sliceDim = blockDim;
+  sliceInnerProductMatrix(m_rr,R,R,sliceDim);
+
+  // Make m_rr manifestly Hermitian so rounding noise cannot upset the
+  // Cholesky factorisation below
+  m_rr = 0.5*(m_rr+m_rr.adjoint());
+
+  // m_rr = L L^dag ; choose C = L^dag and keep its inverse
+  Eigen::MatrixXcd cholL = m_rr.llt().matrixL(); 
+  C    = cholL.adjoint();
+  Cinv = C.inverse();
+
+  // Q = R C^{-1}, i.e. Q_j = R_i Cinv(i,j); the slice routine already
+  // applies the matrix from the right
+  sliceMulMatrix(Q,Cinv,R,sliceDim);
+}
+// see comments above
+// Same factorisation as the single-Field overload, with the block of
+// vectors held as std::vector<Field>.
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+		 Eigen::MatrixXcd &C,
+		 Eigen::MatrixXcd &Cinv,
+		 std::vector<Field> & Q,
+		 const std::vector<Field> & R)
+{
+  // Inner product matrix R^dag R, symmetrised against rounding
+  InnerProductMatrix(m_rr,R,R);
+  m_rr = 0.5*(m_rr+m_rr.adjoint());
+
+  // Cholesky factor L of m_rr gives C = L^dag
+  Eigen::MatrixXcd cholL = m_rr.llt().matrixL(); 
+  C    = cholL.adjoint();
+  Cinv = C.inverse();
+
+  // Q = R C^{-1}
+  MulMatrix(Q,Cinv,R);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Call one of several implementations
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Single-Field entry point: runs the solver selected by CGtype.
+// Only BlockCGrQ and CGmultiRHS are handled; anything else asserts.
+void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{
+  switch (CGtype) {
+  case BlockCGrQ:
+    BlockCGrQsolve(Linop,Src,Psi);
+    break;
+  case CGmultiRHS:
+    CGmultiRHSsolve(Linop,Src,Psi);
+    break;
+  default:
+    assert(0);
+    break;
+  }
+}
+// std::vector<Field> entry point: only BlockCGrQVec is handled; any other
+// CGtype asserts.
+virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) 
+{
+  const bool supported = ( CGtype == BlockCGrQVec );
+  if ( !supported ) {
+    assert(0);
+    return;
+  }
+  BlockCGrQsolveVec(Linop,Src,Psi);
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQ implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+// Linop : operator A, applied through HermOp
+// B     : right-hand sides, one per slice along blockDim
+// X     : initial guess on entry, solution on exit
+// Sets Nblock and IterationsToComplete. On non-convergence it asserts when
+// ErrorOnNoConverge is set.
+void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  // The number of blocks is the full lattice extent along the block dimension.
+  // (A leftover debugging override that forced Nblock=8 regardless of the
+  // grid has been removed.)
+  Nblock = B._grid->_fdimensions[Orthog];
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  X.checkerboard = B.checkerboard;
+  conformable(X, B);
+
+  Field tmp(B);
+  Field Q(B);
+  Field D(B);
+  Field Z(B);
+  Field AD(B);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  // Per-block source norms; used to normalise the residuals below
+  sliceNorm(ssq,B,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // Sanity checks: neither the source nor the guess may contain NaNs
+  sliceNorm(residuals,B,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  sliceNorm(residuals,X,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  Linop.HermOp(X, AD);
+  tmp = B - AD;  
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  D=Q;
+
+  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    Linop.HermOp(D, Z);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     * The diagonal of C^dag C is used as the per-block squared residual and
+     * compared, relative to the squared source norm, against Tolerance^2.
+     */
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+              <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+        std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+                  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute the residual explicitly from the final X for reporting
+      Linop.HermOp(X, AD);
+      AD = AD-B;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+            
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+//////////////////////////////////////////////////////////////////////////
+// multiRHS conjugate gradient. Dimension zero should be the block direction
+// Use this for spread out across nodes
+//////////////////////////////////////////////////////////////////////////
+// Multi-RHS conjugate gradient operating on a single Field.
+//   Linop : operator; only Linop.HermOp(in,out) is called here.
+//   Src   : right-hand side(s); Nblock is read from Src._grid->_fdimensions[blockDim].
+//   Psi   : initial guess on entry, updated in place.
+// alpha and beta are kept separately for every block (v_alpha[b], v_beta[b]).
+// Returns as soon as max_b v_rr[b]/ssq[b] < Tolerance^2, with IterationsToComplete = k.
+// If the loop runs out, non-convergence is logged, assert(0) fires when
+// ErrorOnNoConverge is set, and IterationsToComplete keeps the loop-exit value of k.
+// NOTE(review): sliceNorm / sliceInnerProductVector / sliceMaddVector are defined
+// elsewhere; they appear to act slice-by-slice along Orthog -- confirm.
+void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{
+  int Orthog = blockDim; // First dimension is block dim
+  Nblock = Src._grid->_fdimensions[Orthog];
+
+  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  Psi.checkerboard = Src.checkerboard;
+  conformable(Psi, Src);
+
+  // Work fields: search direction, operator applied to it, and residual.
+  Field P(Src);
+  Field AP(Src);
+  Field R(Src);
+  
+  // Per-block scalars of the CG recurrence.
+  std::vector<ComplexD> v_pAp(Nblock);
+  std::vector<RealD> v_rr (Nblock);
+  std::vector<RealD> v_rr_inv(Nblock);
+  std::vector<RealD> v_alpha(Nblock);
+  std::vector<RealD> v_beta(Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  // ssq[b] is the per-block source norm used to normalise the residual; sssum is its total.
+  sliceNorm(ssq,Src,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // Sanity checks only: neither the source nor the guess may contain NaN.
+  sliceNorm(residuals,Src,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  sliceNorm(residuals,Psi,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  // Initial search dir is guess
+  Linop.HermOp(Psi, AP);
+
+  R = Src - AP;  
+  P = R;
+  sliceNorm(v_rr,R,Orthog);
+
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch sliceNormTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+
+  SolverTimer.Start();
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    // The residual logged here is the one left by the previous iteration.
+    RealD rrsum=0;
+    for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
+
+    MatrixTimer.Start();
+    Linop.HermOp(P, AP);
+    MatrixTimer.Stop();
+
+    // Alpha
+    sliceInnerTimer.Start();
+    sliceInnerProductVector(v_pAp,P,AP,Orthog);
+    sliceInnerTimer.Stop();
+    // Only the real part of <P,AP> enters alpha.
+    for(int b=0;b<Nblock;b++){
+      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
+    }
+
+    // Psi, R update
+    sliceMaddTimer.Start();
+    sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
+    sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
+    sliceMaddTimer.Stop();
+
+    // Beta
+    // Save 1/|r_old|^2 before v_rr is overwritten with the new residual norm.
+    for(int b=0;b<Nblock;b++){
+      v_rr_inv[b] = 1.0/v_rr[b];
+    }
+    sliceNormTimer.Start();
+    sliceNorm(v_rr,R,Orthog);
+    sliceNormTimer.Stop();
+    for(int b=0;b<Nblock;b++){
+      v_beta[b] = v_rr_inv[b] *v_rr[b];
+    }
+
+    // Search update
+    sliceMaddTimer.Start();
+    sliceMaddVector(P,v_beta,P,R,Orthog);
+    sliceMaddTimer.Stop();
+
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    // Worst block decides: max over b of |r_b|^2 / |src_b|^2.
+    RealD max_resid=0;
+    for(int b=0;b<Nblock;b++){
+      RealD rr = v_rr[b]/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+    
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
+      }
+      std::cout <<GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute A*Psi - Src explicitly to report the true residual (AP is reused as scratch).
+      Linop.HermOp(Psi, AP);
+      AP = AP-Src;
+      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  // k has run past MaxIterations at this point.
+  IterationsToComplete = k;
+}
+
+// Fill m with the Nblock x Nblock matrix of inner products,
+// m(row,col) = innerProduct(X[row], Y[col]).
+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  for(int row=0; row<Nblock; row++){
+    const Field &lhs = X[row];
+    for(int col=0; col<Nblock; col++){
+      m(row,col) = innerProduct(lhs,Y[col]);
+    }
+  }
+}
+// AP[b] = Y[b] + scale * sum_bp m(bp,b) * X[bp]   for b in [0,Nblock).
+// The result is built in a scratch vector first, so AP may be the same
+// object as X or Y.
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  // Should make this cache friendly with site outermost, parallel_for
+  std::vector<Field> scratch(Nblock,X[0]);
+  for(int col=0; col<Nblock; col++){
+    scratch[col] = Y[col];
+    for(int row=0; row<Nblock; row++){
+      scratch[col] = scratch[col] + (scale*m(row,col))*X[row];
+    }
+  }
+  // Copy out only once every block has been computed.
+  for(int col=0; col<Nblock; col++){
+    AP[col] = scratch[col];
+  }
+}
+// AP[b] = sum_bp m(bp,b) * X[bp]   for b in [0,Nblock).
+// AP is written directly, with no scratch copy.
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  // Should make this cache friendly with site outermost, parallel_for
+  for(int col=0; col<Nblock; col++){
+    AP[col] = zero;
+    for(int row=0; row<Nblock; row++){
+      AP[col] += (m(row,col))*X[row];
+    }
+  }
+}
+// Sum of norm2(P[b]) over the first Nblock entries of P.
+double normv(const std::vector<Field> &P){
+  double total = 0.0;
+  for(int blk=0; blk<Nblock; blk++){
+    total += norm2(P[blk]);
+  }
+  return total;
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQvec implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+// Block CG (rQ variant) where the block is a std::vector of Fields, not a
+// lattice dimension.
+//   Linop : operator; only Linop.HermOp(in,out) is called, once per block entry.
+//   B     : right-hand sides; Nblock is set to B.size().
+//   X     : initial guesses on entry, updated in place; must have the same size as B.
+// Returns once max_b real(m_rr(b,b))/ssq[b] < Tolerance^2, with IterationsToComplete = k.
+// If the loop runs out, non-convergence is logged, assert(0) fires when
+// ErrorOnNoConverge is set, and IterationsToComplete keeps the loop-exit value of k.
+// NOTE(review): ThinQRfact is defined elsewhere; the step comments below describe its
+// intended role (thin QR of the residual block) -- confirm against its definition.
+void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
+{
+  Nblock = B.size();
+  assert(Nblock == X.size());
+
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
+
+  for(int b=0;b<Nblock;b++){ 
+    X[b].checkerboard = B[b].checkerboard;
+    conformable(X[b], B[b]);
+    conformable(X[b], X[0]); 
+  }
+
+  // Fake is used only as the prototype from which the work vectors are copy-constructed.
+  Field Fake(B[0]);
+
+  std::vector<Field> tmp(Nblock,Fake);
+  std::vector<Field>   Q(Nblock,Fake);
+  std::vector<Field>   D(Nblock,Fake);
+  std::vector<Field>   Z(Nblock,Fake);
+  std::vector<Field>  AD(Nblock,Fake);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  // NOTE(review): m_tmp1 is not referenced again in this function.
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  // ssq[b] = norm2(B[b]) normalises the per-block residual; sssum is the total.
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // Sanity checks only: neither the sources nor the guesses may contain NaN.
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  for(int b=0;b<Nblock;b++) {
+    Linop.HermOp(X[b], AD[b]);
+    tmp[b] = B[b] - AD[b];  
+  }
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+
+  for(int b=0;b<Nblock;b++) D[b]=Q[b];
+
+  std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  // The "slice" timer names are kept although this variant calls
+  // InnerProductMatrix / MaddMatrix on vectors of Fields.
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    InnerProductMatrix(m_DZ,D,Z);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    MaddMatrix(X,m_tmp, D,X);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    // tmp receives Q - Z M (scale = -1.0); ThinQRfact then overwrites Q and m_S from it.
+    sliceMaddTimer.Start();
+    MaddMatrix(tmp,m_M,Z,Q,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    MaddMatrix(D,m_tmp,D,Q);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    // The diagonal of C^dag C is used as the per-block squared residual norm.
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute A X - B explicitly to report the true residual (AD is reused as scratch).
+      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
+      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+	    
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  // k has run past MaxIterations at this point.
+  IterationsToComplete = k;
+}
+
+
+
+};
+
+}
+#endif
@@ -0,0 +1,177 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/ConjugateGradient.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_H
+#define GRID_CONJUGATE_GRADIENT_H
+
+namespace Grid {
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+// Conjugate gradient solver for a single right-hand side.
+//
+// operator()(Linop, src, psi) applies Linop.HermOp repeatedly, starting from
+// the guess held in psi, and updates psi in place until
+//   |r|^2 <= Tolerance^2 * |src|^2
+// or MaxIterations iterations have been performed.
+template <class Field>
+class ConjugateGradient : public OperatorFunction<Field> {
+ public:
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+                           // Defaults true.
+  RealD Tolerance;         // target relative residual |r|/|src|
+  Integer MaxIterations;   // hard cap on the number of CG iterations
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+
+  // Members are initialised in declaration order. IterationsToComplete starts
+  // at 0 so that it is never read uninitialised.
+  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv),
+        Tolerance(tol),
+        MaxIterations(maxit),
+        IterationsToComplete(0) {}
+
+  // Solve using Linop.HermOp. src is the right-hand side; psi holds the
+  // initial guess on entry and the solution on exit.
+  // On exit IterationsToComplete is 0 if the guess already satisfied the
+  // stopping criterion, the converging iteration number on success, and
+  // MaxIterations+1 if the iteration cap was exhausted.
+  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+
+    psi.checkerboard = src.checkerboard;
+    conformable(psi, src);
+
+    RealD cp, c, a, d, b, ssq, qq;
+
+    Field p(src);
+    Field mmp(src);
+    Field r(src);
+
+    // Initial residual computation & set up
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    Linop.HermOpAndNorm(psi, mmp, d, b);
+
+    r = src - mmp;
+    p = r;
+
+    a = norm2(p);
+    cp = a;
+    ssq = norm2(src);
+
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   mmp " << b << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:  cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:     p " << a << std::endl;
+
+    // Absolute target for |r|^2.
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    // Check if guess is really REALLY good :)
+    if (cp <= rsq) {
+      // No iterations were needed. Record that explicitly, because callers
+      // such as the mixed-precision solver accumulate this counter.
+      IterationsToComplete = 0;
+      return;
+    }
+
+    std::cout << GridLogIterative << std::setprecision(8)
+              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
+
+    GridStopWatch LinalgTimer;
+    GridStopWatch InnerTimer;
+    GridStopWatch AxpyNormTimer;
+    GridStopWatch LinearCombTimer;
+    GridStopWatch MatrixTimer;
+    GridStopWatch SolverTimer;
+
+    SolverTimer.Start();
+    int k;
+    // Honour the configured cap exactly: MaxIterations is the iteration limit.
+    for (k = 1; k <= MaxIterations; k++) {
+      c = cp;
+
+      MatrixTimer.Start();
+      Linop.HermOp(p, mmp);
+      MatrixTimer.Stop();
+
+      LinalgTimer.Start();
+
+      InnerTimer.Start();
+      ComplexD dc  = innerProduct(p,mmp);
+      InnerTimer.Stop();
+      d = dc.real();
+      a = c / d;
+
+      // r <- r - a*mmp ; cp receives the value returned by axpy_norm.
+      AxpyNormTimer.Start();
+      cp = axpy_norm(r, -a, mmp, r);
+      AxpyNormTimer.Stop();
+      b = cp / c;
+
+      // Fused update of the solution and of the search direction.
+      LinearCombTimer.Start();
+      parallel_for(int ss=0;ss<src._grid->oSites();ss++){
+        vstream(psi[ss], a      *  p[ss] + psi[ss]);
+        vstream(p  [ss], b      *  p[ss] + r[ss]);
+      }
+      LinearCombTimer.Stop();
+      LinalgTimer.Stop();
+
+      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
+                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+
+      // Stopping condition
+      if (cp <= rsq) {
+        SolverTimer.Stop();
+        // Recompute A*psi - src explicitly to report the true residual.
+        Linop.HermOpAndNorm(psi, mmp, d, qq);
+        p = mmp - src;
+
+        RealD srcnorm = sqrt(norm2(src));
+        RealD resnorm = sqrt(norm2(p));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
+        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+        std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
+        std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+        std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+
+        // Guard against the recursive residual drifting far from the true one.
+        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+
+        IterationsToComplete = k;
+
+        return;
+      }
+    }
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
+              << std::endl;
+
+    if (ErrorOnNoConverge) assert(0);
+    // k has run past MaxIterations at this point.
+    IterationsToComplete = k;
+
+  }
+};
+}
+#endif
@@ -0,0 +1,154 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+
+namespace Grid {
+
+  //Mixed precision restarted defect correction CG
+  // Restarted mixed-precision CG.  Each outer iteration computes the residual with
+  // Linop_d on FieldD, converts it to FieldF, runs ConjugateGradient<FieldF> with
+  // Linop_f on it from a zero (or guesser-supplied) start, and adds the converted
+  // correction to the FieldD solution.  A final ConjugateGradient<FieldD> solve
+  // with Linop_d is always run at the end.
+  // The enable_if parameters restrict FieldD / FieldF to getPrecision values 2 / 1.
+  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
+  public:                                                
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+
+    // The three counters below are only assigned inside operator().
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+    LinearFunction<FieldF> *guesser;
+    
+    // InnerTolerance starts equal to tol, OuterLoopNormMult at 100 and no guesser is set.
+    // The operators are held by reference, so they must outlive this object.
+    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+      Linop_f(_Linop_f), Linop_d(_Linop_d),
+      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      OuterLoopNormMult(100.), guesser(NULL){ };
+
+    // Register a guess generator for the inner solves; only the address of g is stored.
+    void useGuesser(LinearFunction<FieldF> &g){
+      guesser = &g;
+    }
+  
+    // Solve for sol_d given source src_d_in.  sol_d is read as the starting
+    // solution by the first residual computation and is updated in place.
+    void operator() (const FieldD &src_d_in, FieldD &sol_d){
+      TotalInnerIterations = 0;
+	
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+    
+      int cb = src_d_in.checkerboard;
+      sol_d.checkerboard = cb;
+    
+      // stop is the absolute target for the squared residual norm.
+      RealD src_norm = norm2(src_d_in);
+      RealD stop = src_norm * Tolerance*Tolerance;
+
+      GridBase* DoublePrecGrid = src_d_in._grid;
+      FieldD tmp_d(DoublePrecGrid);
+      tmp_d.checkerboard = cb;
+    
+      FieldD tmp2_d(DoublePrecGrid);
+      tmp2_d.checkerboard = cb;
+    
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+    
+      RealD inner_tol = InnerTolerance;
+    
+      FieldF src_f(SinglePrecGrid);
+      src_f.checkerboard = cb;
+    
+      FieldF sol_f(SinglePrecGrid);
+      sol_f.checkerboard = cb;
+    
+      // The inner solver must not assert when it hits its iteration cap; the outer loop restarts it.
+      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      CG_f.ErrorOnNoConverge = false;
+
+      GridStopWatch InnerCGtimer;
+
+      GridStopWatch PrecChangeTimer;
+    
+      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+      
+      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+	//Compute double precision rsd and also new RHS vector.
+	Linop_d.HermOp(sol_d, tmp_d);
+	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+      
+	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
+
+	// Leave the restart loop once within OuterLoopNormMult of the target; the final solve finishes the job.
+	if(norm < OuterLoopNormMult * stop){
+	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
+	  break;
+	}
+	// Loosen the inner tolerance (by doubling) until norm*inner_tol^2 is no longer below stop.
+	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+
+	PrecChangeTimer.Start();
+	precisionChange(src_f, src_d);
+	PrecChangeTimer.Stop();
+      
+	zeroit(sol_f);
+
+	//Optionally improve inner solver guess (eg using known eigenvectors)
+	if(guesser != NULL)
+	  (*guesser)(src_f, sol_f);
+
+	//Inner CG
+	CG_f.Tolerance = inner_tol;
+	InnerCGtimer.Start();
+	CG_f(Linop_f, src_f, sol_f);
+	InnerCGtimer.Stop();
+	TotalInnerIterations += CG_f.IterationsToComplete;
+      
+	//Convert sol back to double and add to double prec solution
+	PrecChangeTimer.Start();
+	precisionChange(tmp_d, sol_f);
+	PrecChangeTimer.Stop();
+      
+	axpy(sol_d, 1.0, tmp_d, sol_d);
+      }
+    
+      //Final trial CG
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
+    
+      // This solve uses the original source and the accumulated sol_d as its guess;
+      // CG_d keeps the default ErrorOnNoConverge.
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d);
+      TotalFinalStepIterations = CG_d.IterationsToComplete;
+
+      TotalTimer.Stop();
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+    }
+  };
+
+}
+
+#endif
@@ -1,3 +1,31 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
 #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H

@@ -15,6 +43,7 @@ namespace Grid {
 public:                                                
    RealD   Tolerance;
    Integer MaxIterations;
+    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
    int verbose;
    MultiShiftFunction shifts;

@@ -135,7 +164,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  for(int s=0;s<nshift;s++) {
    axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
  }
-  
+ 
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch AXPYTimer;
+  GridStopWatch ShiftTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
  
  // Iteration loop
  int k;
@@ -143,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  for (k=1;k<=MaxIterations;k++){
    
    a = c /cp;
+    AXPYTimer.Start();
    axpy(p,a,p,r);
+    AXPYTimer.Stop();
    
    // Note to self - direction ps is iterated seperately
    // for each shift. Does not appear to have any scope
@@ -152,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    // However SAME r is used. Could load "r" and update
    // ALL ps[s]. 2/3 Bandwidth saving
    // New Kernel: Load r, vector of coeffs, vector of pointers ps
+    AXPYTimer.Start();
    for(int s=0;s<nshift;s++){
      if ( ! converged[s] ) { 
 	if (s==0){
@@ -162,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	}
      }
    }
+    AXPYTimer.Stop();
    
    cp=c;
+    MatrixTimer.Start();  
+    //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
+    // The below is faster on KNL
+    Linop.HermOp(p,mmp); 
+    d=real(innerProduct(p,mmp));
    
-    Linop.HermOpAndNorm(p,mmp,d,qq);
+    MatrixTimer.Stop();  
+
+    AXPYTimer.Start();
    axpy(mmp,mass[0],p,mmp);
+    AXPYTimer.Stop();
    RealD rn = norm2(p);
    d += rn*mass[0];
    
    bp=b;
    b=-cp/d;
    
+    AXPYTimer.Start();
    c=axpy_norm(r,b,mmp,r);
+    AXPYTimer.Stop();

    // Toggle the recurrence history
    bs[0] = b;
    iz = 1-iz;
+    ShiftTimer.Start();
    for(int s=1;s<nshift;s++){
      if((!converged[s])){
 	RealD z0 = z[s][1-iz];
@@ -187,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
      }
    }
+    ShiftTimer.Stop();
    
    for(int s=0;s<nshift;s++){
      int ss = s;
@@ -229,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    
    if ( all_converged ){

+    SolverTimer.Stop();
+
+
      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
      
@@ -241,12 +298,23 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	RealD cn = norm2(src);
 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+
+      IterationsToComplete = k;	
+
      return;
    }
+
+   
  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-  assert(0);
+//  assert(0);
 }

  };
@@ -0,0 +1,256 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
+#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
+
+namespace Grid {
+
+  ///////////////////////////////////////////////////////////////////////////////////////////
+  // Mixed precision conjugate gradient with reliable updates.
+  //
+  // The CG recurrence is iterated on FieldF fields with Linop_f. Whenever the iterated
+  // residual drops below Delta * (largest residual seen since the previous update), the
+  // solution accumulated in FieldF is added into the FieldD solution psi and the residual
+  // is recomputed with Linop_d ("reliable update").
+  //
+  // The template is only enabled when getPrecision<FieldD>::value == 2 and
+  // getPrecision<FieldF>::value == 1 (double and single precision, going by the names).
+  //
+  // IterationsToComplete, ReliableUpdatesPerformed and IterationsToCleanup are reset to
+  // zero at the start of every solve, so they are well defined on every exit path,
+  // including the early return taken when the initial guess already satisfies the target.
+  ///////////////////////////////////////////////////////////////////////////////////////////
+  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
+  public:
+    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+    // Defaults true.
+    RealD Tolerance;         // converged when |r|^2 <= Tolerance^2 * |src|^2
+    Integer MaxIterations;   // upper bound on the number of iterations of the main loop
+    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+    Integer ReliableUpdatesPerformed; //Number of reliable updates performed. Filled in upon completion
+
+    bool DoFinalCleanup; //Final DP cleanup, defaults to true
+    Integer IterationsToCleanup; //Final DP cleanup step iterations; zero when no cleanup was run
+    
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+    GridBase* SinglePrecGrid; // grid on which the FieldF work fields are created
+    RealD Delta; //reliable update parameter
+
+    //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
+    LinearOperatorBase<FieldF> *Linop_fallback;
+    // NOTE: compared directly against the squared residual norm cp, not against cp/|src|^2
+    RealD fallback_transition_tol;
+
+    // Initialisers are listed in member declaration order (the order in which they run).
+    ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv),
+        Tolerance(tol),
+        MaxIterations(maxit),
+        IterationsToComplete(0),
+        ReliableUpdatesPerformed(0),
+        DoFinalCleanup(true),
+        IterationsToCleanup(0),
+        Linop_f(_Linop_f),
+        Linop_d(_Linop_d),
+        SinglePrecGrid(_sp_grid),
+        Delta(_delta),
+        Linop_fallback(NULL),
+        fallback_transition_tol(0.0)
+    {}
+
+    // Register an alternative FieldF operator. The solver switches to it (once per solve)
+    // as soon as the iterated squared residual falls below _fallback_transition_tol.
+    void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
+      Linop_fallback = &_Linop_fallback;
+      fallback_transition_tol = _fallback_transition_tol;      
+    }
+    
+    // Solve using src as the right hand side. On entry psi holds the initial guess,
+    // on exit the solution.
+    void operator()(const FieldD &src, FieldD &psi) {
+      // Reset the per-solve statistics so no exit path leaves them indeterminate or stale
+      IterationsToComplete     = 0;
+      ReliableUpdatesPerformed = 0;
+      IterationsToCleanup      = 0;
+
+      LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
+      bool using_fallback = false;
+      
+      psi.checkerboard = src.checkerboard;
+      conformable(psi, src);
+
+      RealD cp, c, a, d, b, ssq, qq, b_pred;
+
+      FieldD p(src);
+      FieldD mmp(src);
+      FieldD r(src);
+
+      // Initial residual computation & set up
+      RealD guess = norm2(psi);
+      assert(std::isnan(guess) == 0);
+    
+      Linop_d.HermOpAndNorm(psi, mmp, d, b);
+    
+      r = src - mmp;
+      p = r;
+
+      a = norm2(p);
+      cp = a;
+      ssq = norm2(src);
+
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;
+
+      // Target for the squared residual norm
+      RealD rsq = Tolerance * Tolerance * ssq;
+
+      // Check if guess is really REALLY good :)
+      if (cp <= rsq) {
+        std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
+        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+        return;
+      }
+
+      //Single prec initialization
+      FieldF r_f(SinglePrecGrid);
+      r_f.checkerboard = r.checkerboard;
+      precisionChange(r_f, r);
+
+      // psi_f accumulates the correction to psi since the last reliable update
+      FieldF psi_f(r_f);
+      psi_f = zero;
+
+      // Copy-constructed from r_f, so the first search direction equals the residual
+      FieldF p_f(r_f);
+      FieldF mmp_f(r_f);
+
+      RealD MaxResidSinceLastRelUp = cp; //initial residual    
+    
+      std::cout << GridLogIterative << std::setprecision(4)
+                << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
+
+      GridStopWatch LinalgTimer;
+      GridStopWatch MatrixTimer;
+      GridStopWatch SolverTimer;
+
+      SolverTimer.Start();
+      int k = 0;
+      int l = 0; // number of reliable updates performed so far
+    
+      for (k = 1; k <= MaxIterations; k++) {
+        c = cp;
+
+        MatrixTimer.Start();
+        Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
+        MatrixTimer.Stop();
+
+        LinalgTimer.Start();
+
+        a = c / d;
+        b_pred = a * (a * qq - d) / c;
+
+        cp = axpy_norm(r_f, -a, mmp_f, r_f);
+        b = cp / c;
+
+        // Fuse these loops ; should be really easy
+        psi_f = a * p_f + psi_f;
+        //p_f = p_f * b + r_f;
+
+        LinalgTimer.Stop();
+
+        std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
+                  << " residual " << cp << " target " << rsq << std::endl;
+        std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
+        std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
+
+        if(cp > MaxResidSinceLastRelUp){
+          std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
+          MaxResidSinceLastRelUp = cp;
+        }
+          
+        // Stopping condition
+        if (cp <= rsq) {
+          //Although not written in the paper, I assume that I have to add on the final solution
+          precisionChange(mmp, psi_f);
+          psi = psi + mmp;
+        
+        
+          SolverTimer.Stop();
+          // Recompute the residual with Linop_d to report the true residual
+          Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+          p = mmp - src;
+
+          RealD srcnorm = sqrt(norm2(src));
+          RealD resnorm = sqrt(norm2(p));
+          RealD true_residual = resnorm / srcnorm;
+
+          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
+          std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+          std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
+          std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+          std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+          std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+          std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+          std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+
+          IterationsToComplete = k;     
+          ReliableUpdatesPerformed = l;
+          
+          if(DoFinalCleanup){
+            //Do a final CG to cleanup
+            std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
+            ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
+            CG.ErrorOnNoConverge = ErrorOnNoConverge;
+            CG(Linop_d,src,psi);
+            IterationsToCleanup = CG.IterationsToComplete;
+          }
+          else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+
+          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
+          return;
+        }
+        else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
+          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
+                    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
+          // Fold the accumulated correction into psi
+          precisionChange(mmp, psi_f);
+          psi = psi + mmp;
+
+          // Recompute the residual from scratch with Linop_d
+          Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+          r = src - mmp;
+
+          psi_f = zero;
+          precisionChange(r_f, r);
+          cp = norm2(r);
+          MaxResidSinceLastRelUp = cp;
+
+          // b is recomputed from the fresh residual before the search vector update below
+          b = cp/c;
+          
+          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
+          
+          l = l+1;
+        }
+
+        p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
+
+        if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
+          std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
+          Linop_f_use = Linop_fallback;
+          using_fallback = true;
+        }
+
+        
+      }
+      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
+                << std::endl;
+      
+      if (ErrorOnNoConverge) assert(0);
+      IterationsToComplete = k;
+      ReliableUpdatesPerformed = l;      
+    }    
+  };
+
+
+};
+
+
+
+#endif
@@ -1,3 +1,31 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateResidual.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_RESIDUAL_H
 #define GRID_CONJUGATE_RESIDUAL_H

@@ -0,0 +1,104 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+// Initial-guess functor that ignores the source and returns a zero field.
+template<class Field>
+class ZeroGuesser: public LinearFunction<Field> {
+public:
+  virtual void operator()(const Field &src, Field &guess)
+  {
+    guess = zero;
+  }
+};
+
+// Initial-guess functor that returns a copy of the source as the guess.
+template<class Field>
+class SourceGuesser: public LinearFunction<Field> {
+public:
+  virtual void operator()(const Field &src, Field &guess)
+  {
+    guess = src;
+  }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+// Builds a guess from a set of (evec, eval) pairs:
+//   guess = sum_i evec[i] * <evec[i]|src> / eval[i]
+// Only references to the two vectors are stored, so the caller must keep them alive
+// for as long as this object is used.
+template<class Field>
+class DeflatedGuesser: public LinearFunction<Field> {
+private:
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+
+public:
+
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
+    : evec(_evec),
+      eval(_eval)
+  {
+  }
+
+  virtual void operator()(const Field &src,Field &guess)
+  {
+    guess = zero;
+    assert(evec.size()==eval.size());
+
+    const auto nvec = evec.size();
+    for (decltype(evec.size()) n=0; n<nvec; ++n) {
+      const Field& mode = evec[n];
+      // Accumulate the contribution of this mode into the guess
+      axpy(guess,TensorRemove(innerProduct(mode,src)) / eval[n],mode,guess);
+    }
+
+    guess.checkerboard = src.checkerboard;
+  }
+};
+
+// Deflated guess computed on a coarse grid:
+//   1. project the fine source onto the coarse space with blockProject(subspace)
+//   2. guess_coarse = sum_i evec_coarse[i] * <evec_coarse[i]|src_coarse> / eval_coarse[i]
+//   3. promote guess_coarse back to the fine grid with blockPromote(subspace)
+// Only references to the three vectors are stored, so the caller must keep them alive
+// for as long as this object is used.
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
+private:
+  const std::vector<FineField>   &subspace;
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+  
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace), 
+      evec_coarse(_evec_coarse), 
+      eval_coarse(_eval_coarse)  
+  {
+  }
+  
+  void operator()(const FineField &src,FineField &guess) { 
+    // The coarse grid is taken from evec_coarse[0], so at least one vector is required
+    assert(!evec_coarse.empty());
+    // Same consistency requirement as DeflatedGuesser: one eigenvalue per eigenvector
+    assert(evec_coarse.size()==eval_coarse.size());
+
+    int N = (int)evec_coarse.size();
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
+    blockProject(src_coarse,src,subspace);    
+    for (int i=0;i<N;i++) {
+      const CoarseField & tmp = evec_coarse[i];
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+    guess.checkerboard = src.checkerboard;
+  };
+};
+
+
+
+}
+#endif
@@ -0,0 +1,842 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>
+Author: Christoph Lehner <clehner@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_BIRL_H
+#define GRID_BIRL_H
+
+#include <string.h> //memset
+//#include <zlib.h>
+#include <sys/stat.h>
+
+namespace Grid { 
+
+  ////////////////////////////////////////////////////////
+  // Move following 100 LOC to lattice/Lattice_basis.h
+  ////////////////////////////////////////////////////////
+// Remove from w, one basis vector after another, its component along each of the
+// first k entries of basis. w is not normalised here.
+template<class Field>
+void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
+{
+  int j = 0;
+  while ( j < k ) {
+    auto overlap = innerProduct(basis[j],w);
+    w = w - overlap*basis[j];
+    ++j;
+  }
+}
+
+// In-place rotation of the basis, site by site:
+//   basis[j] <- sum_{k in [k0,k1)} Qt(j,k) * basis[k]     for j in [j0,j1)
+// The scratch array has Nm entries and is indexed by j, so j1 must not exceed Nm.
+template<class Field>
+void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
+{
+  typedef typename Field::vector_object vobj;
+  GridBase* grid = basis[0]._grid;
+
+  parallel_region
+  {
+    // Scratch storage declared inside the parallel region (thread private)
+    std::vector < vobj , commAllocator<vobj> > rotated(Nm);
+
+    parallel_for_internal(int site=0;site < grid->oSites();site++){
+
+      // Every rotated vector at this site is built before any is stored, because
+      // each one reads the old values of basis[k0..k1)
+      for(int row=j0; row<j1; ++row){
+        rotated[row]=0.;
+        for(int col=k0; col<k1; ++col){
+          rotated[row] +=Qt(row,col) * basis[col]._odata[site];
+        }
+      }
+
+      for(int row=j0; row<j1; ++row){
+        basis[row]._odata[site] = rotated[row];
+      }
+    }
+  }
+}
+
+// Compute a single rotated vector without modifying the basis:
+//   result = sum_{k in [k0,k1)} Qt(j,k) * basis[k]
+// Nm is accepted for symmetry with basisRotate but is not used.
+template<class Field>
+void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
+{
+  typedef typename Field::vector_object vobj;
+  GridBase* grid = basis[0]._grid;
+
+  result.checkerboard = basis[0].checkerboard;
+
+  parallel_for(int site=0;site < grid->oSites();site++){
+    vobj acc = zero;
+    for(int col=k0; col<k1; ++col){
+      acc +=Qt(j,col) * basis[col]._odata[site];
+    }
+    result._odata[site] = acc;
+  }
+}
+
+// Apply the permutation idx to _v and sort_vals in place, so that on exit position i
+// holds what was originally at position idx[i]. idx is overwritten and ends up as the
+// identity permutation.
+template<class Field>
+void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
+{
+  int vlen = idx.size();
+
+  assert(vlen>=1);
+  assert(vlen<=sort_vals.size());
+  assert(vlen<=_v.size());
+
+  for (size_t i=0;i<vlen;i++) {
+
+    // Slot i already holds the right element
+    if (idx[i] == i) continue;
+
+    //////////////////////////////////////
+    // idx[i] names the slot whose content must end up in slot i.
+    // The two slots are swapped, so the old content of slot i now lives in slot idx[i].
+    // Locate the entry j that wanted the old content of slot i and redirect it there,
+    // then mark slot i as done.
+    //////////////////////////////////////
+    size_t j = i;
+    while ( j<idx.size() ) {
+      if (idx[j]==i) break;
+      j++;
+    }
+
+    assert(idx[i] > i);
+    assert(j!=idx.size());
+    assert(idx[j]==i);
+
+    std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
+    std::swap(sort_vals[i],sort_vals[idx[i]]);
+
+    idx[j] = idx[i];
+    idx[i] = i;
+  }
+}
+
+// Return the indices 0..size-1 ordered by increasing absolute value of sort_vals.
+// sort_vals itself is not modified.
+inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
+{
+  // Start from the identity permutation
+  std::vector<int> idx(sort_vals.size());
+  for (size_t n=0; n<idx.size(); ++n) {
+    idx[n] = (int)n;
+  }
+
+  // Order the indices by the magnitude of the value each one refers to
+  auto smallerMagnitude = [&sort_vals](int i1, int i2) {
+    return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
+  };
+  std::sort(idx.begin(), idx.end(), smallerMagnitude);
+
+  return idx;
+}
+
+// Sort _v and sort_vals together by increasing |sort_vals|, or by decreasing |sort_vals|
+// when reverse is true.
+template<class Field>
+void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
+{
+  std::vector<int> order = basisSortGetIndex(sort_vals);
+
+  if (reverse) {
+    std::reverse(order.begin(), order.end());
+  }
+
+  basisReorderInPlace(_v,sort_vals,order);
+}
+
+/////////////////////////////////////////////////////////////
+// Implicitly restarted lanczos
+/////////////////////////////////////////////////////////////
+// Interface used by ImplicitlyRestartedLanczos to decide whether a candidate eigenvector
+// has converged. Both methods receive the index j of the vector, the residual target,
+// the candidate vector, its eigenvalue estimate (in/out) and an estimate of the largest
+// eigenvalue; a non-zero return value means "converged".
+template<class Field> class ImplicitlyRestartedLanczosTester 
+{
+ public:
+  // Polymorphic base: allow safe destruction through a base pointer or reference
+  virtual ~ImplicitlyRestartedLanczosTester() = default;
+
+  // Called on a subset of the vectors during the restart iterations
+  virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
+  // Called on every retained vector in the final pass after convergence was signalled
+  virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
+};
+
+// Choice of diagonalisation method for ImplicitlyRestartedLanczos.
+// Both constructors of ImplicitlyRestartedLanczos default to IRLdiagonaliseWithEigen.
+// NOTE(review): the routine that acts on this choice is outside this part of the
+// file -- confirm what each option maps to there.
+enum IRLdiagonalisation { 
+  IRLdiagonaliseWithDSTEGR,
+  IRLdiagonaliseWithQR,
+  IRLdiagonaliseWithEigen
+};
+
+// Default convergence tester: applies the operator _HermOp to the candidate vector and
+// compares the squared residual of the eigenvalue equation, normalised by
+// evalMaxApprox^2, against eresid^2.
+// Only a reference to the operator is stored; the caller must keep it alive.
+template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
+{
+ public:
+
+  LinearFunction<Field>       &_HermOp;
+  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
+
+  // The final reconstruction uses exactly the same test as the in-iteration check
+  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
+  {
+    return TestConvergence(j,resid,B,eval,evalMaxApprox);
+  }
+
+  // j             : index of the vector, used for logging only
+  // eresid        : residual target; the test is vv < eresid^2
+  // B             : candidate eigenvector
+  // eval          : on entry the caller's estimate (logged in brackets); on exit the
+  //                 Rayleigh quotient <B|H|B>/<B|B>
+  // evalMaxApprox : estimate of the largest eigenvalue, used to normalise the residual
+  // Returns 1 when converged, 0 otherwise.
+  // Side effect: sets the precision of std::cout to 13.
+  int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
+  {
+    Field v(B);
+    RealD eval_poly = eval;
+    // Apply operator
+    _HermOp(B,v);
+
+    RealD vnum = real(innerProduct(B,v)); // HermOp.
+    RealD vden = norm2(B);
+    eval   = vnum/vden;
+    v -= eval*B;
+
+    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<std::endl;
+
+    int conv=0;
+    if( (vv<eresid*eresid) ) conv = 1;
+
+    return conv;
+  }
+};
+
+template<class Field> 
+class ImplicitlyRestartedLanczos {
+ private:
+  const RealD small = 1.0e-8;
+  int MaxIter;
+  int MinRestart; // Minimum number of restarts; only check for convergence after
+  int Nstop;   // Number of evecs checked for convergence
+  int Nk;      // Number of converged sought
+  //  int Np;      // Np -- Number of spare vecs in krylov space //  == Nm - Nk
+  int Nm;      // Nm -- total number of vectors
+  IRLdiagonalisation diagonalisation;
+  int orth_period;
+    
+  RealD OrthoTime;
+  RealD eresid, betastp;
+  ////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////
+  LinearFunction<Field>       &_PolyOp;
+  LinearFunction<Field>       &_HermOp;
+  ImplicitlyRestartedLanczosTester<Field> &_Tester;
+  // Default tester provided (we need a ref to something in default case)
+  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
+  /////////////////////////
+  // Constructor
+  /////////////////////////
+  
+public:       
+
+  //////////////////////////////////////////////////////////////////
+  // PAB:
+  //////////////////////////////////////////////////////////////////
+  // Too many options  & knobs. 
+  // Eliminate:
+  //   orth_period
+  //   betastp
+  //   MinRestart
+  //
+  // Do we really need orth_period
+  // What is the theoretical basis & guarantees of betastp ?
+  // Nstop=Nk viable?
+  // MinRestart avoidable with new convergence test?
+  // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
+  // HermOp could be eliminated if we dropped the Power method for max eval.
+  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
+  //////////////////////////////////////////////////////////////////
+ // Constructor taking a user supplied convergence tester.
+ // The initialisers are listed in member declaration order (the order in which they
+ // run). SimpleTester is still constructed although _Tester refers to the user's object.
+ ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
+			    LinearFunction<Field> & HermOp,
+			    ImplicitlyRestartedLanczosTester<Field> & Tester,
+			    int _Nstop, // number of evecs checked for convergence
+			    int _Nk, // number of converged vecs sought
+			    int _Nm, // total number of vectors
+			    RealD _eresid, // residual target handed to the convergence tester
+			    int _MaxIter, // Max iterations
+			    RealD _betastp=0.0, // if beta(k) < betastp: converged
+			    int _MinRestart=1, int _orth_period = 1,
+			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
+    MaxIter(_MaxIter),      MinRestart(_MinRestart),
+    Nstop(_Nstop),      Nk(_Nk),      Nm(_Nm),
+    diagonalisation(_diagonalisation), orth_period(_orth_period),
+    OrthoTime(0.),
+    eresid(_eresid),      betastp(_betastp),
+    _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester),
+    SimpleTester(HermOp)  { }
+
+    // Constructor using the built-in SimpleTester (built on HermOp) as the convergence tester.
+    // The initialisers are listed in member declaration order (the order in which they
+    // run). _Tester is bound to SimpleTester before SimpleTester is constructed; this
+    // only takes its address and the reference is not used inside the constructor.
+    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
+			       LinearFunction<Field> & HermOp,
+			       int _Nstop, // number of evecs checked for convergence
+			       int _Nk, // number of converged vecs sought
+			       int _Nm, // total number of vectors
+			       RealD _eresid, // residual target handed to the convergence tester
+			       int _MaxIter, // Max iterations
+			       RealD _betastp=0.0, // if beta(k) < betastp: converged
+			       int _MinRestart=1, int _orth_period = 1,
+			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
+    MaxIter(_MaxIter),      MinRestart(_MinRestart),
+    Nstop(_Nstop),      Nk(_Nk),      Nm(_Nm),
+    diagonalisation(_diagonalisation), orth_period(_orth_period),
+    OrthoTime(0.),
+    eresid(_eresid),      betastp(_betastp),
+    _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester),
+    SimpleTester(HermOp)  { }
+
+  ////////////////////////////////
+  // Helpers
+  ////////////////////////////////
+  // Rescale v by 1/sqrt(norm2(v)) and return sqrt(norm2(v)) as it was on entry.
+  template<typename T>  static RealD normalise(T& v) 
+  {
+    const RealD length = sqrt(norm2(v));
+    v = v * (1.0/length);
+    return length;
+  }
+
+  // Orthogonalise w against the first k vectors of evec, then normalise it.
+  // The time spent is accumulated into OrthoTime (usecond()/1e6, i.e. seconds).
+  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
+  {
+    OrthoTime = OrthoTime - usecond()/1e6;  // subtract the start time ...
+
+    basisOrthogonalize(evec,w,k);
+    normalise(w);
+
+    OrthoTime = OrthoTime + usecond()/1e6;  // ... and add the stop time
+  }
+
+/* Rudy Arthur's thesis pp.137
+------------------------
+Require: M > K, P = M - K
+Compute the factorization  A V_M = V_M H_M + f_M e_M^\dagger
+repeat
+  Q = I
+  for i = 1,...,P do
+    Q_i R_i = H_M - \theta_i I ;   Q = Q Q_i
+    H_M = Q_i^\dagger H_M Q_i
+  end for
+  \beta_K = H_M(K+1,K) ;   \sigma_K = Q(M,K)
+  r = v_{K+1} \beta_K + r \sigma_K
+  V_K = V_M(1:M) Q(1:M,1:K)
+  H_K = H_M(1:K,1:K)
+  -> A V_K = V_K H_K + f_K e_K^\dagger
+  Extend to an M = K + P step factorization  A V_M = V_M H_M + f_M e_M^\dagger
+until convergence
+*/
+  void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
+  {
+    GridBase *grid = src._grid;
+    assert(grid == evec[0]._grid);
+    
+    GridLogIRL.TimingMode(1);
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
+    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
+    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
+      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
+    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
+      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
+    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
+    }
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+	
+    assert(Nm <= evec.size() && Nm <= eval.size());
+    
+    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
+    RealD evalMaxApprox = 0.0;
+    {
+      auto src_n = src;
+      auto tmp = src;
+      const int _MAX_ITER_IRL_MEVAPP_ = 50;
+      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
+	normalise(src_n);
+	_HermOp(src_n,tmp);
+	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+	RealD vden = norm2(src_n);
+	RealD na = vnum/vden;
+	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
+	  i=_MAX_ITER_IRL_MEVAPP_;
+	evalMaxApprox = na;
+	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+	src_n = tmp;
+      }
+    }
+	
+    std::vector<RealD> lme(Nm);  
+    std::vector<RealD> lme2(Nm);
+    std::vector<RealD> eval2(Nm);
+    std::vector<RealD> eval2_copy(Nm);
+    Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
+
+    Field f(grid);
+    Field v(grid);
+    int k1 = 1;
+    int k2 = Nk;
+    RealD beta_k;
+
+    Nconv = 0;
+  
+    // Set initial vector
+    evec[0] = src;
+    normalise(evec[0]);
+	
+    // Initial Nk steps
+    OrthoTime=0.;
+    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
+    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
+
+    //////////////////////////////////
+    // Restarting loop begins
+    //////////////////////////////////
+    int iter;
+    for(iter = 0; iter<MaxIter; ++iter){
+      
+      OrthoTime=0.;
+
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+
+      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
+      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
+      f *= lme[Nm-1];
+
+      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
+      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
+	  
+      //////////////////////////////////
+      // getting eigenvalues
+      //////////////////////////////////
+      for(int k=0; k<Nm; ++k){
+	eval2[k] = eval[k+k1-1];
+	lme2[k] = lme[k+k1-1];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
+      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
+
+      //////////////////////////////////
+      // sorting
+      //////////////////////////////////
+      eval2_copy = eval2;
+      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
+      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
+      const int chunk=8;
+      for(int io=0; io<k2;io+=chunk){
+	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
+	for(int ii=0;ii<chunk;ii++){
+	  if ( (io+ii)<k2 )
+	    std::cout<< " "<< std::setw(12)<< eval2[io+ii];
+	}
+	std::cout << std::endl;
+      }
+
+      //////////////////////////////////
+      // Implicitly shifted QR transformations
+      //////////////////////////////////
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      for(int ip=k2; ip<Nm; ++ip){ 
+	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
+      }
+      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
+
+      assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
+
+      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
+      std::cout<<GridLogIRL <<"basisRotated  by Qt"<<std::endl;
+      
+      ////////////////////////////////////////////////////
+      // Compressed vector f and beta(k2)
+      ////////////////////////////////////////////////////
+      f *= Qt(k2-1,Nm-1);
+      f += lme[k2-1] * evec[k2];
+      beta_k = norm2(f);
+      beta_k = sqrt(beta_k);
+      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
+	  
+      RealD betar = 1.0/beta_k;
+      evec[k2] = betar * f;
+      lme[k2-1] = beta_k;
+	  
+      ////////////////////////////////////////////////////
+      // Convergence test
+      ////////////////////////////////////////////////////
+      for(int k=0; k<Nm; ++k){    
+	eval2[k] = eval[k];
+	lme2[k] = lme[k];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
+      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
+	  
+      Nconv = 0;
+      if (iter >= MinRestart) {
+
+	std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
+
+	Field B(grid); B.checkerboard = evec[0].checkerboard;
+
+	//  power of two search pattern;  not every evalue in eval2 is assessed.
+	int allconv =1;
+	for(int jj = 1; jj<=Nstop; jj*=2){
+	  int j = Nstop-jj;
+	  RealD e = eval2_copy[j]; // Discard the evalue
+	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
+	    allconv=0;
+	  }
+	}
+	// Do evec[0] for good measure
+	{ 
+	  int j=0;
+	  RealD e = eval2_copy[0]; 
+	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
+	}
+	if ( allconv ) Nconv = Nstop;
+
+	// test if we converged, if so, terminate
+	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
+	//	if( Nconv>=Nstop || beta_k < betastp){
+	if( Nconv>=Nstop){
+	  goto converged;
+	}
+	  
+      } else {
+	std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
+      } // end of iter loop
+    }
+
+    std::cout<<GridLogError<<"\n NOT converged.\n";
+    abort();
+	
+  converged:
+    {
+      Field B(grid); B.checkerboard = evec[0].checkerboard;
+      basisRotate(evec,Qt,0,Nk,0,Nk,Nm);	    
+      std::cout << GridLogIRL << " Rotated basis"<<std::endl;
+      Nconv=0;
+      //////////////////////////////////////////////////////////////////////
+      // Full final convergence test; unconditionally applied
+      //////////////////////////////////////////////////////////////////////
+      for(int j = 0; j<=Nk; j++){
+	B=evec[j];
+	if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
+	  Nconv++;
+	}
+      }
+
+      if ( Nconv < Nstop )
+	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
+
+      eval=eval2;
+      
+      //Keep only converged
+      eval.resize(Nconv);// Nstop?
+      evec.resize(Nconv,grid);// Nstop?
+      basisSortInPlace(evec,eval,reverse);
+      
+    }
+       
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL << " -- Iterations  = "<< iter   << "\n";
+    std::cout << GridLogIRL << " -- beta(k)     = "<< beta_k << "\n";
+    std::cout << GridLogIRL << " -- Nconv       = "<< Nconv  << "\n";
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+  }
+
+ private:
+/* Single Lanczos step. Algorithm reference: Saad, pp. 195
+   1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
+   2. For k = 1,2,...,m Do:
+   3.   wk   := A vk − βk v_{k−1}
+   4.   αk   := (wk,vk)
+   5.   wk   := wk − αk vk        (wk orthogonal to vk)
+   6.   βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+   7.   vk+1 := wk/βk+1
+   8. EndDo
+
+   This routine carries out items 3-7 for one (0-based) index k, with the
+   operator application done by _PolyOp:
+     lmd[k]    receives the real part of αk,
+     lme[k]    receives the value returned by normalise(w),
+     evec[k+1] receives w when k < Nm-1,
+     w         is overwritten (work vector).
+   When k>0 and k is a multiple of orth_period, w is additionally passed to
+   orthogonalize(w,evec,k) before being stored.
+   A very small beta is only reported in the log; it does not stop the step.
+ */
+  void step(std::vector<RealD>& lmd,
+            std::vector<RealD>& lme,
+            std::vector<Field>& evec,
+            Field& w,int Nm,int k)
+  {
+    const RealD tiny = 1.0e-20;
+    assert( k< Nm );
+
+    GridStopWatch gsw_op,gsw_o; // declared but not referenced below
+
+    Field& vk = evec[k];
+
+    // 3. w := PolyOp(v_k) - beta_{k-1} v_{k-1}
+    _PolyOp(vk,w);
+    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
+    if(k>0) {
+      w -= lme[k-1] * evec[k-1];
+    }
+
+    // 4. alpha_k := (v_k,w); only its real part enters the tridiagonal matrix
+    ComplexD alphaC = innerProduct(vk,w);
+    RealD    alphaR = real(alphaC);
+
+    // 5. w := w - alpha_k v_k
+    w = w - alphaR * vk;
+
+    // 6. beta_{k+1} := |w|   7. v_{k+1} := w/beta_{k+1}
+    RealD betaNext = normalise(w);
+
+    lmd[k] = alphaR;
+    lme[k] = betaNext;
+
+    // Periodic re-orthogonalisation of the new vector
+    const bool reorthog = (k>0) && (k % orth_period == 0);
+    if (reorthog) {
+      orthogonalize(w,evec,k);
+      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
+    }
+
+    // The last basis slot has no successor to fill
+    if(k < Nm-1) {
+      evec[k+1] = w;
+    }
+
+    std::cout<<GridLogIRL << "alpha[" << k << "] = " << alphaC << " beta[" << k << "] = "<<betaNext<<std::endl;
+    if ( betaNext < tiny ) {
+      std::cout<<GridLogIRL << " beta is tiny "<<betaNext<<std::endl;
+    }
+  }
+
+  // Diagonalise the Nk x Nk symmetric tridiagonal matrix with diagonal lmd[0..Nk-1]
+  // and off-diagonal lme[0..Nk-2] using Eigen's SelfAdjointEigenSolver.
+  //  lmd : on exit, the eigenvalues in the reverse of the solver's order
+  //        (Eigen documents its eigenvalues as sorted increasingly).
+  //  Qt  : row Nk-1-i receives the solver's i-th eigenvector; entries outside the
+  //        leading Nk x Nk block are left untouched.
+  //  Nm and grid are accepted for signature parity with the other diagonalisers
+  //  and are not used here.
+  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
+                         int Nk, int Nm,
+                         Eigen::MatrixXd & Qt, // Nm x Nm
+                         GridBase *grid)
+  {
+    // Assemble the dense tridiagonal matrix
+    Eigen::MatrixXd tri = Eigen::MatrixXd::Zero(Nk,Nk);
+    for(int d=0; d<Nk; d++){
+      tri(d,d) = lmd[d];
+      if ( d+1 < Nk ) {
+        tri(d,d+1) = lme[d];
+        tri(d+1,d) = lme[d];
+      }
+    }
+
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> solver(tri);
+
+    // Store eigenpairs in reversed order
+    for (int from = 0; from < Nk; from++) {
+      const int to = Nk-1-from;
+      lmd[to] = solver.eigenvalues()(from);
+      for (int col = 0; col < Nk; col++) {
+        Qt(to,col) = solver.eigenvectors()(col,from);
+      }
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // File could end here if settle on Eigen ??? !!!
+  ///////////////////////////////////////////////////////////////////////////
+  // Apply one sweep of plane (Givens) rotations, built from the shift Dsh, to the
+  // symmetric tridiagonal matrix held as diagonal lmd / off-diagonal lme, and
+  // accumulate the same rotations into Qt.
+  //  lmd, lme   : updated in place for indices kmin-1 .. kmax-1
+  //  Nk         : number of Qt columns (0..Nk-1) that each rotation is applied to
+  //  Nm         : not referenced in this routine
+  //  Qt         : rows k and k+1 are rotated for every rotation index k
+  //  Dsh        : shift subtracted from lmd[kmin-1] when forming the first rotation
+  //  kmin, kmax : 1-based bounds of the active block; rotations act on index
+  //               pairs (k,k+1) for k = kmin-1 .. kmax-2
+  // NOTE(review): Fden becomes infinite if both hypot() arguments are zero -- confirm
+  // callers cannot reach that case.
+  void QR_decomp(std::vector<RealD>& lmd,   // Nm 
+		 std::vector<RealD>& lme,   // Nm 
+		 int Nk, int Nm,            // Nk, Nm
+		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
+		 RealD Dsh, int kmin, int kmax)
+  {
+    int k = kmin-1;
+    RealD x; // carries -s*lme[k+1] from one rotation to the next
+    
+    // First rotation: (c,s) is the normalised pair (lmd[k]-Dsh, -lme[k]), so c*c+s*s = 1
+    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
+    RealD c = ( lmd[k] -Dsh) *Fden;
+    RealD s = -lme[k] *Fden;
+      
+    // Save the 2x2 block entries before they are overwritten
+    RealD tmpa1 = lmd[k];
+    RealD tmpa2 = lmd[k+1];
+    RealD tmpb  = lme[k];
+
+    // Rotate the 2x2 block (k,k+1) and the coupling to row k+2
+    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+    x        =-s*lme[k+1];
+    lme[k+1] = c*lme[k+1];
+      
+    // Accumulate the rotation into rows k, k+1 of Qt
+    for(int i=0; i<Nk; ++i){
+      RealD Qtmp1 = Qt(k,i);
+      RealD Qtmp2 = Qt(k+1,i);
+      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
+      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
+    }
+
+    // Givens transformations
+    // (this loop's k, Fden, c, s, tmp* shadow the variables declared above)
+    for(int k = kmin; k < kmax-1; ++k){
+      
+      // Rotation built from the pair (lme[k-1], x) left by the previous step
+      RealD Fden = 1.0/hypot(x,lme[k-1]);
+      RealD c = lme[k-1]*Fden;
+      RealD s = - x*Fden;
+	
+      RealD tmpa1 = lmd[k];
+      RealD tmpa2 = lmd[k+1];
+      RealD tmpb  = lme[k];
+
+      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+      lme[k-1] = c*lme[k-1] -s*x;
+
+      // The final rotation (k == kmax-2) has no following off-diagonal to update
+      if(k != kmax-2){
+	x = -s*lme[k+1];
+	lme[k+1] = c*lme[k+1];
+      }
+
+      // Accumulate the rotation into rows k, k+1 of Qt
+      for(int i=0; i<Nk; ++i){
+	RealD Qtmp1 = Qt(k,i);
+	RealD Qtmp2 = Qt(k+1,i);
+	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
+	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
+      }
+    }
+  }
+
+  // Diagonalise the Lanczos tridiagonal matrix (diagonal lmd, off-diagonal lme) with
+  // the backend selected by the member `diagonalisation`.
+  // Qt is first reset to the Nm x Nm identity and then handed to the backend together
+  // with all other arguments. An unrecognised selector trips assert(0).
+  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme,
+                   int Nk, int Nm,
+                   Eigen::MatrixXd & Qt,
+                   GridBase *grid)
+  {
+    Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+
+    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
+      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
+      return;
+    }
+    if ( diagonalisation == IRLdiagonaliseWithQR ) {
+      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
+      return;
+    }
+    if ( diagonalisation == IRLdiagonaliseWithEigen ) {
+      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
+      return;
+    }
+
+    // No known backend matched
+    assert(0);
+  }
+
+#ifdef USE_LAPACK
+// Declaration of the routine called by diagonalize_lapack; only visible when
+// USE_LAPACK is defined. Every argument is passed by pointer.
+// NOTE(review): presumably this name is mapped onto the Fortran LAPACK DSTEGR symbol
+// elsewhere in the build -- confirm; argument meanings follow the LAPACK documentation.
+void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
+                   double *vl, double *vu, int *il, int *iu, double *abstol,
+                   int *m, double *w, double *z, int *ldz, int *isuppz,
+                   double *work, int *lwork, int *iwork, int *liwork,
+                   int *info);
+#endif
+
+// Diagonalise the Nk x Nk symmetric tridiagonal matrix (diagonal lmd, off-diagonal
+// lme) with LAPACK dstegr, splitting the eigenpair index range across processors.
+//  lmd  : on exit lmd[Nk-1-i] holds the i-th eigenvalue of the combined result
+//  lme  : read only (lme[0..Nk-2])
+//  Nm   : not used by this routine
+//  Qt   : row Nk-1-i receives the i-th eigenvector; other entries are untouched
+//  grid : supplies the processor count/rank and the global sums used to combine
+//         the per-processor pieces
+// Each processor asks dstegr (range 'I') for the index slice [il,iu], moves the
+// results to their global positions in zero-filled buffers, and a global sum then
+// gives every processor the full set.
+// Without USE_LAPACK this routine is unavailable and trips assert(0).
+void diagonalize_lapack(std::vector<RealD>& lmd,
+			std::vector<RealD>& lme, 
+			int Nk, int Nm,  
+			Eigen::MatrixXd& Qt,
+			GridBase *grid)
+{
+#ifdef USE_LAPACK
+  int NN = Nk;
+
+  // Heap storage: an NN*NN matrix does not belong on the stack, and
+  // variable-length arrays are not standard C++.
+  std::vector<double> evals_tmp(NN,0.0);
+  std::vector<double> evec_tmp((size_t)NN*NN,0.0); // row i holds eigenvector i
+  std::vector<double> DD(NN,0.0);                  // diagonal
+  std::vector<double> EE(NN,0.0);                  // off-diagonal; last entry is not an input
+  for (int i = 0; i< NN;   i++) DD[i] = lmd[i];
+  for (int i = 0; i< NN-1; i++) EE[i] = lme[i];
+
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+  int liwork =  3+NN*10 ;
+  std::vector<int>    iwork(liwork);
+  std::vector<double> work(lwork);
+  std::vector<int>    isuppz(2*NN);
+  char jobz = 'V';  // calculate evals & evecs
+  char range = 'I'; // calculate only the il-th through iu-th evals
+  int info;
+
+  // Slice of the (1-based) eigenvalue index range handled by this processor
+  int total = grid->_Nprocessors;
+  int node  = grid->_processor;
+  int interval = (NN/total)+1;
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1);
+  if (iu > NN)  iu=NN;
+  double tol = 0.0;
+
+  if ( il <= NN){
+    LAPACK_dstegr(&jobz, &range, &NN,
+		  DD.data(), EE.data(),
+		  &vl, &vu, &il, &iu, // vl,vu are ignored for range 'I'
+		  &tol, // tolerance
+		  &evals_found, evals_tmp.data(), evec_tmp.data(), &NN,
+		  isuppz.data(),
+		  work.data(), &lwork, iwork.data(), &liwork,
+		  &info);
+    // LAPACK reports success as info == 0; anything else leaves the outputs unusable.
+    assert(info == 0);
+
+    // dstegr packs its results at the front; move them to global positions il-1..iu-1,
+    // working downwards so no source is overwritten before it has been read,
+    // and zero the vacated slots so the global sum below sees each entry once.
+    const int shift = il-1;
+    for (int i = iu-1; i>= il-1; i--){
+      evals_tmp[i] = evals_tmp[i - shift];
+      if (il>1) evals_tmp[i - shift]=0.;
+      for (int j = 0; j< NN; j++){
+	evec_tmp[(size_t)i*NN+j] = evec_tmp[(size_t)(i - shift)*NN+j];
+	if (il>1) evec_tmp[(size_t)(i - shift)*NN+j]=0.;
+      }
+    }
+  }
+
+  // Combine the slices from all processors
+  grid->GlobalSumVector(evals_tmp.data(),NN);
+  grid->GlobalSumVector(evec_tmp.data(),NN*NN);
+
+  // Safer to sort instead of just reversing it, 
+  // but the document of the routine says evals are sorted in increasing order. 
+  // qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    lmd [NN-1-i]=evals_tmp[i];
+    for(int j=0;j<NN;j++){
+      Qt((NN-1-i),j)=evec_tmp[(size_t)i*NN+j];
+    }
+  }
+#else 
+  assert(0);
+#endif
+}
+
+// Diagonalise the Nk x Nk symmetric tridiagonal matrix (diagonal lmd, off-diagonal
+// lme) by repeated shifted QR_decomp sweeps, shrinking the active block
+// [kmin,kmax] as off-diagonal entries become negligible.
+//  lmd, lme : updated in place by QR_decomp
+//  Nk, Nm   : active size and allocated size; at most 100*Nm sweeps are attempted
+//  Qt       : accumulates the rotations applied by QR_decomp
+//  grid     : not used by this routine
+// Returns once every off-diagonal in the active block is negligible; logs an error
+// and calls abort() if the sweep limit is exhausted.
+void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+		    int Nk, int Nm,   
+		    Eigen::MatrixXd & Qt,
+		    GridBase *grid)
+{
+  // A matrix of size 0 or 1 is already diagonal, and the shift below reads
+  // lmd[kmax-2], which would be out of range.
+  if ( Nk < 2 ) return;
+
+  const int QRiter = 100*Nm;
+  int kmin = 1;
+  int kmax = Nk;
+  
+  // (this should be more sophisticated)
+  for(int iter=0; iter<QRiter; ++iter){
+    
+    // Shift (Dsh) taken from the trailing 2x2 block of the active matrix.
+    // The sign is selected explicitly: dsub/fabs(dsub) is 0/0 = NaN when the two
+    // diagonal entries coincide, which would poison every later sweep.
+    RealD dsub = lmd[kmax-1]-lmd[kmax-2];
+    RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
+    RealD sgn = (dsub < 0.0) ? -1.0 : 1.0;
+    RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*sgn);
+    
+    // transformation
+    QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
+    
+    // Convergence criterion (redefinition of kmin and kmax):
+    // scanning from the bottom, the first off-diagonal that still changes
+    // |lmd[j-1]|+|lmd[j]| when added to it becomes the new lower edge.
+    bool allNegligible = true;
+    for(int j=kmax-1; j>= kmin; --j){
+      RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
+      if(fabs(lme[j-1])+dds > dds){
+	kmax = j+1;
+	allNegligible = false;
+	break;
+      }
+    }
+    if ( allNegligible ) return; // fully diagonal
+    
+    // Same scan from the top fixes the new upper edge
+    for(int j=0; j<kmax-1; ++j){
+      RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
+      if(fabs(lme[j])+dds > dds){
+	kmin = j+1;
+	break;
+      }
+    }
+  }
+  std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
+  abort();
+}
+};
+}
+#endif
@@ -0,0 +1,406 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
+
+    Copyright (C) 2015
+
+Author: Christoph Lehner <clehner@bnl.gov>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LOCAL_COHERENCE_IRL_H
+#define GRID_LOCAL_COHERENCE_IRL_H
+
+namespace Grid { 
+
+
+// Serializable parameter set for one Lanczos run; the members are declared through
+// GRID_SERIALIZABLE_CLASS_MEMBERS as (type, name) pairs.
+struct LanczosParams : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
+				  ChebyParams, Cheby,/* Chebyshev polynomial parameters */
+				  int, Nstop,    /* number of vectors that must converge; Nstop < Nk < Nm */
+				  int, Nk,       /* number of vectors for which convergence is sought */
+				  int, Nm,       /* total number of vectors, including those used for the restart */
+				  RealD, resid,  /* residual target */
+ 				  int, MaxIt,    /* presumably the maximum number of restart iterations -- confirm */
+				  RealD, betastp,  /* NOTE(review): original comment was "?"; purpose not documented here -- confirm */
+				  int, MinRes);    // "Must restart": presumably the minimum number of restarts before convergence is tested -- confirm
+};
+
+// Serializable parameter set for a local coherence Lanczos run: flags selecting the
+// fine and coarse stages, one LanczosParams per stage, a Chebyshev smoother and the
+// coarse relaxation tolerance, plus configuration and operator settings.
+// NOTE(review): the meaning of each flag is inferred from its name only; the code
+// that consumes this struct is not part of this file section -- confirm.
+struct LocalCoherenceLanczosParams : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
+				  bool, saveEvecs,
+				  bool, doFine,
+				  bool, doFineRead,
+				  bool, doCoarse,
+	       			  bool, doCoarseRead,
+				  LanczosParams, FineParams,   /* Lanczos parameters, fine stage */
+				  LanczosParams, CoarseParams, /* Lanczos parameters, coarse stage */
+				  ChebyParams,   Smoother,     /* Chebyshev parameters of the smoother */
+				  RealD        , coarse_relax_tol,
+				  std::vector<int>, blockSize,
+				  std::string, config,
+				  std::vector < std::complex<double>  >, omega,
+				  RealD, mass,
+				  RealD, M5);
+};
+
+// Coarse-space linear function that applies the fine operator's HermOp through a
+// subspace: promote coarse -> fine, apply HermOp, project fine -> coarse.
+// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
+template<class Fobj,class CComplex,int nbasis>
+class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LinearOperatorBase<FineField> &_Linop;   // fine operator, held by reference
+  std::vector<FineField>        &subspace; // fine basis vectors, held by reference
+
+  // The subspace must already contain at least one vector.
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace)
+    : _Linop(linop),
+      subspace(_subspace)
+  {
+    assert(subspace.size() >0);
+  };
+
+  // out = Project( HermOp( Promote(in) ) )
+  void operator()(const CoarseField& in, CoarseField& out) {
+    // Fine temporaries take their grid and checkerboard from the first basis vector
+    GridBase *fgrid = subspace[0]._grid;
+    int       cb    = subspace[0].checkerboard;
+
+    FineField fineIn (fgrid);
+    fineIn.checkerboard  = cb;
+    FineField fineOut(fgrid);
+    fineOut.checkerboard = cb;
+
+    blockPromote(in,fineIn,subspace);
+    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+
+    _Linop.HermOp(fineIn,fineOut);
+    std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
+
+    blockProject(out,fineOut,subspace);
+    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+  }
+};
+
+// Coarse-space linear function that applies an operator function (e.g. a polynomial)
+// of the fine operator through a subspace:
+// promote coarse -> fine, apply _poly(_Linop, .), project fine -> coarse.
+template<class Fobj,class CComplex,int nbasis>
+class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  OperatorFunction<FineField>   & _poly;    // function applied on the fine grid
+  LinearOperatorBase<FineField> &_Linop;    // fine operator handed to _poly
+  std::vector<FineField>        &subspace;  // fine basis vectors, held by reference
+
+  // All three arguments are stored by reference; nothing is copied or checked here.
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace)
+    : _poly(poly), _Linop(linop), subspace(_subspace)
+  {
+  };
+
+  // out = Project( poly(Linop) Promote(in) ); reads subspace[0] for grid/checkerboard
+  void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *fgrid = subspace[0]._grid;
+    int       cb    = subspace[0].checkerboard;
+
+    FineField fineIn (fgrid);
+    fineIn.checkerboard  = cb;
+    FineField fineOut(fgrid);
+    fineOut.checkerboard = cb;
+
+    blockPromote(in,fineIn,subspace);
+    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+
+    _poly(_Linop,fineIn,fineOut);
+    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
+
+    blockProject(out,fineOut,subspace);
+    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+  }
+};
+
+// Convergence tester for a coarse-grid Lanczos.
+//  TestConvergence : cheap test done entirely on the coarse grid with _Poly.
+//  ReconstructEval : promotes the coarse vector to the fine grid, applies the
+//                    smoother, and measures eigenvalue and residual with the fine
+//                    operator's HermOp.
+// Both return 1 when the squared residual, scaled by 1/evalMaxApprox^2, is below
+// eresid^2 and 0 otherwise, and both overwrite `eval` with the Rayleigh quotient.
+template<class Fobj,class CComplex,int nbasis>
+class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
+{
+ public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LinearFunction<CoarseField> & _Poly;          // coarse operator used by TestConvergence
+  OperatorFunction<FineField>   & _smoother;    // fine-grid smoother used by ReconstructEval
+  LinearOperatorBase<FineField> &_Linop;        // fine operator
+  RealD                          _coarse_relax_tol; // factor applied to eresid for j > nbasis
+  std::vector<FineField>        &_subspace;     // fine basis used for blockPromote
+  
+  // Members are initialised in declaration order (avoids -Wreorder warnings).
+  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
+					   OperatorFunction<FineField>   &smoother,
+					   LinearOperatorBase<FineField> &Linop,
+					   std::vector<FineField>        &subspace,
+					   RealD coarse_relax_tol=5.0e3) 
+    : _Poly(Poly), _smoother(smoother), _Linop(Linop),
+      _coarse_relax_tol(coarse_relax_tol), _subspace(subspace)
+  {    };
+
+  // Coarse-grid test of vector B (index j is used for logging only).
+  //  eval : in  - current estimate (logged in brackets)
+  //         out - Rayleigh quotient (B,_Poly B)/(B,B)
+  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+  {
+    CoarseField v(B);
+    RealD eval_poly = eval;
+
+    // Apply operator
+    _Poly(B,v);
+
+    RealD vnum = real(innerProduct(B,v)); // HermOp.
+    RealD vden = norm2(B);
+    eval   = vnum/vden;
+    v -= eval*B;
+
+    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<std::endl;
+
+    int conv=0;
+    if( (vv<eresid*eresid) ) conv = 1;
+    return conv;
+  }
+
+  // Fine-grid test of coarse vector B with index j.
+  //  eval : in  - current estimate (logged in brackets)
+  //         out - Rayleigh quotient measured with the fine HermOp on the smoothed vector
+  // For j > nbasis the tolerance eresid is multiplied by _coarse_relax_tol.
+  // NOTE(review): LocalCoherenceLanczos::testCoarse treats k >= nbasis as relaxed,
+  // whereas this test leaves j == nbasis strict -- confirm which bound is intended.
+  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+  {
+    GridBase *FineGrid = _subspace[0]._grid;    
+    int checkerboard   = _subspace[0].checkerboard;
+    FineField fB(FineGrid);fB.checkerboard =checkerboard;
+    FineField fv(FineGrid);fv.checkerboard =checkerboard;
+
+    // Promote to the fine grid, then smooth: fB = smoother(Linop) * Promote(B)
+    blockPromote(B,fv,_subspace);  
+    
+    _smoother(_Linop,fv,fB); 
+
+    RealD eval_poly = eval;
+    _Linop.HermOp(fB,fv);
+
+    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
+    RealD vden = norm2(fB);
+    eval   = vnum/vden;
+    fv -= eval*fB;
+    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<std::endl;
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
+    if( (vv<eresid*eresid) ) return 1;
+    return 0;
+  }
+};
+
+////////////////////////////////////////////
+// Make serializable Lanczos params
+////////////////////////////////////////////
+// Driver for the two-level (fine + coarse) Lanczos.
+//  calcFine   : Chebyshev-filtered Lanczos on the fine grid; keeps nbasis vectors
+//               in `subspace` and their eigenvalues in `evals_fine`.
+//  calcCoarse : Lanczos on the coarse grid with operators projected through
+//               `subspace`; results go to `evec_coarse` / `evals_coarse`.
+//  testFine / testCoarse : re-measure stored eigenpairs and assert they pass.
+// Storage is either owned by this object (first constructor) or supplied by the
+// caller (second constructor); the reference members alias whichever is in use.
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczos 
+{
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<Fobj>                       FineField;
+
+protected:
+  GridBase *_CoarseGrid;
+  GridBase *_FineGrid;
+  int _checkerboard;
+  LinearOperatorBase<FineField>                 & _FineOp;
+  
+  // Working storage, accessed through references so it can be internal or external
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  // Internal storage; referenced only when the first constructor is used
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
+public:
+
+  //////////////////////////////////////////////////////////////////////////
+  // Constructor using internal storage.
+  // Initialisers follow member declaration order (avoids -Wreorder warnings).
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _checkerboard(checkerboard),
+    _FineOp(FineOp),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor, external storage for use by Hadrons module.
+  // The eigenvalue vectors passed in are emptied.
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,
+			std::vector<CoarseField> &ext_coarse,
+			std::vector<RealD>       &ext_eval_fine,
+			std::vector<RealD>       &ext_eval_coarse
+			) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _checkerboard(checkerboard),
+    _FineOp(FineOp),
+    evals_fine  (ext_eval_fine), 
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+
+  // Block-orthogonalise the fine subspace; the pass is done twice.
+  void Orthogonalise(void ) {
+    CoarseScalar InnerProd(_CoarseGrid);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+  };
+
+  // Scale v to unit norm; returns the norm v had on entry.
+  template<typename T>  static RealD normalise(T& v) 
+  {
+    RealD nn = norm2(v);
+    nn = ::sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+  /*
+  void fakeFine(void)
+  {
+    int Nk = nbasis;
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].checkerboard=_checkerboard;
+    normalise(subspace[0]);
+    PlainHermOp<FineField>    Op(_FineOp);
+    for(int k=1;k<Nk;k++){
+      subspace[k].checkerboard=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
+    }
+  }
+  */
+
+  // Re-measure every fine eigenpair with the plain operator and require each to pass
+  // at tolerance resid. evals_fine[k] is overwritten with the re-measured value.
+  void testFine(RealD resid) 
+  {
+    assert(evals_fine.size() == nbasis);
+    assert(subspace.size() == nbasis);
+    PlainHermOp<FineField>    Op(_FineOp);
+    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
+    for(int k=0;k<nbasis;k++){
+      // The call is kept outside assert(): it updates evals_fine[k] and must still
+      // run when NDEBUG removes the assertion.
+      int ok = SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0);
+      assert(ok==1);
+      (void)ok;
+    }
+  }
+
+  // Re-measure every coarse eigenpair through the smoothed tester and require each
+  // to pass: tolerance resid for k < nbasis, resid*relax otherwise.
+  // evals_coarse[k] is overwritten with the re-measured value.
+  // NOTE(review): ImplicitlyRestartedLanczosSmoothedTester::ReconstructEval itself
+  // multiplies the tolerance by relax for j > nbasis, so relax appears to be applied
+  // twice for those vectors -- confirm this is intended.
+  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
+  {
+    assert(evals_fine.size() == nbasis);
+    assert(subspace.size() == nbasis);
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+
+    const int Ncoarse = (int)evec_coarse.size();
+    for(int k=0;k<Ncoarse;k++){
+      const RealD tol = ( k < nbasis ) ? resid : resid*relax;
+      // As in testFine, the call must not live inside assert().
+      int ok = ChebySmoothTester.ReconstructEval(k,tol,evec_coarse[k],evals_coarse[k],1.0);
+      assert(ok==1);
+      (void)ok;
+    }
+  }
+
+  // Fine-grid Lanczos: work with Nm vectors, start from a constant source, then
+  // keep only the first nbasis eigenpairs. Requires nbasis <= Nm, Nstop >= nbasis
+  // and at least nbasis converged vectors.
+  // NOTE(review): MaxIt is declared RealD here but is an int in LanczosParams; the
+  // type is left unchanged to keep the interface stable.
+  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
+		RealD MaxIt, RealD betastp, int MinRes)
+  {
+    assert(nbasis<=Nm);
+    Chebyshev<FineField>      Cheby(cheby_parms);
+    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
+    PlainHermOp<FineField>    Op(_FineOp);
+
+    evals_fine.resize(Nm);
+    subspace.resize(Nm,_FineGrid);
+
+    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
+
+    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
+
+    int Nconv;
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
+    
+    // Shrink down to number saved
+    assert(Nstop>=nbasis);
+    assert(Nconv>=nbasis);
+    evals_fine.resize(nbasis);
+    subspace.resize(nbasis,_FineGrid);
+  }
+
+  // Coarse-grid Lanczos: operators are projected through `subspace`, convergence is
+  // judged by the smoothed tester, and the first Nstop eigenpairs are kept and
+  // printed. Requires at least Nstop converged vectors.
+  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
+		  int Nstop, int Nk, int Nm,RealD resid, 
+		  RealD MaxIt, RealD betastp, int MinRes)
+  {
+    Chebyshev<FineField>                          Cheby(cheby_op);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+
+    evals_coarse.resize(Nm);
+    evec_coarse.resize(Nm,_CoarseGrid);
+
+    CoarseField src(_CoarseGrid);     src=1.0; 
+
+    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
+    int Nconv=0;
+    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
+    assert(Nconv>=Nstop);
+    evals_coarse.resize(Nstop);
+    evec_coarse.resize (Nstop,_CoarseGrid);
+    for (int i=0;i<Nstop;i++){
+      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
+    }
+  }
+};
+
+}
+#endif
@@ -0,0 +1,60 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/NormalEquations.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_NORMAL_EQUATIONS_H
+#define GRID_NORMAL_EQUATIONS_H
+
+namespace Grid {
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Take a matrix and form an NE solver calling a Herm solver
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Wraps a Hermitian solver so that it is driven with the normal-equations
+  // source Mdag*in (intent, per the original note: Mdag M out = Mdag in).
+  // Both the matrix and the solver are held by reference.
+  template<class Field> class NormalEquations : public OperatorFunction<Field>{
+  private:
+    SparseMatrixBase<Field> & _Matrix;
+    OperatorFunction<Field> & _HermitianSolver;
+
+  public:
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations trick
+    /////////////////////////////////////////////////////
+    NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
+      : _Matrix(Matrix),
+        _HermitianSolver(HermitianSolver)
+    {
+    };
+
+    // Form rhs = Mdag*in on in's grid, then hand (rhs,out) to the Hermitian solver.
+    void operator() (const Field &in, Field &out){
+      Field rhs(in._grid);
+      _Matrix.Mdag(in,rhs);
+      _HermitianSolver(rhs,out);
+    }
+  };
+
+}
+#endif
@@ -1,3 +1,30 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/PrecConjugateResidual.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
 #define GRID_PREC_CONJUGATE_RESIDUAL_H

@@ -1,3 +1,31 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_PREC_GCR_H
 #define GRID_PREC_GCR_H

@@ -19,6 +47,10 @@ namespace Grid {
    int mmax;
    int nstep;
    int steps;
+    GridStopWatch PrecTimer;
+    GridStopWatch MatTimer;
+    GridStopWatch LinalgTimer;
+
    LinearFunction<Field> &Preconditioner;

   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
@@ -40,14 +72,24 @@ namespace Grid {
      
      Field r(src._grid);

+        PrecTimer.Reset();
+         MatTimer.Reset();
+      LinalgTimer.Reset();
+
+      GridStopWatch SolverTimer;
+      SolverTimer.Start();
+
      steps=0;
      for(int k=0;k<MaxIterations;k++){

 	cp=GCRnStep(Linop,src,psi,rsq);

-	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+	std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;

 	if(cp<rsq) {
+
+	  SolverTimer.Stop();
+
 	  Linop.HermOp(psi,r);
 	  axpy(r,-1.0,src,r);
 	  RealD tr = norm2(r);
@@ -55,6 +97,11 @@ namespace Grid {
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "    <<sqrt(tr/ssq)
 	           << " target "           <<Tolerance <<std::endl;
+
+	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
+	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
+	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 	  return;
 	}

@@ -62,6 +109,7 @@ namespace Grid {
      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
      assert(0);
    }
+
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){

      RealD cp;
@@ -88,24 +136,25 @@ namespace Grid {
      // initial guess x0 is taken as nonzero.
      // r0=src-A x0 = src
      //////////////////////////////////
+      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
+      MatTimer.Stop();
      r=src-Az;
      
      /////////////////////
      // p = Prec(r)
      /////////////////////
+      PrecTimer.Start();
      Preconditioner(r,z);
+      PrecTimer.Stop();

-      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
-      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
-      
+      MatTimer.Start();
      Linop.HermOp(z,tmp); 
+      MatTimer.Stop();

-      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
      ttmp=tmp;
      tmp=tmp-r;

-      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
      /*
      std::cout<<GridLogMessage<<r<<std::endl;
      std::cout<<GridLogMessage<<z<<std::endl;
@@ -113,7 +162,9 @@ namespace Grid {
      std::cout<<GridLogMessage<<tmp<<std::endl;
      */

+      MatTimer.Start();
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
+      MatTimer.Stop();

      //p[0],q[0],qq[0] 
      p[0]= z;
@@ -137,18 +188,22 @@ namespace Grid {

 	cp = axpy_norm(r,-a,q[peri_k],r);  

-	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
 	}

+	std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 
+
+	PrecTimer.Start();
 	Preconditioner(r,z);// solve Az = r
+	PrecTimer.Stop();
+
+	MatTimer.Start();
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
-
-
 	Linop.HermOp(z,tmp);
+	MatTimer.Stop();
        tmp=tmp-r;
-	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 

 	q[peri_kp]=Az;
 	p[peri_kp]=z;
@@ -0,0 +1,473 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_SCHUR_RED_BLACK_H
+#define GRID_SCHUR_RED_BLACK_H
+
+
+  /*
+   * Red black Schur decomposition
+   *
+   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+   *                =         L                     D                     U
+   *
+   * L^-1 = (1              0 )
+   *        (-MoeMee^{-1}   1 )   
+   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   *
+   * U^-1 = (1   -Mee^{-1} Meo)
+   *        (0    1           )
+   * U^{dag} = ( 1                 0)
+   *           (Meo^dag Mee^{-dag} 1)
+   * U^{-dag} = (  1                 0)
+   *            (-Meo^dag Mee^{-dag} 1)
+   ***********************
+   *     M psi = eta
+   ***********************
+   *Odd
+   * i)                 D_oo psi_o =  L^{-1}  eta_o
+   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * Wilson:
+   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
+   * Stag:
+   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * L^-1 eta  = (1              0 ) (eta_e)
+   *             (-MoeMee^{-1}   1 ) (eta_o)
+   *
+   *Even
+   * ii)  Mee psi_e + Meo psi_o = src_e
+   *
+   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+   *
+   * 
+   * TODO: Other options:
+   * 
+   * a) change checkerboards for Schur e<->o
+   *
+   * Left precon by Moo^-1
+   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
+   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * Right precon by Moo^-1
+   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
+   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
+   *                              psi_o = M_oo^-1 phi_o
+   * TODO: Deflation 
+   */
+namespace Grid {
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Use base class to share code
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Take a matrix and form a Red Black solver calling a Herm solver
+  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackBase {
+  protected:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+    OperatorFunction<Field> & _HermitianRBSolver; // Hermitian solver, used by the RedBlackSolve overrides
+    int CBfactorise;                              // set to 0 on construction; not read in this class yet
+    bool subGuess;                                // true => subtract the initial guess from the solution
+  public:
+    // Keeps a reference to HermitianRBSolver, which must therefore outlive this object.
+    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
+    _HermitianRBSolver(HermitianRBSolver) 
+    { 
+      CBfactorise = 0;
+      subtractGuess(initSubGuess);
+    };
+    // Polymorphic base (pure virtuals below): make deletion through a base pointer well defined.
+    virtual ~SchurRedBlackBase() = default;
+    void subtractGuess(const bool initSubGuess)
+    {
+      subGuess = initSubGuess;
+    }
+    bool isSubtractGuess(void) const
+    {
+      return subGuess;
+    }
+
+    /////////////////////////////////////////////////////////////
+    // Shared code
+    /////////////////////////////////////////////////////////////
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) 
+    {
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+
+    template<class Guesser>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      int nblock = in.size();           assert(out.size() >= in.size()); // out[b] is written for every b
+
+      std::vector<Field> src_o(nblock,grid);
+      std::vector<Field> sol_o(nblock,grid);
+      std::vector<Field> guess_save;    // only populated when the guess has to be subtracted
+
+      Field resid(fgrid);
+      Field tmp(grid);
+
+      ////////////////////////////////////////////////
+      // Prepare RedBlack source
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++){
+        RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
+      }
+      ////////////////////////////////////////////////
+      // Make the guesses
+      ////////////////////////////////////////////////
+      if ( subGuess ) guess_save.resize(nblock,grid);
+
+      for(int b=0;b<nblock;b++){
+        guess(src_o[b],sol_o[b]); 
+
+        if ( subGuess ) { 
+          guess_save[b] = sol_o[b];
+        }
+      }
+      //////////////////////////////////////////////////////////////
+      // Call the block solver
+      //////////////////////////////////////////////////////////////
+      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // A2A boolean behavioural control & reconstruct other checkerboard
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++) {
+
+        if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
+
+        ///////// Needs even source //////////////
+        pickCheckerboard(Even,tmp,in[b]);
+        RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
+
+        /////////////////////////////////////////////////
+        // Check unprec residual if possible
+        /////////////////////////////////////////////////
+        if ( ! subGuess ) {
+          _Matrix.M(out[b],resid); 
+          resid = resid-in[b];
+          RealD ns = norm2(in[b]);
+          RealD nr = norm2(resid);
+
+          std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+        } else {
+          std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
+        }
+
+      }
+    }
+    template<class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
+
+      // FIXME CGdiagonalMee not implemented virtual function
+      // FIXME use CBfactorise to control schur decomp
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field resid(fgrid);
+      Field src_o(grid);
+      Field src_e(grid);
+      Field sol_o(grid);
+
+      ////////////////////////////////////////////////
+      // RedBlack source
+      ////////////////////////////////////////////////
+      RedBlackSource(_Matrix,in,src_e,src_o);
+
+      ////////////////////////////////
+      // Construct the guess
+      ////////////////////////////////
+      guess(src_o,sol_o);
+
+      Field  guess_save(grid);
+      if ( subGuess ) guess_save = sol_o; // the copy is only needed when it is subtracted below
+
+      //////////////////////////////////////////////////////////////
+      // Call the red-black solver
+      //////////////////////////////////////////////////////////////
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // Fionn A2A boolean behavioural control
+      ////////////////////////////////////////////////
+      if (subGuess)      sol_o= sol_o-guess_save;
+
+      ///////////////////////////////////////////////////
+      // RedBlack solution needs the even source
+      ///////////////////////////////////////////////////
+      RedBlackSolution(_Matrix,sol_o,src_e,out);
+
+      // Verify the unprec residual
+      if ( ! subGuess ) {
+        _Matrix.M(out,resid); 
+        resid = resid-in;
+        RealD ns = norm2(in);
+        RealD nr = norm2(resid);
+
+        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
+      } else {
+        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
+      }
+    }     
+    
+    /////////////////////////////////////////////////////////////
+    // Decomposition-specific steps, overridden in derived classes (the templated operator() cannot be virtual)
+    /////////////////////////////////////////////////////////////
+    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
+
+  };
+
+  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) 
+      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) 
+    {
+    }
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    // Splits src into src_e / src_o, then overwrites src_o with Mooee (src_o - Moe MeeInv src_e).
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+
+      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
+    }
+    // Rebuilds the even checkerboard from the odd solution and the even source, then writes
+    // both checkerboards into sol.
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+      Field   src_e(grid);
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      // src_e_c is const, so the difference is formed directly in the local src_e (no prior copy).
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
+      src_e = src_e_c-tmp;             assert(  src_e.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal has Mooee on it.
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {};
+
+    // Every step below uses SchurDiagMooeeOperator as the preconditioned operator.
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      // Temporaries live on the red-black grid.
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+
+      // get the right MpcDag
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      // src_e is const: the corrected even source is built in src_e_i.
+      Field   tmp(grid);
+      Field  sol_e(grid);
+      Field  src_e_i(grid);
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.checkerboard   ==Even);
+      src_e_i = src_e-tmp;               assert(  src_e_i.checkerboard ==Even);
+      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.checkerboard ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, right preconditioned by Mee^inv
+  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta
+  //=> psi = MeeInv phi
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      // Operator whose MpcDag is applied to the prepared odd source below.
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
+
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      // sol_o_i receives MooeeInv applied to the solver output sol_o.
+      Field   sol_o_i(grid);
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+      ////////////////////////////////////////////////
+      // MooeeInv due to precond
+      ////////////////////////////////////////////////
+      // Written straight into sol_o_i; the former copy through tmp is dropped.
+      _Matrix.MooeeInv(sol_o,sol_o_i);
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.checkerboard   ==Even);
+      tmp = src_e-tmp;               assert(  tmp.checkerboard   ==Even); // check the result, as the sibling solvers do
+      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.checkerboard ==Even);
+     
+      setCheckerboard(sol,sol_e);    assert(  sol_e.checkerboard ==Even);
+      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.checkerboard ==Odd );
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+}
+#endif
@@ -0,0 +1,125 @@
+#include <Grid/GridCore.h>
+#include <fcntl.h>
+
+namespace Grid {
+
+MemoryStats *MemoryProfiler::stats = nullptr;  // no statistics object attached initially
+bool         MemoryProfiler::debug = false;    // debug logging off initially
+// Index of the slot Insert() evicts next once every slot is occupied.
+int PointerCache::victim;
+// The cache slots; static storage, so all fields start zeroed (every slot begins with valid==0).
+PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+
+void *PointerCache::Insert(void *ptr,size_t bytes) {
+  // Blocks under 4096 bytes are not cached: the pointer goes straight back to the caller.
+  if (bytes < 4096 ) return ptr;
+
+#ifdef GRID_OMP
+  assert(omp_in_parallel()==0);
+#endif 
+
+  // Prefer the first slot that is not in use.
+  int slot = 0;
+  while ( (slot < Ncache) && (Entries[slot].valid != 0) ) {
+    slot++;
+  }
+
+  // Every slot occupied: take the current victim and advance the victim index cyclically.
+  if ( slot == Ncache ) {
+    slot   = victim;
+    victim = (victim+1)%Ncache;
+  }
+
+  PointerCacheEntry &entry = Entries[slot];
+
+  // Only the eviction path finds an occupied slot; its old pointer is returned to the caller.
+  void *evicted = NULL;
+  if ( entry.valid ) {
+    evicted = entry.address;
+  }
+
+  // Store the new block in the chosen slot.
+  entry.address = ptr;
+  entry.bytes   = bytes;
+  entry.valid   = 1;
+
+  // NULL when a free slot was used, otherwise the displaced pointer.
+  return evicted;
+}
+
+void *PointerCache::Lookup(size_t bytes) {
+  // Blocks under 4096 bytes are never stored by Insert(), so there is nothing to find.
+  if (bytes < 4096 ) return NULL;
+
+#ifdef GRID_OMP
+  assert(omp_in_parallel()==0);
+#endif 
+  // Only an exact size match is handed out; clearing valid removes the block from the cache.
+  for(int e=0;e<Ncache;e++){
+    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
+      Entries[e].valid = 0;
+      return Entries[e].address;
+    }
+  }
+  return NULL;
+}
+
+
+// Reads the /proc/self/pagemap entries for [Buf, Buf+BYTES) and prints, per rank, how many 4k pages
+// do not follow contiguously from the first page of their 512-page group. Empty body off Linux.
+void check_huge_pages(void *Buf,uint64_t BYTES)
+{
+#ifdef __linux__
+  int fd = open("/proc/self/pagemap", O_RDONLY);
+  assert(fd >= 0);
+  const int page_size = 4096;
+  uint64_t virt_pfn = (uint64_t)Buf / page_size;
+  off_t offset = sizeof(uint64_t) * virt_pfn;
+  uint64_t npages = (BYTES + page_size-1) / page_size;
+  std::vector<uint64_t> pagedata(npages); // heap storage: a stack VLA is non-standard and grows with BYTES
+  uint64_t ret = lseek(fd, offset, SEEK_SET);
+  assert(ret == offset);
+  ret = ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
+  assert(ret == sizeof(uint64_t) * npages);
+  ::close(fd);                            // release the descriptor; it used to leak on every call
+  int nhugepages = npages / 512;
+  int n4ktotal = 0, nnothuge = 0;
+  for (int i = 0; i < nhugepages; ++i) {
+    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
+    for (int j = 0; j < 512; ++j) {
+      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
+      ++n4ktotal;
+      if (pageaddr != baseaddr + j * page_size) ++nnothuge;
+    }
+  }
+  int rank = CartesianCommunicator::RankWorld();
+  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
+#endif
+}
+
+// Formats a byte count with a binary (1024-based) suffix, e.g. "1.5 KB"; whole values print without decimals.
+std::string sizeString(const size_t bytes)
+{
+  constexpr unsigned int bufSize = 256;
+  const char             *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
+  char                   buf[bufSize];
+  size_t                 s     = 0;
+  double                 count = bytes;
+  // Stop at index 6, the last suffix: a bound of 7 would let s index one past the end of suffixes[].
+  while (count >= 1024 && s < 6)
+  {
+      s++;
+      count /= 1024;
+  }
+  if (count - floor(count) == 0.0)
+  {
+      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
+  }
+  else
+  {
+      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
+  }
+  return std::string(buf);
+}
+
+}
@@ -0,0 +1,315 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/AlignedAllocator.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ALIGNED_ALLOCATOR_H
+#define GRID_ALIGNED_ALLOCATOR_H
+
+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#endif
+
+namespace Grid {
+
+  class PointerCache {
+  public:
+    // Offer a block to the cache; returns a pointer the caller should free, or NULL if there is none.
+    static void *Insert(void *ptr,size_t bytes) ;
+    // Remove and return a cached block of exactly `bytes` bytes; NULL when none is held.
+    static void *Lookup(size_t bytes) ;
+
+  private:
+    // One cache slot.
+    struct PointerCacheEntry {
+      void *address;
+      size_t bytes;
+      int valid;                 // non-zero while the slot holds a block
+    };
+
+    static const int Ncache=8;   // number of slots
+    static int victim;           // slot evicted next when all slots are in use
+
+    static PointerCacheEntry Entries[Ncache];
+
+  };
+  
+  std::string sizeString(size_t bytes);
+
+  struct MemoryStats                              // byte counters maintained by profilerAllocate / profilerFree
+  {
+    size_t totalAllocated{0}, maxAllocated{0},    // sum of allocated bytes; largest currentlyAllocated seen
+           currentlyAllocated{0}, totalFreed{0};  // allocated minus freed bytes; sum of freed bytes
+  };
+    
+  class MemoryProfiler                            // global switches read by the profiler macros below
+  {
+  public:
+    static MemoryStats *stats;                    // counters to update; the macros skip accounting while null
+    static bool        debug;                     // when true the macros also log to GridLogDebug
+  };
+
+  // "<bytes> (<human readable size>)"; the outer parentheses keep the expansion a single operand.
+  #define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
+  // Logs the counters in MemoryProfiler::stats to GridLogDebug; prints nothing while stats is null.
+  #define profilerDebugPrint \
+  do {\
+    if (MemoryProfiler::stats) {\
+      auto s = MemoryProfiler::stats;\
+      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) << std::endl;\
+      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) << std::endl;\
+    }\
+  } while (0)
+  // Accounts for an allocation. The do/while(0) wrapper makes the macro one statement (safe in if/else).
+  #define profilerAllocate(bytes)\
+  do {\
+    if (MemoryProfiler::stats) {\
+      auto s = MemoryProfiler::stats;\
+      s->totalAllocated     += (bytes);\
+      s->currentlyAllocated += (bytes);\
+      s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated);\
+    }\
+    if (MemoryProfiler::debug) {\
+      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
+      profilerDebugPrint;\
+    }\
+  } while (0)
+
+  // Accounts for a release of `bytes`; same single-statement wrapper as profilerAllocate.
+  #define profilerFree(bytes)\
+  do {\
+    if (MemoryProfiler::stats) {\
+      auto s = MemoryProfiler::stats;\
+      s->totalFreed         += (bytes);\
+      s->currentlyAllocated -= (bytes);\
+    }\
+    if (MemoryProfiler::debug) {\
+      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
+      profilerDebugPrint;\
+    }\
+  } while (0)
+
+  void check_huge_pages(void *Buf,uint64_t BYTES);
+
+////////////////////////////////////////////////////////////////////
+// A lattice of something, but assume the something is SIMDized.
+////////////////////////////////////////////////////////////////////
+
+template<typename _Tp>
+class alignedAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
+  alignedAllocator() noexcept { }
+  alignedAllocator(const alignedAllocator&) noexcept { }
+  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) noexcept { }
+  ~alignedAllocator() noexcept { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const noexcept { return size_t(-1) / sizeof(_Tp); }
+  // Storage for __n objects: a same-size block from PointerCache if one exists, else a fresh aligned block.
+  pointer allocate(size_type __n, const void* _p= 0)
+  { 
+    size_type bytes = __n*sizeof(_Tp);
+    profilerAllocate(bytes);
+    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
+    //    if ( ptr != NULL ) 
+    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
+
+    //////////////////
+    // Hack 2MB align; could make option probably doesn't need configurability
+    //////////////////
+//define GRID_ALLOC_ALIGN (128)
+#define GRID_ALLOC_ALIGN (2*1024*1024)
+#ifdef HAVE_MM_MALLOC_H
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
+#else
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
+#endif
+    if ( ptr == (_Tp *) NULL ) return ptr; // allocation failed: never touch pages through a null pointer
+    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
+    // First touch optimise in threaded loop
+    uint8_t *cp = (uint8_t *)ptr;
+#ifdef GRID_OMP
+#pragma omp parallel for
+#endif
+    for(size_type n=0;n<bytes;n+=4096){
+      cp[n]=0;
+    }
+    return ptr;
+  }
+  // Hands the block to PointerCache and frees whatever pointer the cache gives back (possibly __p itself).
+  void deallocate(pointer __p, size_type __n) { 
+    size_type bytes = __n * sizeof(_Tp);
+
+    profilerFree(bytes);
+
+    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
+
+#ifdef HAVE_MM_MALLOC_H
+    if ( __freeme ) _mm_free((void *)__freeme); 
+#else
+    if ( __freeme ) free((void *)__freeme);
+#endif
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// MPI3 : comms must use shm region
+// SHMEM: comms must use symmetric heap
+//////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_COMMS_SHMEM
+extern "C" { 
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void  shmem_free(void *);
+}
+#define PARANOID_SYMMETRIC_HEAP
+#endif
+
+template<typename _Tp>
+class commAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+  // Allocator boilerplate: rebind plus trivial constructors; the class has no data members.
+  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
+  commAllocator() throw() { }
+  commAllocator(const commAllocator&) throw() { }
+  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
+  ~commAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+  // Two builds: with GRID_COMMS_SHMEM memory comes from shmem_align, otherwise from an aligned host allocation.
+#ifdef GRID_COMMS_SHMEM
+  pointer allocate(size_type __n, const void* _p= 0)
+  {
+    size_type bytes = __n*sizeof(_Tp);
+    // NOTE(review): the CRAY and default shmem_align calls pass their two arguments in opposite order - confirm.
+    profilerAllocate(bytes);
+#ifdef CRAY
+    _Tp *ptr = (_Tp *) shmem_align(bytes,64);
+#else
+    _Tp *ptr = (_Tp *) shmem_align(64,bytes);
+#endif
+#ifdef PARANOID_SYMMETRIC_HEAP
+    static void * bcast;
+    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
+    // Presumably shares the root PE's pointer so each PE can compare it with its own - TODO confirm.
+    bcast = (void *) ptr;
+    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
+    // NOTE(review): the mismatch path prints pointers with %lx and leaves via exit(0), a success status - confirm.
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      //      BACKTRACEFILE();
+      exit(0);
+    }
+    assert( bcast == (void *) ptr);
+#endif 
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type __n) { 
+    size_type bytes = __n*sizeof(_Tp);
+    // Update the profiler counters, then release through shmem_free.
+    profilerFree(bytes);
+    shmem_free((void *)__p);
+  }
+#else
+  pointer allocate(size_type __n, const void* _p= 0) 
+  {
+    size_type bytes = __n*sizeof(_Tp);
+    // Fresh allocation on every call; PointerCache is not consulted in this class.
+    profilerAllocate(bytes);
+#ifdef HAVE_MM_MALLOC_H
+    _Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
+#else
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
+#endif
+    uint8_t *cp = (uint8_t *)ptr;
+    if ( ptr ) { 
+    // One touch per 4k page, static OMP loop to catch same loop order
+#ifdef GRID_OMP
+#pragma omp parallel for schedule(static)
+#endif
+      for(size_type n=0;n<bytes;n+=4096){
+	cp[n]=0;
+      }
+    }
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type __n) {
+    size_type bytes = __n*sizeof(_Tp);
+    // Update the profiler counters, then free with the routine matching the allocation call.
+    profilerFree(bytes);
+#ifdef HAVE_MM_MALLOC_H
+    _mm_free((void *)__p); 
+#else
+    free((void *)__p);
+#endif
+  }
+#endif
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Template typedefs
+////////////////////////////////////////////////////////////////////////////////
+template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           // std::vector backed by alignedAllocator
+template<class T> using commVector = std::vector<T,commAllocator<T> >;              // std::vector backed by commAllocator
+template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >; // outer vector uses the default allocator
+    
+}; // namespace Grid
+#endif
@@ -0,0 +1,35 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Cartesian.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CARTESIAN_H
+#define GRID_CARTESIAN_H
+
+#include <Grid/cartesian/Cartesian_base.h>
+#include <Grid/cartesian/Cartesian_full.h>
+#include <Grid/cartesian/Cartesian_red_black.h> 
+
+#endif
@@ -1,7 +1,35 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cartesian/Cartesian_base.h
+
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H

-#include <Grid.h>

 namespace Grid{

@@ -16,16 +44,25 @@ namespace Grid{
  class GridBase : public CartesianCommunicator , public GridThread {

 public:
-
+    int dummy;
    // Give Lattice access
    template<class object> friend class Lattice;

    GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {}; // forwards the processor grid to the communicator base
+    GridBase(const std::vector<int> & processor_grid,
+	     const CartesianCommunicator &parent, // existing communicator handed on to the base-class constructor
+	     int &split_rank) // forwarded by reference to CartesianCommunicator
+      : CartesianCommunicator(processor_grid,parent,split_rank) {};
+    GridBase(const std::vector<int> & processor_grid,
+	     const CartesianCommunicator &parent) // overload for callers that have no split_rank variable
+      : CartesianCommunicator(processor_grid,parent,dummy) {}; // NOTE(review): `dummy` is a member of this class, referenced here before members are initialised - confirm the base only writes to it
+
+    virtual ~GridBase() = default;


    // Physics Grid information.
    std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
-    std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
+    std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
    std::vector<int> _gdimensions;// Global dimensions of array after cb removal
    std::vector<int> _ldimensions;// local dimensions of array with processor images removed
    std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed 
@@ -35,13 +72,14 @@ public:
    int _isites;
    int _fsites;                  // _isites*_osites = product(dimensions).
    int _gsites;
-    std::vector<int> _slice_block;   // subslice information
+    std::vector<int> _slice_block;// subslice information
    std::vector<int> _slice_stride;
    std::vector<int> _slice_nblock;

-    // Might need these at some point
-    //    std::vector<int> _lstart;     // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
-    //    std::vector<int> _lend;       // local end of array in gcoors    _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
+    std::vector<int> _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
+    std::vector<int> _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1
+
+    bool _isCheckerBoarded; 

 public:

@@ -50,15 +88,12 @@ public:
    // GridCartesian / GridRedBlackCartesian
    ////////////////////////////////////////////////////////////////
    virtual int CheckerBoarded(int dim)=0;
-    virtual int CheckerBoard(std::vector<int> site)=0;
+    virtual int CheckerBoard(const std::vector<int> &site)=0;
    virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
-    int  CheckerBoardFromOindex (int Oindex){
-      std::vector<int> ocoor;
-      oCoorFromOindex(ocoor,Oindex); 
-      return CheckerBoard(ocoor);
-    }
+    virtual int CheckerBoardFromOindex (int Oindex)=0;
+    virtual int CheckerBoardFromOindexTable (int Oindex)=0;

    //////////////////////////////////////////////////////////////////////////////////////////////
    // Local layout calculations
@@ -75,10 +110,16 @@ public:
    virtual int oIndex(std::vector<int> &coor) // Outer index: sum over d of _ostride[d] * (coor[d] mod _rdimensions[d]).
    {
        int idx=0;
-	// Works with either global or local coordinates
+        // Works with either global or local coordinates
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
+    virtual int iIndex(std::vector<int> &lcoor) // Inner (SIMD lane) index: sum over d of _istride[d] * (lcoor[d] / _rdimensions[d]).
+    {
+        int idx=0;
+        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); // integer division selects the lane block in dimension d
+        return idx;
+    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@@ -87,47 +128,50 @@ public:
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
      return idx;
    }
-    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
-      int nd= dims.size();
-      coor.resize(nd);
-      for(int d=0;d<nd;d++){
-	coor[d] = index % dims[d];
-	index   = index / dims[d];
-      }
-    }
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
-      CoorFromIndex(coor,Oindex,_rdimensions);
+      Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
    }
-    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
-      int nd=dims.size();
-      int stride=1;
-      index=0;
-      for(int d=0;d<nd;d++){
-	index = index+stride*coor[d];
-	stride=stride*dims[d];
-      }
+
+    inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) { // Combine outer and inner coordinates into a local coordinate.
+      lcoor.resize(_ndimension); // output is resized here; ocoor and icoor are only read
+      for (int d = 0; d < _ndimension; d++)
+        lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; // inner coordinate counts whole reduced-dimension blocks
    }

    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
-    inline int iIndex(std::vector<int> &lcoor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-        return idx;
-    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
-      CoorFromIndex(coor,lane,_simd_layout);
+      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
    }
+
    inline int PermuteDim(int dimension){ // 1 if more than one SIMD lane is laid out along `dimension`, else 0
      return _simd_layout[dimension]>1;
    }
    inline int PermuteType(int dimension){
      int permute_type=0;
+      // Returns RotateBit when `dimension` holds more than 2 SIMD lanes, otherwise a count (see the loop at the end).
+      // FIXME:
+      //
+      // Best way to encode this would be to present a mask 
+      // for which simd dimensions are rotated, and the rotation
+      // size. If there is only one simd dimension rotated, this is just 
+      // a permute. 
+      //
+      // Cases: PermuteType == 1,2,4,8
+      // Distance should be either 0,1,2..
+      //
+      if ( _simd_layout[dimension] > 2 ) { 
+        for(int d=0;d<_ndimension;d++){
+          if ( d != dimension ) assert ( (_simd_layout[d]==1)  ); // the rotate case requires every other dimension to be unvectorised
+        }
+        permute_type = RotateBit; // How to specify distance; this is not just direction.
+        return permute_type;
+      }
+      // Otherwise count the dimensions above `dimension` that have more than one SIMD lane.
      for(int d=_ndimension-1;d>dimension;d--){
-	if (_simd_layout[d]>1 ) permute_type++;
+        if (_simd_layout[d]>1 ) permute_type++;
      }
      return permute_type;
    }
@@ -135,30 +179,55 @@ public:
    // Array sizing queries
    ////////////////////////////////////////////////////////////////

-    inline int iSites(void) { return _isites; };
-    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
-    inline int oSites(void) { return _osites; };
-    inline int lSites(void) { return _isites*_osites; }; 
-    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
-    inline int Nd    (void) { return _ndimension;};
+    inline int iSites(void) const { return _isites; };
+    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
+    inline int oSites(void) const { return _osites; };
+    inline int lSites(void) const { return _isites*_osites; }; 
+    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
+    inline int Nd    (void) const { return _ndimension;};

+    inline const std::vector<int> &LocalStarts(void)            { return _lstart;    }; // per-dimension start of the local array in global coordinates; returned by reference like the sibling accessors (no copy)
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};

+    ////////////////////////////////////////////////////////////////
+    // Utility to print the full decomposition details 
+    ////////////////////////////////////////////////////////////////
+
+    void show_decomposition(){ // Write the dimension, stride and site-count members to std::cout, one GridLogMessage line each.
+      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
+      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
+      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
+      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
+      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
+      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
+      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
+      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
+      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
+      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        // computed: _isites*_osites
+      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;        // computed: _isites*_osites*_Nprocessors
+      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
+    } 
+
    ////////////////////////////////////////////////////////////////
    // Global addressing
    ////////////////////////////////////////////////////////////////
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){ // Decode a global site index into coordinates over _gdimensions.
-      CoorFromIndex(gcoor,gidx,_gdimensions);
+      assert(gidx< gSites()); // upper bound only; a negative gidx is not rejected here
+      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
+    }
+    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ // Decode a local site index into coordinates over _ldimensions.
+      assert(lidx<lSites()); // upper bound only; a negative lidx is not rejected here
+      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
    }
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ // Encode gcoor as a single index over _gdimensions, dimension 0 varying fastest.
      gidx=0;
      int mult=1; // running product of _gdimensions[0..mu-1]
      for(int mu=0;mu<_ndimension;mu++) {
-	gidx+=mult*gcoor[mu];
-	mult*=_gdimensions[mu];
+        gidx+=mult*gcoor[mu];
+        mult*=_gdimensions[mu];
      }
    }
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
@@ -166,9 +235,9 @@ public:
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
-	int _fld  = _fdimensions[mu]/_processors[mu];
-	pcoor[mu] = gcoor[mu]/_fld;
-	lcoor[mu] = gcoor[mu]%_fld;
+        int _fld  = _fdimensions[mu]/_processors[mu];
+        pcoor[mu] = gcoor[mu]/_fld;
+        lcoor[mu] = gcoor[mu]%_fld;
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@@ -177,16 +246,16 @@ public:
      std::vector<int> lcoor;
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
      rank = RankFromProcessorCoor(pcoor);
-
+      /*
      std::vector<int> cblcoor(lcoor);
      for(int d=0;d<cblcoor.size();d++){
-	if( this->CheckerBoarded(d) ) {
-	  cblcoor[d] = lcoor[d]/2;
-	}
+        if( this->CheckerBoarded(d) ) {
+          cblcoor[d] = lcoor[d]/2;
+        }
      }
-
-      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
+      */
+      i_idx= iIndex(lcoor);
+      o_idx= oIndex(lcoor);
    }

    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
@@ -208,7 +277,7 @@ public:
    {
      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
      if(CheckerBoarded(0)){
-	fcoor[0] = fcoor[0]*2+cb;
+        fcoor[0] = fcoor[0]*2+cb;
      }
    }
    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
@@ -0,0 +1,174 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cartesian/Cartesian_full.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CARTESIAN_FULL_H
+#define GRID_CARTESIAN_FULL_H
+
+namespace Grid{
+    
+/////////////////////////////////////////////////////////////////////////////////////////
+// Grid Support.
+/////////////////////////////////////////////////////////////////////////////////////////
+
+
+class GridCartesian: public GridBase {
+
+public:
+    int dummy;
+    virtual int  CheckerBoardFromOindexTable (int Oindex) { // this grid class always reports checkerboard 0
+      return 0;
+    }
+    virtual int  CheckerBoardFromOindex (int Oindex) // always 0; Oindex is not inspected
+    {
+      return 0;
+    }
+    virtual int CheckerBoarded(int dim){ // no dimension is reported as checkerboarded
+      return 0;
+    }
+    virtual int CheckerBoard(const std::vector<int> &site){ // always 0; site is not inspected
+        return 0;
+    }
+    virtual int CheckerBoardDestination(int cb,int shift,int dim){ // always 0, regardless of cb, shift and dim
+        return 0;
+    }
+    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){ // shift is returned unchanged
+      return shift;
+    }
+    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ // shift is returned unchanged
+      return shift;
+    }
+    /////////////////////////////////////////////////////////////////////////
+    // Constructor takes a parent grid and possibly subdivides communicator.
+    /////////////////////////////////////////////////////////////////////////
+    GridCartesian(const std::vector<int> &dimensions,
+		  const std::vector<int> &simd_layout,
+		  const std::vector<int> &processor_grid,
+		  const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) // NOTE(review): `dummy` is this class's own member, referenced before it is initialised - confirm the base only writes to it
+    {
+      Init(dimensions,simd_layout,processor_grid); // layout tables are filled once the base class is constructed
+    }
+    GridCartesian(const std::vector<int> &dimensions,
+		  const std::vector<int> &simd_layout,
+		  const std::vector<int> &processor_grid,
+		  const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) // split_rank is forwarded by reference to GridBase
+    {
+      Init(dimensions,simd_layout,processor_grid); // layout tables are filled once the base class is constructed
+    }
+    /////////////////////////////////////////////////////////////////////////
+    // Construct from comm world
+    /////////////////////////////////////////////////////////////////////////
+    GridCartesian(const std::vector<int> &dimensions,
+		  const std::vector<int> &simd_layout,
+		  const std::vector<int> &processor_grid) : GridBase(processor_grid) // no parent grid: uses the single-argument GridBase constructor
+    {
+      Init(dimensions,simd_layout,processor_grid); // layout tables are filled once the base class is constructed
+    }
+
+    virtual ~GridCartesian() = default;
+
+    void Init(const std::vector<int> &dimensions, // Fill every layout member for a full (non-checkerboarded) grid.
+	      const std::vector<int> &simd_layout, // NOTE(review): its size is not checked against dimensions.size() here
+	      const std::vector<int> &processor_grid) // not read in this body; _processors and _processor_coor are used instead
+    {
+      ///////////////////////
+      // Grid information
+      ///////////////////////
+      _isCheckerBoarded = false;
+      _ndimension = dimensions.size();
+
+      _fdimensions.resize(_ndimension);
+      _gdimensions.resize(_ndimension);
+      _ldimensions.resize(_ndimension);
+      _rdimensions.resize(_ndimension);
+      _simd_layout.resize(_ndimension);
+      _lstart.resize(_ndimension);
+      _lend.resize(_ndimension);
+
+      _ostride.resize(_ndimension);
+      _istride.resize(_ndimension);
+
+      _fsites = _gsites = _osites = _isites = 1; // all four are accumulated as products in the loop below
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];   // Global dimensions
+        _gdimensions[d] = _fdimensions[d]; // identical to _fdimensions: nothing is removed on this grid type
+        _simd_layout[d] = simd_layout[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
+
+        // Use a reduced simd grid
+        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); // processor count must divide the global extent exactly
+
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); // simd layout must divide the local extent exactly
+
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; // inclusive end
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
+        }
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; // dimension 0 varies fastest in the outer index
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; // and likewise in the inner (lane) index
+        }
+      }
+
+      ///////////////////////
+      // subplane information
+      ///////////////////////
+      _slice_block.resize(_ndimension);
+      _slice_stride.resize(_ndimension);
+      _slice_nblock.resize(_ndimension);
+
+      int block = 1;  // product of _rdimensions over dimensions below d
+      int nblock = 1; // becomes the product of _rdimensions over dimensions above d
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
+      }
+    };
+
+};
+}
+#endif
@@ -0,0 +1,320 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cartesian/Cartesian_red_black.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CARTESIAN_RED_BLACK_H
+#define GRID_CARTESIAN_RED_BLACK_H
+
+
+namespace Grid {
+
+  static const int CbRed  =0;
+  static const int CbBlack=1;
+  static const int Even   =CbRed;
+  static const int Odd    =CbBlack;
+    
+// Specialise this for red black grids storing half the data like a chess board.
+class GridRedBlackCartesian : public GridBase
+{
+public:
+    std::vector<int> _checker_dim_mask;
+    int              _checker_dim;
+    std::vector<int> _checker_board;
+
+    virtual int CheckerBoarded(int dim){ // 1 only for the single dimension stored in _checker_dim, else 0
+      if( dim==_checker_dim) return 1;
+      else return 0;
+    }
+    virtual int CheckerBoard(const std::vector<int> &site){ // Parity (0 or 1) of the sum of site[d] over the dimensions enabled in _checker_dim_mask.
+      int linear=0;
+      assert(site.size()==_ndimension);
+      for(int d=0;d<_ndimension;d++){ 
+	if(_checker_dim_mask[d])
+	  linear=linear+site[d];
+      }
+      return (linear&0x1); // lowest bit of the masked coordinate sum
+    }
+
+
+    // Depending on the cb of site, we toggle source cb.
+    // For shifts along _checker_dim the shift is wrapped by _fdimensions[dim] and halved:
+    // rounded down when (source_cb+ocb) is odd, rounded up otherwise. Other dims return shift unchanged.
+    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
+      if(dim != _checker_dim) return shift;
+
+      int fulldim =_fdimensions[dim];
+      shift = (shift+fulldim)%fulldim; // wraps into [0,fulldim) when shift >= -fulldim; NOTE(review): stays negative for shift < -fulldim
+
+      // Probably faster with table lookup;
+      // or by looping over x,y,z and multiply rather than computing checkerboard.
+	  
+      if ( (source_cb+ocb)&1 ) {
+	return (shift)/2;
+      } else {
+	return (shift+1)/2;
+      }
+    }
+    virtual int  CheckerBoardFromOindexTable (int Oindex) { // Table lookup in _checker_board; Oindex is not bounds-checked.
+      return _checker_board[Oindex];
+    }
+    virtual int  CheckerBoardFromOindex (int Oindex) // Computes the checkerboard from the decoded outer coordinate (no table).
+    {
+      std::vector<int> ocoor; // a fresh vector is built on every call
+      oCoorFromOindex(ocoor,Oindex);
+      return CheckerBoard(ocoor);
+    }
+    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
+      // Per-site form of CheckerBoardShiftForCB: the checkerboard is derived from osite.
+      if(dim != _checker_dim) return shift;
+
+      int ocb=CheckerBoardFromOindex(osite);
+      
+      return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
+    }
+    
+    virtual int CheckerBoardDestination(int source_cb,int shift,int dim){ // Checkerboard reached from source_cb after `shift` along `dim`.
+      if ( _checker_dim_mask[dim]  ) {
+	// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
+	// does NOT cause a parity hop.
+	int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
+        if ( (shift+add) &0x1) {
+            return 1-source_cb; // odd total: flip to the other checkerboard
+        } else {
+            return source_cb;
+        }
+      } else {
+	return source_cb;
+        // dim is not in the mask, so a shift along it keeps the checkerboard
+      }
+    };
+
+    ////////////////////////////////////////////////////////////
+    // Create Redblack from original grid; require full grid pointer ?
+    ////////////////////////////////////////////////////////////
+    GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) // base must be non-null: it is dereferenced here
+    {
+      int dims = base->_ndimension;
+      std::vector<int> checker_dim_mask(dims,1); // default mask: every dimension is enabled
+      int checker_dim = 0;                       // default checkerboarded dimension is 0
+      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
+    };
+
+    ////////////////////////////////////////////////////////////
+    // Create redblack from original grid, with non-trivial checker dim mask
+    ////////////////////////////////////////////////////////////
+    GridRedBlackCartesian(const GridBase *base, // must be non-null: it is dereferenced in the initialiser below
+			  const std::vector<int> &checker_dim_mask, // passed to Init, which asserts it has one entry per dimension
+			  int checker_dim // passed to Init, which asserts its mask entry is 1
+			  ) :  GridBase(base->_processors,*base) 
+    {
+      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim)  ;
+    }
+
+    virtual ~GridRedBlackCartesian() = default;
+#if 0
+    ////////////////////////////////////////////////////////////
+    // Create redblack grid ;; deprecate these. Should not
+    // need direct creation of redblack without a full grid to base on
+    ////////////////////////////////////////////////////////////
+    GridRedBlackCartesian(const GridBase *base,
+			  const std::vector<int> &dimensions,
+			  const std::vector<int> &simd_layout,
+			  const std::vector<int> &processor_grid,
+			  const std::vector<int> &checker_dim_mask,
+			  int checker_dim
+			  ) :  GridBase(processor_grid,*base) 
+    {
+      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
+    }
+
+    ////////////////////////////////////////////////////////////
+    // Create redblack grid
+    ////////////////////////////////////////////////////////////
+    GridRedBlackCartesian(const GridBase *base,
+			  const std::vector<int> &dimensions,
+			  const std::vector<int> &simd_layout,
+			  const std::vector<int> &processor_grid) : GridBase(processor_grid,*base) 
+    {
+      std::vector<int> checker_dim_mask(dimensions.size(),1);
+      int checker_dim = 0;
+      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
+    }
+#endif
+
+    void Init(const std::vector<int> &dimensions,       // full lattice extents, one per dimension
+              const std::vector<int> &simd_layout,      // SIMD lanes per dimension
+              const std::vector<int> &processor_grid,   // only its size is checked; _processors is used for the layout
+              const std::vector<int> &checker_dim_mask, // non-zero entries take part in the parity sum
+              int checker_dim)                          // the dimension whose extent is halved
+    {
+      // Fill every layout member for a checkerboarded grid. Sizes and checker_dim are validated before any indexing.
+      _isCheckerBoarded = true;
+      _checker_dim = checker_dim;
+      _ndimension = dimensions.size();
+      assert(checker_dim_mask.size() == _ndimension);
+      assert(processor_grid.size() == _ndimension);
+      assert(simd_layout.size() == _ndimension);
+      assert(checker_dim >= 0 && checker_dim < _ndimension);
+      assert(checker_dim_mask[checker_dim] == 1);
+      _fdimensions.resize(_ndimension);
+      _gdimensions.resize(_ndimension);
+      _ldimensions.resize(_ndimension);
+      _rdimensions.resize(_ndimension);
+      _simd_layout.resize(_ndimension);
+      _lstart.resize(_ndimension);
+      _lend.resize(_ndimension);
+
+      _ostride.resize(_ndimension);
+      _istride.resize(_ndimension);
+
+      _fsites = _gsites = _osites = _isites = 1; // all four are accumulated as products in the loop below
+
+      _checker_dim_mask = checker_dim_mask;
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];
+        _gdimensions[d] = _fdimensions[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
+
+        if (d == _checker_dim)
+        {
+          assert((_gdimensions[d] & 0x1) == 0);  // the checkerboarded extent must be even
+          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
+          _gsites /= 2;
+        }
+        _ldimensions[d] = _gdimensions[d] / _processors[d];
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); // processor count must divide the extent exactly
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; // inclusive end
+
+        // Use a reduced simd grid
+        _simd_layout[d] = simd_layout[d];
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // exact divisibility is asserted on the next line
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+        assert(_rdimensions[d] > 0);
+
+        // all elements of a simd vector must have same checkerboard.
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        if (_simd_layout[d] > 1)
+        {
+          if (checker_dim_mask[d])
+          {
+            assert((_rdimensions[d] & 0x1) == 0);
+          }
+        }
+
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
+        }
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; // dimension 0 varies fastest in the outer index
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; // and likewise in the inner (lane) index
+        }
+      }
+
+      ////////////////////////////////////////////////////////////////////////////////////////////
+      // subplane information
+      ////////////////////////////////////////////////////////////////////////////////////////////
+      _slice_block.resize(_ndimension);
+      _slice_stride.resize(_ndimension);
+      _slice_nblock.resize(_ndimension);
+
+      int block = 1;  // product of _rdimensions over dimensions below d
+      int nblock = 1; // becomes the product of _rdimensions over dimensions above d
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
+      }
+
+      ////////////////////////////////////////////////
+      // Create a checkerboard lookup table
+      ////////////////////////////////////////////////
+      int rvol = 1; // product of _rdimensions, i.e. the same value accumulated in _osites above
+      for (int d = 0; d < _ndimension; d++)
+      {
+        rvol = rvol * _rdimensions[d];
+      }
+      _checker_board.resize(rvol);
+      for (int osite = 0; osite < _osites; osite++)
+      {
+        _checker_board[osite] = CheckerBoardFromOindex(osite); // backs CheckerBoardFromOindexTable
+      }
+    };
+
+  protected:
+    virtual int oIndex(std::vector<int> &coor) // Outer index; the coordinate along _checker_dim is halved before the modulo.
+    {
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); // two neighbouring full-lattice sites map to one stored site
+        }
+        else
+        {
+          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
+        }
+      }
+      return idx;
+    };
+
+    virtual int iIndex(std::vector<int> &lcoor) // Inner (SIMD lane) index; along _checker_dim the divisor is doubled.
+    {
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); // same as halving lcoor[d] first for non-negative values
+        }
+        else
+        {
+          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
+        }
+      }
+      return idx;
+    }
+};
+}
+#endif
@@ -0,0 +1,34 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Communicator.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_COMMUNICATOR_H
+#define GRID_COMMUNICATOR_H
+
+#include <Grid/communicator/SharedMemory.h>
+#include <Grid/communicator/Communicator_base.h>
+
+#endif
@@ -0,0 +1,76 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_none.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/mman.h>
+
+namespace Grid {
+
+///////////////////////////////////////////////////////////////
+// Info that is setup once and indept of cartesian layout
+///////////////////////////////////////////////////////////////
+// Storage for the static policy selector; starts as CommunicatorPolicyConcurrent.
+CartesianCommunicator::CommunicatorPolicy_t  
+CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+// Storage for the static communication-thread count; starts at -1.
+int CartesianCommunicator::nCommThreads = -1;
+
+/////////////////////////////////
+// Grid information queries
+/////////////////////////////////
+// Number of dimensions of the processor grid.
+int CartesianCommunicator::Dimensions(void)
+{
+  return _ndimension;
+}
+// Non-zero when this rank is rank 0.
+int CartesianCommunicator::IsBoss(void)
+{
+  return _processor==0;
+}
+// The boss rank is always 0.
+int CartesianCommunicator::BossRank(void)
+{
+  return 0;
+}
+// Linear rank of this process.
+int CartesianCommunicator::ThisRank(void)
+{
+  return _processor;
+}
+// Coordinate of this process in the processor grid.
+const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
+{
+  return _processor_coor;
+}
+// Extent of the processor grid in each dimension.
+const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
+{
+  return _processors;
+}
+// Total number of processes in the grid.
+int CartesianCommunicator::ProcessorCount(void)
+{
+  return _Nprocessors;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Complex reductions, forwarded to the real-valued GlobalSumVector overloads
+////////////////////////////////////////////////////////////////////////////////
+
+void CartesianCommunicator::GlobalSum(ComplexF &c)
+{
+  // One single-precision complex is reduced as two consecutive floats.
+  float *as_reals = reinterpret_cast<float *>(&c);
+  GlobalSumVector(as_reals,2);
+}
+void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
+{
+  // N single-precision complex values are reduced as 2*N consecutive floats.
+  float *as_reals = reinterpret_cast<float *>(c);
+  GlobalSumVector(as_reals,2*N);
+}
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  // One double-precision complex is reduced as two consecutive doubles.
+  double *as_reals = reinterpret_cast<double *>(&c);
+  GlobalSumVector(as_reals,2);
+}
+void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
+{
+  // N double-precision complex values are reduced as 2*N consecutive doubles.
+  double *as_reals = reinterpret_cast<double *>(c);
+  GlobalSumVector(as_reals,2*N);
+}
+  
+}
+
@@ -0,0 +1,207 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_base.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_COMMUNICATOR_BASE_H
+#define GRID_COMMUNICATOR_BASE_H
+
+///////////////////////////////////
+// Processor layout information
+///////////////////////////////////
+#include <Grid/communicator/SharedMemory.h>
+
+namespace Grid {
+
+/// Processor-grid communicator layered on SharedMemory.
+///
+/// Holds the Cartesian processor layout and declares the reduction,
+/// halo-exchange, barrier, broadcast and all-to-all entry points.  The
+/// non-template members are defined in the backend-specific
+/// Communicator_*.cc sources.
+class CartesianCommunicator : public SharedMemory {
+
+public:
+
+  ////////////////////////////////////////////
+  // Policies
+  ////////////////////////////////////////////
+  // Concurrent: transfers are posted and completed later by the caller.
+  // Sequential: transfers are completed before the posting call returns.
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
+  static CommunicatorPolicy_t CommunicatorPolicy;
+  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
+  static int       nCommThreads;
+
+  ////////////////////////////////////////////
+  // Communicator should know nothing of the physics grid, only processor grid.
+  ////////////////////////////////////////////
+  int              _Nprocessors;     // How many in all
+  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
+  int              _processor;       // linear processor rank
+  std::vector<int> _processor_coor;  // coordinate of this rank in the processor grid
+  unsigned long    _ndimension;      // number of processor-grid dimensions
+  static Grid_MPI_Comm      communicator_world;
+  Grid_MPI_Comm             communicator;
+  std::vector<Grid_MPI_Comm> communicator_halo;
+
+  ////////////////////////////////////////////////
+  // Must call in Grid startup
+  ////////////////////////////////////////////////
+  static void Init(int *argc, char ***argv);
+
+  ////////////////////////////////////////////////
+  // Constructors to sub-divide a parent communicator
+  // and default to comm world
+  ////////////////////////////////////////////////
+  // srank is an output: the index of the sub-communicator this rank landed in.
+  CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
+  CartesianCommunicator(const std::vector<int> &pdimensions_in);
+  virtual ~CartesianCommunicator();
+
+ private:
+
+  ////////////////////////////////////////////////
+  // Private initialise from an MPI communicator
+  // Can use after an MPI_Comm_split, but hidden from user so private
+  ////////////////////////////////////////////////
+  void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
+
+ public:
+
+
+  ////////////////////////////////////////////////////////////////////////////////////////
+  // Wraps MPI_Cart routines, or implements equivalent on other impls
+  ////////////////////////////////////////////////////////////////////////////////////////
+  void ShiftedRanks(int dim,int shift,int & source, int & dest);
+  int  RankFromProcessorCoor(std::vector<int> &coor);
+  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
+
+  int                      Dimensions(void)        ;
+  int                      IsBoss(void)            ;
+  int                      BossRank(void)          ;
+  int                      ThisRank(void)          ;
+  const std::vector<int> & ThisProcessorCoor(void) ;
+  const std::vector<int> & ProcessorGrid(void)     ;
+  int                      ProcessorCount(void)    ;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // very VERY rarely (Log, serial RNG) we need world without a grid
+  ////////////////////////////////////////////////////////////////////////////////
+  static int  RankWorld(void) ;
+  static void BroadcastWorld(int root,void* data, int bytes);
+
+  ////////////////////////////////////////////////////////////
+  // Reduction
+  ////////////////////////////////////////////////////////////
+  void GlobalSum(RealF &);
+  void GlobalSumVector(RealF *,int N);
+  void GlobalSum(RealD &);
+  void GlobalSumVector(RealD *,int N);
+  void GlobalSum(uint32_t &);
+  void GlobalSum(uint64_t &);
+  void GlobalSum(ComplexF &c);
+  void GlobalSumVector(ComplexF *c,int N);
+  void GlobalSum(ComplexD &c);
+  void GlobalSumVector(ComplexD *c,int N);
+  void GlobalXOR(uint32_t &);
+  void GlobalXOR(uint64_t &);
+
+  // Generic reduction: views the object as sizeof(obj)/sizeof(scalar_type)
+  // consecutive scalars and hands them to the matching GlobalSumVector.
+  template<class obj> void GlobalSum(obj &o){
+    typedef typename obj::scalar_type scalar_type;
+    int words = sizeof(obj)/sizeof(scalar_type);
+    scalar_type * ptr = (scalar_type *)& o;
+    GlobalSumVector(ptr,words);
+  }
+
+  ////////////////////////////////////////////////////////////
+  // Face exchange, buffer swap in translational invariant way
+  ////////////////////////////////////////////////////////////
+  void SendToRecvFrom(void *xmit,
+		      int xmit_to_rank,
+		      void *recv,
+		      int recv_from_rank,
+		      int bytes);
+
+  // NOTE: the MPI implementation treats the third argument as the rank that
+  // sends and the fourth as the rank that receives, despite the names here.
+  void SendRecvPacket(void *xmit,
+		      void *recv,
+		      int xmit_to_rank,
+		      int recv_from_rank,
+		      int bytes);
+
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			   void *xmit,
+			   int xmit_to_rank,
+			   void *recv,
+			   int recv_from_rank,
+			   int bytes);
+
+  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+
+  // The Stencil* variants return a byte count as a double.
+  double StencilSendToRecvFrom(void *xmit,
+			       int xmit_to_rank,
+			       void *recv,
+			       int recv_from_rank,
+			       int bytes,int dir);
+
+  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+				    void *xmit,
+				    int xmit_to_rank,
+				    void *recv,
+				    int recv_from_rank,
+				    int bytes,int dir);
+
+
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
+  void StencilBarrier(void);
+
+  ////////////////////////////////////////////////////////////
+  // Barrier
+  ////////////////////////////////////////////////////////////
+  void Barrier(void);
+
+  ////////////////////////////////////////////////////////////
+  // Broadcast a buffer and composite larger
+  ////////////////////////////////////////////////////////////
+  void Broadcast(int root,void* data, int bytes);
+
+  ////////////////////////////////////////////////////////////
+  // All2All down one dimension
+  ////////////////////////////////////////////////////////////
+  // in and out must have equal size, and that size must be a multiple of the
+  // processor count along dim; each rank contributes in.size()/numnode
+  // elements of sizeof(T) bytes per peer.
+  template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
+    assert(dim>=0);
+    assert(dim<_ndimension);
+    assert(in.size()==out.size());
+    int numnode = _processors[dim];
+    uint64_t bytes=sizeof(T);
+    uint64_t words=in.size()/numnode;
+    assert(numnode * words == in.size());
+    assert(words < (1ULL<<31));
+    // data() instead of &v[0]: indexing element 0 of an empty vector is
+    // undefined, and empty vectors pass every check above.
+    AllToAll(dim,(void *)in.data(),(void *)out.data(),words,bytes);
+  }
+  void AllToAll(int dim  ,void *in,void *out,uint64_t words,uint64_t bytes);
+  void AllToAll(void  *in,void *out,uint64_t words         ,uint64_t bytes);
+
+  // Broadcast any object as a raw block of sizeof(data) bytes.
+  template<class obj> void Broadcast(int root,obj &data)
+    {
+      Broadcast(root,(void *)&data,sizeof(data));
+    }
+
+}; 
+}
+
+#endif
@@ -0,0 +1,508 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_mpi.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/communicator/SharedMemory.h>
+
+namespace Grid {
+
+// Storage for the static world communicator; assigned in Init() below.
+Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+
+////////////////////////////////////////////
+// First initialise of comms system
+////////////////////////////////////////////
+// Bring up MPI if nobody else has, validate the thread-support level,
+// duplicate MPI_COMM_WORLD into communicator_world and set up the global
+// shared-memory region.
+void CartesianCommunicator::Init(int *argc, char ***argv) 
+{
+
+  int flag     = 0;
+  int provided = MPI_THREAD_SINGLE;
+
+  MPI_Initialized(&flag); // needed to coexist with other libs apparently
+  if ( !flag ) {
+    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
+  } else {
+    // Someone else initialised MPI: ask which thread level they obtained so
+    // the requirement below is enforced in this case too.
+    MPI_Query_thread(&provided);
+  }
+
+  //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
+  if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
+      (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
+    assert(0);
+
+  // Never clean up as done once.
+  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
+
+  GlobalSharedMemory::Init(communicator_world);
+  GlobalSharedMemory::SharedMemoryAllocate(
+		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
+		   GlobalSharedMemory::Hugepages);
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Use cartesian communicators now even in MPI3
+///////////////////////////////////////////////////////////////////////////
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{
+  // MPI_Cart_shift fills in the source and destination ranks for a
+  // displacement of `shift` along `dim` on the Cartesian communicator.
+  const int status = MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+  assert(status==0);
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{
+  // Translate a processor-grid coordinate into its linear rank.
+  int linear;
+  const int status = MPI_Cart_rank(communicator, &coor[0], &linear);
+  assert(status==0);
+  return linear;
+}
+void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+  // Size the output to the grid dimensionality, then let MPI fill it in.
+  coor.resize(_ndimension);
+  const int status = MPI_Cart_coords(communicator, rank, _ndimension, &coor[0]);
+  assert(status==0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Initialises from communicator_world
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Build a communicator for the processor grid `processors`, starting from the
+// communicator produced by GlobalSharedMemory::OptimalCommunicator.
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
+{
+  MPI_Comm optimal_comm;
+  ////////////////////////////////////////////////////
+  // Remap using the shared memory optimising routine
+  // The remap creates a comm which must be freed
+  ////////////////////////////////////////////////////
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
+  // Creates `communicator` and the halo communicators from optimal_comm
+  InitFromMPICommunicator(processors,optimal_comm);
+  // Hand the same communicator to the SharedMemory base
+  SetCommunicator(optimal_comm);
+  ///////////////////////////////////////////////////
+  // Free the temp communicator
+  ///////////////////////////////////////////////////
+  MPI_Comm_free(&optimal_comm);
+}
+
+//////////////////////////////////
+// Try to subdivide communicator
+//////////////////////////////////
+// Build a communicator of shape `processors` by subdividing `parent`.
+//   processors : processor grid of each child; may have more dimensions than
+//                the parent, in which case the parent is padded at the front.
+//   parent     : communicator being split.
+//   srank      : output, index of the child communicator this rank belongs to
+//                (0 when there is only one child).
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
+{
+  _ndimension = processors.size();
+
+  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  // Parent layout padded to _ndimension: extra leading dimensions get
+  // extent 1 and coordinate 0.
+  std::vector<int> parent_processor_coor(_ndimension,0);
+  std::vector<int> parent_processors    (_ndimension,1);
+
+  // Can make 5d grid from 4d etc...
+  int pad = _ndimension-parent_ndimension;
+  for(int d=0;d<parent_ndimension;d++){
+    parent_processor_coor[pad+d]=parent._processor_coor[d];
+    parent_processors    [pad+d]=parent._processors[d];
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // split the communicator
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  int Nparent = parent._processors ; 
+  int Nparent;
+  MPI_Comm_size(parent.communicator,&Nparent);
+
+  // Ranks per child, and the number of children; must divide exactly.
+  int childsize=1;
+  for(int d=0;d<processors.size();d++) {
+    childsize *= processors[d];
+  }
+  int Nchild = Nparent/childsize;
+  assert (childsize * Nchild == Nparent);
+
+  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
+  std::vector<int> scoor(_ndimension); // coor of split within parent
+  std::vector<int> ssize(_ndimension); // number of splits along each dimension
+
+  for(int d=0;d<_ndimension;d++){
+    ccoor[d] = parent_processor_coor[d] % processors[d];
+    scoor[d] = parent_processor_coor[d] / processors[d];
+    ssize[d] = parent_processors[d]     / processors[d];
+  }
+
+  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
+  int crank;  
+  // Mpi uses the reverse Lexico convention to us; so reversed routines called
+  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
+  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
+
+  MPI_Comm comm_split;
+  if ( Nchild > 1 ) { 
+
+    // Disabled diagnostics.
+    // NOTE(review): if re-enabled, the last line streams comm_split before
+    // MPI_Comm_split below has assigned it.
+    if(0){
+      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
+      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
+      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
+      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
+      std::cout<<std::endl;
+      
+      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
+      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
+      std::cout<<std::endl;
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////
+      // Declare victory
+      //////////////////////////////////////////////////////////////////////////////////////////////////////
+      std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
+		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
+      std::cout << " Split communicator " <<comm_split <<std::endl;
+    }
+
+    ////////////////////////////////////////////////////////////////
+    // Split the communicator
+    ////////////////////////////////////////////////////////////////
+    // colour = srank (which child), key = crank (ordering within the child)
+    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
+    assert(ierr==0);
+
+  } else {
+    // Single child: no split needed, work on a duplicate of the parent.
+    srank = 0;
+    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
+    assert(ierr==0);
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Set up from the new split communicator
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  InitFromMPICommunicator(processors,comm_split);
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Take the right SHM buffers
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  SetCommunicator(comm_split);
+  
+  ///////////////////////////////////////////////
+  // Free the temp communicator 
+  ///////////////////////////////////////////////
+  MPI_Comm_free(&comm_split);
+
+  if(0){ 
+    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
+    for(int d=0;d<processors.size();d++){
+      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
+    }
+  }
+  // The coordinate MPI assigned must match the one computed above.
+  for(int d=0;d<processors.size();d++){
+    assert(_processor_coor[d] == ccoor[d] );
+  }
+}
+
+void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
+{
+  // Builds `communicator` (Cartesian, periodic in every dimension) on top of
+  // communicator_base, records this rank's id and coordinate, and makes one
+  // duplicate communicator per direction (2 per dimension) for halo traffic.
+
+  // Record the requested layout
+  _processors = processors;
+  _ndimension = processors.size();
+  _processor_coor.resize(_ndimension);
+
+  // Total rank count is the product of the per-dimension extents
+  _Nprocessors = 1;
+  for(int extent : _processors){
+    _Nprocessors *= extent;
+  }
+
+  // Periodic flag set for every dimension; reorder argument is 0
+  std::vector<int> wrap(_ndimension,1);
+  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&wrap[0],0,&communicator);
+  MPI_Comm_rank(communicator,&_processor);
+  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
+
+  // Disabled diagnostics
+  if ( 0 && (communicator_base != communicator_world) ) {
+    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
+    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
+    for(int d=0;d<_processors.size();d++){
+      std::cout << _processor_coor[d]<<" ";
+    }
+    std::cout << std::endl;
+  }
+
+  int comm_size;
+  MPI_Comm_size(communicator,&comm_size);
+
+  // One duplicate communicator per direction
+  communicator_halo.resize (2*_ndimension);
+  for(auto &halo : communicator_halo){
+    MPI_Comm_dup(communicator,&halo);
+  }
+
+  // The communicator must hold exactly the number of ranks requested
+  assert(comm_size==_Nprocessors);
+}
+
+CartesianCommunicator::~CartesianCommunicator()
+{
+  int finalised;
+  MPI_Finalized(&finalised);
+
+  // Nothing to release without a communicator, and nothing may be released
+  // once MPI has been finalised.
+  if (!communicator || finalised) return;
+
+  MPI_Comm_free(&communicator);
+  for(auto &halo : communicator_halo){
+    MPI_Comm_free(&halo);
+  }
+}
+void CartesianCommunicator::GlobalSum(uint32_t &u){
+  // In-place sum of one 32-bit unsigned value over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+  // In-place sum of one 64-bit unsigned value over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+  // In-place bitwise XOR of one 32-bit unsigned value over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  // In-place bitwise XOR of one 64-bit unsigned value over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalSum(float &f){
+  // In-place sum of one float over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{
+  // In-place element-wise sum of N floats over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  // In-place sum of one double over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
+{
+  // In-place element-wise sum of N doubles over `communicator`.
+  const int status = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(status==0);
+}
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+					   int dest,
+					   void *recv,
+					   int from,
+					   int bytes)
+{
+  // Blocking exchange: post the transfer, then wait on whatever requests
+  // the posting call queued.
+  std::vector<CommsRequest_t> pending;
+  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
+  SendToRecvFromComplete(pending);
+}
+// One-way blocking transfer of `bytes` bytes from rank `sender` to rank
+// `receiver`.  Only the sender touches `xmit` and only the receiver touches
+// `recv`; any other rank returns without doing anything.  The sender's rank
+// is used as the message tag.
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+					   void *recv,
+					   int sender,
+					   int receiver,
+					   int bytes)
+{
+  assert(sender != receiver);
+  const int tag = sender;
+  if ( _processor == sender ) {
+    // Check the return code, as the other MPI wrappers in this file do
+    int ierr = MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
+    assert(ierr==0);
+  }
+  if ( _processor == receiver ) { 
+    // The status was never inspected, so do not ask MPI for it
+    int ierr = MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,MPI_STATUS_IGNORE);
+    assert(ierr==0);
+  }
+}
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  // Messages are tagged with the sending rank: our own rank on the way out,
+  // `from` on the way in.
+  if ( CommunicatorPolicy != CommunicatorPolicyConcurrent ) {
+    // Give the CPU to MPI immediately; can use threads to overlap optionally
+    const int status = MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,_processor,
+				    recv,bytes,MPI_CHAR,from,from,
+				    communicator,MPI_STATUS_IGNORE);
+    assert(status==0);
+    return;
+  }
+
+  // Concurrent policy: post both halves non-blocking and queue the requests
+  // (send first, then receive) for the caller to complete.
+  MPI_Request send_req;
+  MPI_Request recv_req;
+
+  int status = MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&recv_req);
+  status    |= MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&send_req);
+  assert(status==0);
+
+  list.push_back(send_req);
+  list.push_back(recv_req);
+}
+
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+						     int dest,
+						     void *recv,
+						     int from,
+						     int bytes,int dir)
+{
+  // Blocking form: post the stencil transfer, complete it, and report the
+  // byte count returned by the posting call.
+  std::vector<CommsRequest_t> requests;
+  const double posted = StencilSendToRecvFromBegin(requests,xmit,dest,recv,from,bytes,dir);
+  StencilSendToRecvFromComplete(requests,dir);
+  return posted;
+}
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int dest,
+							 void *recv,
+							 int from,
+							 int bytes,int dir)
+{
+  // Pick the halo communicator for this direction (wrapping dir into range)
+  const int halo_count = communicator_halo.size();
+  const int halo_index = dir%halo_count;
+
+  // Look up each peer in ShmRanks; MPI_UNDEFINED marks a peer that needs an
+  // MPI message
+  const int shm_dest = ShmRanks[dest];
+  const int shm_from = ShmRanks[from];
+  const int shm_self = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(shm_self == ShmRank);
+
+  double mpi_bytes = 0.0;
+
+  if ( shm_from == MPI_UNDEFINED ) {
+    MPI_Request recv_req;
+    const int status = MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[halo_index],&recv_req);
+    assert(status==0);
+    list.push_back(recv_req);
+    mpi_bytes += bytes;
+  }
+
+  if ( shm_dest == MPI_UNDEFINED ) {
+    MPI_Request send_req;
+    const int status = MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[halo_index],&send_req);
+    assert(status==0);
+    list.push_back(send_req);
+    mpi_bytes += bytes;
+  }
+
+  // Under the sequential policy nothing is left outstanding on return
+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+    this->StencilSendToRecvFromComplete(list,dir);
+  }
+
+  return mpi_bytes;
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+  // `dir` is not used here; completion is the same as for the plain exchange.
+  this->SendToRecvFromComplete(waitall);
+}
+void CartesianCommunicator::StencilBarrier(void)
+{
+  // Barrier over ShmComm rather than the full Cartesian communicator.
+  MPI_Barrier(ShmComm);
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  // Wait for every queued request, then empty the queue.
+  if (list.empty()) return;
+
+  const int count = list.size();
+  std::vector<MPI_Status> statuses(count);
+  const int err = MPI_Waitall(count,&list[0],&statuses[0]);
+  assert(err==0);
+  list.clear();
+}
+void CartesianCommunicator::Barrier(void)
+{
+  // Barrier over the Cartesian communicator.
+  const int status = MPI_Barrier(communicator);
+  assert(status==0);
+}
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+  // Broadcast `bytes` raw bytes from `root` over the Cartesian communicator.
+  const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
+  assert(status==0);
+}
+int CartesianCommunicator::RankWorld(void){
+  // Rank of this process within communicator_world.
+  int world_rank;
+  MPI_Comm_rank(communicator_world,&world_rank);
+  return world_rank;
+}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+{
+  // Broadcast `bytes` raw bytes from `root` over communicator_world.
+  const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
+  assert(status==0);
+}
+
+void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  // Child grid with extent 1 everywhere except `dim`, where it keeps the
+  // full processor count.
+  std::vector<int> line(_ndimension,1);
+  assert(dim>=0 && dim<_ndimension);
+  line[dim] = _processors[dim];
+
+  // Split this communicator into such lines and run the exchange on ours.
+  int split_rank;
+  CartesianCommunicator line_comm(line,*this,split_rank);
+  line_comm.AllToAll(in,out,words,bytes);
+}
+// All-to-all over `communicator`: every rank sends `words` elements of
+// `bytes` bytes each to every rank, and receives the same amount from each.
+void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  // MPI is a pain and uses "int" arguments
+  // 64*64*64*128*16 == 500Million elements of data.
+  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
+  // (Turns up on 32^3 x 64 Gparity too)
+  //
+  // Range-check BEFORE narrowing.  Comparing the narrowed int back against
+  // the uint64_t converts the int to unsigned, so a value that wrapped to a
+  // negative int could compare equal and slip through.
+  assert(words < (1ULL<<31)); // safe to cast to int ?
+  assert(bytes < (1ULL<<31)); // safe to cast to int ?
+  const int iwords = static_cast<int>(words);
+  const int ibytes = static_cast<int>(bytes);
+
+  // Wrap one element in a contiguous byte datatype so the count passed to
+  // MPI is in elements rather than bytes.
+  MPI_Datatype object;
+  int ierr;
+  ierr = MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
+  assert(ierr==0);
+  ierr = MPI_Type_commit(&object);
+  assert(ierr==0);
+  ierr = MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
+  assert(ierr==0);
+  ierr = MPI_Type_free(&object);
+  assert(ierr==0);
+}
+
+
+
+}
+
@@ -0,0 +1,165 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_none.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+
+namespace Grid {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Info that is set up once and is independent of the Cartesian layout
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Out-of-class definition (storage) of the static world-communicator handle.
+Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+
+// Start-up hook: initialise the global shared-memory bookkeeping on
+// communicator_world, then allocate the shared region using the configured
+// size and huge-page setting. argc / arv are not used by this implementation.
+void CartesianCommunicator::Init(int *argc, char *** arv)
+{
+  GlobalSharedMemory::Init(communicator_world);
+
+  const uint64_t shm_bytes = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
+  const int      shm_flags = GlobalSharedMemory::Hugepages;
+  GlobalSharedMemory::SharedMemoryAllocate(shm_bytes,shm_flags);
+}
+
+// "Split" constructor. `parent` is not consulted here: construction is
+// delegated to the plain constructor and the reported sub-rank is always 0.
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
+                                             const CartesianCommunicator &parent,
+                                             int &srank)
+  : CartesianCommunicator(processors)
+{
+  SetCommunicator(communicator_world);
+  srank = 0;
+}
+
+// Build the communicator for a processor grid whose every extent is 1.
+// This process is rank 0 at the origin of that grid.
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
+{
+  _ndimension  = processors.size();
+  _processors  = processors;
+  _Nprocessors = 1;
+  _processor   = 0;
+
+  // Require 1^N processor grid for fake
+  _processor_coor.resize(_ndimension);
+  for(int mu=0;mu<_ndimension;mu++) {
+    assert(_processors[mu]==1);
+    _processor_coor[mu] = 0;
+  }
+
+  SetCommunicator(communicator_world);
+}
+
+// Destructor with an empty body: this implementation performs no explicit clean-up.
+CartesianCommunicator::~CartesianCommunicator(){}
+
+// Global reductions. Every overload here has an empty body, so the caller's
+// value (or array) is left exactly as it was passed in.
+
+// -- sums of a single value
+void CartesianCommunicator::GlobalSum(float &)    { }
+void CartesianCommunicator::GlobalSum(double &)   { }
+void CartesianCommunicator::GlobalSum(uint32_t &) { }
+void CartesianCommunicator::GlobalSum(uint64_t &) { }
+
+// -- sums over an array of N values
+void CartesianCommunicator::GlobalSumVector(float *,int N)  { }
+void CartesianCommunicator::GlobalSumVector(double *,int N) { }
+
+// -- bitwise XOR reductions
+void CartesianCommunicator::GlobalXOR(uint32_t &) { }
+void CartesianCommunicator::GlobalXOR(uint64_t &) { }
+
+// Point-to-point transfers: each entry point below consists solely of
+// assert(0), i.e. reaching any of them is treated as a programming error.
+
+void CartesianCommunicator::SendRecvPacket(void *xmit,void *recv,
+                                           int xmit_to_rank,int recv_from_rank,
+                                           int bytes)
+{
+  assert(0);
+}
+
+// Basic Halo comms primitive -- should never call in single node
+void CartesianCommunicator::SendToRecvFrom(void *xmit,int dest,
+                                           void *recv,int from,
+                                           int bytes)
+{
+  assert(0);
+}
+
+// Non-blocking form of the halo primitive; `list` is left untouched.
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,int dest,
+                                                void *recv,int from,
+                                                int bytes)
+{
+  assert(0);
+}
+
+// Completion of the non-blocking form; `list` is left untouched.
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  assert(0);
+}
+// All-to-all along one dimension: copies words*bytes bytes from `in` to
+// `out`. `dim` is not used.
+void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  const uint64_t total = words*bytes;
+  bcopy(in,out,total);
+}
+// All-to-all over the whole communicator: copies words*bytes bytes from
+// `in` to `out`.
+void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
+{
+  const uint64_t total = words*bytes;
+  bcopy(in,out,total);
+}
+
+// Rank and layout queries: every rank reported is 0, collectives are empty,
+// and the only coordinate ever reported is this object's own _processor_coor.
+int CartesianCommunicator::RankWorld(void)
+{
+  return 0;
+}
+void CartesianCommunicator::Barrier(void)
+{
+}
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+{
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{
+  return 0;
+}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+  coor = _processor_coor;
+}
+// Source and destination of any shift are both reported as rank 0.
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{
+  dest   = 0;
+  source = 0;
+}
+
+// Blocking stencil exchange: runs SendToRecvFromBegin followed by
+// SendToRecvFromComplete on a private request list and returns 2*bytes.
+// `dir` is discarded.
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+                                                     int xmit_to_rank,
+                                                     void *recv,
+                                                     int recv_from_rank,
+                                                     int bytes, int dir)
+{
+  std::vector<CommsRequest_t> requests;
+  SendToRecvFromBegin(requests,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(requests);
+
+  const double traffic = 2.0*bytes;
+  return traffic;
+}
+// Non-blocking stencil exchange: forwards to SendToRecvFromBegin using the
+// caller's request list and returns 2*bytes. `dir` is discarded.
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                         void *xmit,
+                                                         int xmit_to_rank,
+                                                         void *recv,
+                                                         int recv_from_rank,
+                                                         int bytes, int dir)
+{
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+
+  const double traffic = 2.0*bytes;
+  return traffic;
+}
+// Completion of a stencil exchange: forwards the request list to
+// SendToRecvFromComplete. `dir` is discarded.
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+  SendToRecvFromComplete(waitall);
+}
+
+// Stencil barrier: empty body.
+void CartesianCommunicator::StencilBarrier(void)
+{
+}
+
+
+}
+
@@ -0,0 +1,92 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid { 
+
+// static data
+
+// Tunables: size of the shared region (1 GiB by default) and huge-page switch.
+uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL;
+int                 GlobalSharedMemory::Hugepages           = 0;
+
+// Set-up / allocation state. Written out as explicit zeros; objects with
+// static storage duration would be zero-initialised regardless.
+int                 GlobalSharedMemory::_ShmSetup           = 0;
+int                 GlobalSharedMemory::_ShmAlloc           = 0;
+uint64_t            GlobalSharedMemory::_ShmAllocBytes      = 0;
+
+// Shared buffers, indexed by rank within the shared-memory communicator.
+std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
+
+// Shared-memory (per node) communicator and this process's place in it.
+Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
+int                 GlobalSharedMemory::WorldShmRank        = 0;
+int                 GlobalSharedMemory::WorldShmSize        = 0;
+std::vector<int>    GlobalSharedMemory::WorldShmRanks;
+
+// World communicator and this process's place in it.
+Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
+int                 GlobalSharedMemory::WorldSize           = 0;
+int                 GlobalSharedMemory::WorldRank           = 0;
+
+// Number of nodes and the index of the node this process lives on.
+int                 GlobalSharedMemory::WorldNodes          = 0;
+int                 GlobalSharedMemory::WorldNode           = 0;
+
+// Unmap every buffer in WorldShmCommBufs (each _ShmAllocBytes long) and mark
+// the region as no longer allocated. Asserts that an allocation is in place.
+void GlobalSharedMemory::SharedMemoryFree(void)
+{
+  assert(_ShmAlloc);
+  assert(_ShmAllocBytes>0);
+
+  int r = 0;
+  while (r<WorldShmSize) {
+    munmap(WorldShmCommBufs[r],_ShmAllocBytes);
+    ++r;
+  }
+
+  _ShmAllocBytes = 0;
+  _ShmAlloc      = 0;
+}
+/////////////////////////////////
+// Alloc, free shmem region
+/////////////////////////////////
+// Hand out `bytes` bytes from the shared heap by advancing heap_top.
+// Returns the address heap_top had on entry. If the running total reaches or
+// exceeds heap_size, a diagnostic is printed and an assertion fires.
+// (No alignment rounding is applied to `bytes`.)
+void *SharedMemory::ShmBufferMalloc(size_t bytes){
+  void *block = (void *)heap_top;
+
+  heap_bytes += bytes;
+  heap_top   += bytes;
+
+  const bool within_heap = (heap_bytes < heap_size);
+  if (!within_heap) {
+    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    assert(heap_bytes<heap_size);
+  }
+  return block;
+}
+// Release everything handed out by ShmBufferMalloc: the byte count goes back
+// to zero and heap_top back to the start of this rank's own buffer.
+void SharedMemory::ShmBufferFreeAll(void)
+{
+  void *base = ShmBufferSelf();
+  heap_bytes = 0;
+  heap_top   = (size_t)base;
+}
+// Buffer belonging to this rank: the ShmCommBufs entry at index ShmRank.
+void *SharedMemory::ShmBufferSelf(void)
+{
+  void *mine = ShmCommBufs[ShmRank];
+  return mine;
+}
+
+
+
+}
@@ -0,0 +1,165 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+// TODO
+// 1) move includes into SharedMemory.cc
+//
+// 2) split shared memory into a) optimal communicator creation from comm world
+// 
+//                             b) shared memory buffers container
+//                                -- static globally shared; init once
+//                                -- per instance set of buffers.
+//                                   
+
+#pragma once 
+
+#include <Grid/GridCore.h>
+
+#if defined (GRID_COMMS_MPI3) 
+#include <mpi.h>
+#endif 
+#include <semaphore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <zlib.h>
+#ifdef HAVE_NUMAIF_H
+#include <numaif.h>
+#endif
+
+namespace Grid {
+
+// Communicator and request handle types: the MPI types when built with
+// GRID_COMMS_MPI3, plain ints otherwise.
+#if defined (GRID_COMMS_MPI3)
+  using Grid_MPI_Comm  = MPI_Comm;
+  using CommsRequest_t = MPI_Request;
+#else
+  using CommsRequest_t = int;
+  using Grid_MPI_Comm  = int;
+#endif
+
+// Process-wide (all-static) record of the world communicator, its split into
+// shared-memory groups, and the shared buffers allocated for those groups.
+class GlobalSharedMemory {
+ private:
+  // Upper bound on the exponent searched when a size is tested for being a power of two.
+  static const int     MAXLOG2RANKSPERNODE = 16;            
+
+  // Init once lock on the buffer allocation
+  static int      _ShmSetup;      // non-zero once Init() has completed
+  static int      _ShmAlloc;      // non-zero while the shared buffers are allocated
+  static uint64_t _ShmAllocBytes; // size in bytes of each allocated buffer
+
+ public:
+  // Read-only access to the state flags above.
+  static int      ShmSetup(void)      { return _ShmSetup; }
+  static int      ShmAlloc(void)      { return _ShmAlloc; }
+  static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
+  static uint64_t      MAX_MPI_SHM_BYTES; // configured size of the shared region, in bytes
+  static int           Hugepages;         // non-zero to request huge pages when allocating
+
+  // One buffer pointer per rank of the shared-memory communicator.
+  static std::vector<void *> WorldShmCommBufs;
+
+  // World communicator with this process's rank and the total rank count.
+  static Grid_MPI_Comm WorldComm;
+  static int           WorldRank;
+  static int           WorldSize;
+
+  // Shared-memory communicator with this process's rank and size within it.
+  static Grid_MPI_Comm WorldShmComm;
+  static int           WorldShmRank;
+  static int           WorldShmSize;
+
+  // Number of shared-memory groups (nodes) and the index of this process's group.
+  static int           WorldNodes;
+  static int           WorldNode;
+
+  // Indexed by world rank: that rank's number within this process's
+  // shared-memory group, as filled in by Init().
+  static std::vector<int>  WorldShmRanks;
+
+  //////////////////////////////////////////////////////////////////////////////////////
+  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
+  //////////////////////////////////////////////////////////////////////////////////////
+  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
+  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  ///////////////////////////////////////////////////
+  // Provide shared memory facilities off comm world
+  ///////////////////////////////////////////////////
+  // bytes: size of each buffer; flags: non-zero requests huge pages where the implementation supports it.
+  static void SharedMemoryAllocate(uint64_t bytes, int flags);
+  static void SharedMemoryFree(void);
+
+};
+
+//////////////////////////////
+// one per communicator
+//////////////////////////////
+// Per-communicator view of the shared-memory buffers, plus the bookkeeping
+// for a simple sequential allocator (ShmBufferMalloc / ShmBufferFreeAll)
+// over this rank's own buffer.
+class SharedMemory 
+{
+ private:
+  static const int     MAXLOG2RANKSPERNODE = 16;
+
+  // Allocator state. Given explicit initial values so that a freshly
+  // default-constructed object never holds indeterminate values.
+  size_t heap_top   = 0; // address the next ShmBufferMalloc will return
+  size_t heap_bytes = 0; // bytes handed out since the last ShmBufferFreeAll
+  size_t heap_size  = 0; // limit that ShmBufferMalloc checks heap_bytes against
+
+ protected:
+
+  Grid_MPI_Comm    ShmComm; // for barriers
+  int    ShmRank = 0;       // index of this rank's buffer within ShmCommBufs
+  int    ShmSize = 0;
+  std::vector<void *> ShmCommBufs;
+  std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
+
+ public:
+  SharedMemory() {}
+  ~SharedMemory();
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // set the buffers & sizes
+  ///////////////////////////////////////////////////////////////////////////////////////
+  void SetCommunicator(Grid_MPI_Comm comm);
+
+  ////////////////////////////////////////////////////////////////////////
+  // For this instance ; disjoint buffer sets between splits if split grid
+  ////////////////////////////////////////////////////////////////////////
+  void ShmBarrier(void); 
+
+  ///////////////////////////////////////////////////
+  // Call on any instance
+  ///////////////////////////////////////////////////
+  void SharedMemoryTest(void);
+  void *ShmBufferSelf(void);
+  void *ShmBuffer    (int rank);
+  void *ShmBufferTranslate(int rank,void * local_p);
+  void *ShmBufferMalloc(size_t bytes);
+  void  ShmBufferFreeAll(void) ;
+  
+  //////////////////////////////////////////////////////////////////////////
+  // Make info on Nodes & ranks and Shared memory available
+  //////////////////////////////////////////////////////////////////////////
+  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;}
+  int RankCount(void) { return GlobalSharedMemory::WorldSize;}
+
+};
+
+}
@@ -0,0 +1,651 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+#include <pwd.h>
+
+namespace Grid { 
+
+/*Construct from an MPI communicator*/
+// One-time set-up from an MPI communicator (asserts it has not run before).
+// Records the world communicator, splits it into groups of ranks that can
+// share memory, and works out how many such groups (nodes) exist and which
+// one this rank belongs to. Sets _ShmSetup on success.
+void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
+{
+  assert(_ShmSetup==0);
+  WorldComm = comm;
+  MPI_Comm_rank(WorldComm,&WorldRank);
+  MPI_Comm_size(WorldComm,&WorldSize);
+  // WorldComm, WorldSize, WorldRank
+
+  /////////////////////////////////////////////////////////////////////
+  // Split into groups that can share memory
+  /////////////////////////////////////////////////////////////////////
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
+  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
+  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
+  // WorldShmComm, WorldShmSize, WorldShmRank
+
+  // WorldNodes
+  WorldNodes = WorldSize/WorldShmSize;
+  assert( (WorldNodes * WorldShmSize) == WorldSize );
+
+  // FIXME: Check all WorldShmSize are the same ?
+
+  /////////////////////////////////////////////////////////////////////
+  // find world ranks in our SHM group (i.e. which ranks are on our node)
+  /////////////////////////////////////////////////////////////////////
+  MPI_Group WorldGroup, ShmGroup;
+  MPI_Comm_group (WorldComm, &WorldGroup); 
+  MPI_Comm_group (WorldShmComm, &ShmGroup);
+
+  std::vector<int> world_ranks(WorldSize);
+  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
+
+  WorldShmRanks.resize(WorldSize); 
+  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]); 
+
+  // The group handles are only needed for the translation above; release them.
+  MPI_Group_free(&WorldGroup);
+  MPI_Group_free(&ShmGroup);
+
+  ///////////////////////////////////////////////////////////////////
+  // Identify who is in my group and nominate the leader
+  ///////////////////////////////////////////////////////////////////
+  int members=0;
+  std::vector<int> MyGroup;
+  MyGroup.resize(WorldShmSize);
+  for(int rank=0;rank<WorldSize;rank++){
+    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
+      assert(members<WorldShmSize);
+      MyGroup[members++] = rank;
+    }
+  }
+  
+  // The leader of a group is its lowest world rank.
+  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
+  int myleader = MyGroup[0];
+  
+  std::vector<int> leaders_1hot(WorldSize,0);
+  std::vector<int> leaders_group(WorldNodes,0);
+  leaders_1hot [ myleader ] = 1;
+    
+  ///////////////////////////////////////////////////////////////////
+  // global sum leaders over comm world
+  ///////////////////////////////////////////////////////////////////
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
+  assert(ierr==0);
+
+  ///////////////////////////////////////////////////////////////////
+  // find the group leaders world rank
+  ///////////////////////////////////////////////////////////////////
+  int group=0;
+  for(int l=0;l<WorldSize;l++){
+    if(leaders_1hot[l]){
+      // WorldNodes is derived from this rank's own group size; if groups
+      // differ in size there can be more leaders than slots, so guard the write.
+      assert(group<WorldNodes);
+      leaders_group[group++] = l;
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////
+  // Identify the node of the group in which I (and my leader) live
+  ///////////////////////////////////////////////////////////////////
+  WorldNode=-1;
+  for(int n=0;n<WorldNodes;n++){
+    if (myleader == leaders_group[n]){
+      WorldNode=n;
+    }
+  }
+  assert(WorldNode!=-1);
+  _ShmSetup=1;
+}
+// Gray encode support 
+// Reflected-binary Gray code of `binary`: the value XORed with itself shifted right by one bit.
+int BinaryToGray (int  binary) {
+  return binary ^ (binary>>1);
+}
+// Exponent i in 0..MAXLOG2 (inclusive) with 2^i == TwoToPower,
+// or -1 when no exponent in that range matches.
+int Log2Size(int TwoToPower,int MAXLOG2)
+{
+  int exponent = 0;
+  while (exponent<=MAXLOG2) {
+    if ( (0x1<<exponent) == TwoToPower ) return exponent;
+    ++exponent;
+  }
+  return -1;
+}
+// Build `optimal_comm`, a reordering of WorldComm in which each rank's new
+// rank is the lexicographic index of its coordinate in the `processors` grid.
+// The coordinate combines a node coordinate and a within-node coordinate:
+// each node is given a sub-block (ShmDims) of the grid, so that ranks sharing
+// memory are neighbours in the grid.
+//
+//   processors   : extents of the requested processor grid; their product
+//                  must equal WorldSize
+//   optimal_comm : receives the new communicator (from MPI_Comm_split)
+//
+// With HYPERCUBE defined, the node coordinate is taken from the hostname
+// instead of from WorldNode.
+void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
+  assert(log2size != -1);
+
+  ////////////////////////////////////////////////////////////////
+  // Check processor counts match.
+  // Done before the sub-block search below: that search only terminates
+  // when the grid has enough factors of two to absorb the ranks of a node,
+  // so a mismatched grid must be rejected here rather than spin forever.
+  ////////////////////////////////////////////////////////////////
+  int ndimension = processors.size();
+  int Nprocessors=1;
+  for(int i=0;i<ndimension;i++){
+    Nprocessors*=processors[i];
+  }
+  assert(WorldSize==Nprocessors);
+
+  ////////////////////////////////////////////////////////////////
+  // Identify subblock of ranks on node spreading across dims
+  // in a maximally symmetrical way
+  ////////////////////////////////////////////////////////////////
+  std::vector<int> WorldDims = processors;
+  std::vector<int> ShmDims  (ndimension,1);
+  std::vector<int> NodeDims (ndimension);
+  std::vector<int> ShmCoor  (ndimension);
+  std::vector<int> NodeCoor (ndimension);
+  std::vector<int> WorldCoor(ndimension);
+  int dim = 0;
+  for(int l2=0;l2<log2size;l2++){
+    // skip dimensions that cannot be halved any further
+    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
+    ShmDims[dim]*=2;
+    dim=(dim+1)%ndimension;
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Establish torus of processes and nodes with sub-blockings
+  ////////////////////////////////////////////////////////////////
+  for(int d=0;d<ndimension;d++){
+    NodeDims[d] = WorldDims[d]/ShmDims[d];
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Node coordinate: lexicographic from the node index by default
+  ////////////////////////////////////////////////////////////////
+  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
+
+#ifdef HYPERCUBE
+  ////////////////////////////////////////////////////////////////
+  // Identify the hypercube coordinate of this node using hostname
+  ////////////////////////////////////////////////////////////////
+  // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits
+  // i runs 0..7                                    3 bits
+  // r runs 0..3                                    2 bits
+  // 2^10 = 1024 nodes
+  int R;
+  int I;
+  int N;
+  const int namelen = _POSIX_HOST_NAME_MAX;
+  char name[namelen];
+
+  // Parse ICE-XA hostname to get hypercube location
+  gethostname(name,namelen);
+  name[namelen-1] = '\0'; // a truncated hostname need not be terminated
+  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
+  assert(nscan==3);
+
+  int nlo = N%9;
+  int nhi = N/9;
+  uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
+  uint32_t rootcoor  = hypercoor;
+
+  //////////////////////////////////////////////////////////////////
+  // Print debug info
+  //////////////////////////////////////////////////////////////////
+  std::string hname(name);
+  std::cout << "hostname "<<hname<<std::endl;
+  std::cout << "R " << R << " I " << I << " N "<< N
+            << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
+
+  //////////////////////////////////////////////////////////////////
+  // broadcast node 0's base coordinate for this partition.
+  //////////////////////////////////////////////////////////////////
+  MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
+  hypercoor=hypercoor-rootcoor;
+  // hypercoor is unsigned: a coordinate below the root's wraps to a huge
+  // value and is caught by this same bound.
+  assert(hypercoor<(uint32_t)WorldSize);
+
+  ////////////////////////////////////////////////////////////////
+  // Map Hcube according to physical lattice 
+  // must partition. Loop over dims and find out who would join.
+  ////////////////////////////////////////////////////////////////
+  int hcoor = hypercoor;
+  for(int d=0;d<ndimension;d++){
+     int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
+     assert(bits != -1); // -1 must not be used as a shift count below
+     int msk  = (0x1<<bits)-1;
+     NodeCoor[d]=BinaryToGray(hcoor & msk); // Space filling curve magic
+     hcoor = hcoor >> bits;
+  } 
+#endif
+
+  ////////////////////////////////////////////////////////////////
+  // Establish mapping between lexico physics coord and WorldRank
+  ////////////////////////////////////////////////////////////////
+  int rank;
+
+  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
+  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
+  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
+
+  /////////////////////////////////////////////////////////////////
+  // Build the new communicator
+  /////////////////////////////////////////////////////////////////
+  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
+  assert(ierr==0);
+}
+////////////////////////////////////////////////////////////////////////////////////////////
+// SHMGET
+////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMGET
+// Allocate one System V shared-memory segment of `bytes` bytes per rank of
+// WorldShmComm and attach all of them in every rank.
+// Rank 0 of WorldShmComm creates the segments and broadcasts their ids.
+// Huge pages are requested according to the static `Hugepages` setting;
+// the `flags` argument is not consulted by this implementation.
+// Exits the process if a segment cannot be created or attached.
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  std::vector<int> shmids(WorldShmSize);
+
+  if ( WorldShmRank == 0 ) {
+    for(int r=0;r<WorldShmSize;r++){
+      size_t size = bytes;
+      key_t key   = IPC_PRIVATE;
+      // Named shm_flags so that it does not shadow the `flags` parameter.
+      int shm_flags = IPC_CREAT | SHM_R | SHM_W;
+#ifdef SHM_HUGETLB
+      if (Hugepages) shm_flags|=SHM_HUGETLB;
+#endif
+      if ((shmids[r]= shmget(key,size, shm_flags)) ==-1) {
+        int errsv = errno;
+        printf("Errno %d\n",errsv);
+        printf("key   %d\n",(int)key);
+        printf("size  %zu\n",size); // size_t needs %zu
+        printf("flags %d\n",shm_flags);
+        errno = errsv;              // the printf calls may have changed errno
+        perror("shmget");
+        exit(1);
+      }
+    }
+  }
+  MPI_Barrier(WorldShmComm);
+  MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
+  MPI_Barrier(WorldShmComm);
+
+  for(int r=0;r<WorldShmSize;r++){
+    void *ptr = shmat(shmids[r], NULL,0);
+    if (ptr == (void *)-1) {
+      perror("Shared memory attach failure");
+      shmctl(shmids[r], IPC_RMID, NULL);
+      exit(2);
+    }
+    WorldShmCommBufs[r] = ptr;
+  }
+  MPI_Barrier(WorldShmComm);
+  ///////////////////////////////////
+  // Mark for clean up
+  ///////////////////////////////////
+  for(int r=0;r<WorldShmSize;r++){
+    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
+  }
+  MPI_Barrier(WorldShmComm);
+
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+}
+#endif
+ 
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hugetlbfs mapping intended
+////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMMMAP
+// Map one file of `bytes` bytes per rank of WorldShmComm from GRID_SHM_PATH
+// into this process; each file name encodes the node index and the rank.
+//   bytes : length of each mapping
+//   flags : non-zero adds MAP_HUGETLB where that flag exists
+// Exits with a failure status if a file cannot be opened; asserts if the
+// mapping fails or is not 64-byte aligned.
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Hugetlbfs and others map filesystems as mappable huge pages
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  char shm_name [NAME_MAX];
+  for(int r=0;r<WorldShmSize;r++){
+    
+    // Bounded formatting: GRID_SHM_PATH is a build-time path of unknown
+    // length, so never write past the end of shm_name.
+    int len = snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
+    assert(len>0 && len<(int)sizeof(shm_name));
+    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
+    if ( fd == -1) { 
+      perror("open hugetlbfs");
+      printf("open %s failed\n",shm_name);
+      exit(1); // this is a failure: do not report success to the launcher
+    }
+    int mmap_flag = MAP_SHARED ;
+#ifdef MAP_POPULATE    
+    mmap_flag|=MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+    if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
+    if ( ptr == (void *)MAP_FAILED ) {    
+      perror("failed mmap");
+      printf("mmap %s failed\n",shm_name);
+      assert(0);    
+    }
+    assert(((uint64_t)ptr&0x3F)==0);
+    close(fd); // the mapping stays valid after the descriptor is closed
+    WorldShmCommBufs[r] =ptr;
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+}
+#endif // MMAP
+
+#ifdef GRID_MPI3_SHM_NONE
+// Allocate the shared region as an anonymous mapping of `bytes` bytes.
+// Only valid when the shared-memory group holds a single rank (asserted),
+// since an anonymous mapping has no name other processes could open.
+//   bytes : length of the mapping
+//   flags : non-zero adds MAP_HUGETLB where that flag exists
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  
+  assert(WorldShmSize == 1);
+  for(int r=0;r<WorldShmSize;r++){
+    
+    int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
+#ifdef MAP_POPULATE    
+    mmap_flag|=MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+    if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+    // No backing file: the descriptor argument is -1 and nothing needs closing.
+    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,-1, 0); 
+    if ( ptr == (void *)MAP_FAILED ) {    
+      perror("failed mmap");
+      // There is no file name to report for an anonymous mapping.
+      printf("mmap anonymous failed\n");
+      assert(0);    
+    }
+    assert(((uint64_t)ptr&0x3F)==0);
+    WorldShmCommBufs[r] =ptr;
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+}
+#endif // MMAP
+
+#ifdef GRID_MPI3_SHMOPEN
+////////////////////////////////////////////////////////////////////////////////////////////
+// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
+// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
+// the posix shm virtual file system
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{ 
+  // Creates (on shm rank 0) and attaches (on every rank) one POSIX shared memory
+  // segment of 'bytes' bytes per rank of the shared memory group.
+  //   bytes : size of each segment
+  //   flags : non-zero adds MAP_HUGETLB to the creating rank's mapping where defined
+  // Every failure is reported with perror/printf followed by assert(0).
+  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0); 
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+
+  const size_t size = bytes;
+  char shm_name [NAME_MAX];
+
+  // Writes "/Grid_<user>_mpi3_shm_<node>_<r>" into shm_name.
+  // Shared by the creating and the attaching ranks so both always agree on the name.
+  auto set_shm_name = [&shm_name](int r) {
+    struct passwd *pw = getpwuid (getuid());
+    if ( pw == NULL ) { // no passwd entry for this uid: there is no user name to embed
+      printf("getpwuid failed for uid %d\n",(int)getuid());
+      assert(0);
+    }
+    int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
+    assert( (len > 0) && ((size_t)len < sizeof(shm_name)) ); // a truncated name could collide
+  };
+
+  ////////////////////////////////////////////////////////////////////
+  // Rank 0 of the group creates, sizes and maps every segment
+  ////////////////////////////////////////////////////////////////////
+  if ( WorldShmRank == 0 ) {
+    for(int r=0;r<WorldShmSize;r++){
+
+      set_shm_name(r);
+
+      shm_unlink(shm_name); // drop any segment of the same name left behind earlier
+      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
+      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
+      // A segment that was not resized would fault on first touch, so check the result
+      if ( ftruncate(fd, size) != 0 ) {	perror("failed ftruncate");	assert(0);      }
+
+      int mmap_flag = MAP_SHARED;
+#ifdef MAP_POPULATE 
+      mmap_flag |= MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+      if (flags) mmap_flag |= MAP_HUGETLB;
+#endif
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
+
+      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      if ( ptr == (void * )MAP_FAILED ) {
+	perror("failed mmap");
+	assert(0);
+      }
+      assert(((uint64_t)ptr&0x3F)==0); // the buffer must start on a 64 byte boundary
+
+      WorldShmCommBufs[r] =ptr;
+      close(fd); // the mapping stays valid after the descriptor is closed
+    }
+  }
+
+  // Segments must exist before the other ranks try to open them
+  MPI_Barrier(WorldShmComm);
+
+  ////////////////////////////////////////////////////////////////////
+  // Every other rank opens and maps the segments created above
+  ////////////////////////////////////////////////////////////////////
+  if ( WorldShmRank != 0 ) { 
+    for(int r=0;r<WorldShmSize;r++){
+
+      set_shm_name(r);
+
+      int fd=shm_open(shm_name,O_RDWR,0666);
+      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
+
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+      assert(((uint64_t)ptr&0x3F)==0);
+      WorldShmCommBufs[r] =ptr;
+
+      close(fd);
+    }
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes = bytes;
+}
+#endif
+
+
+
+
+  ////////////////////////////////////////////////////////
+  // Global shared functionality finished
+  // Now move to per communicator functionality
+  ////////////////////////////////////////////////////////
+void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
+{
+  // Binds this object to 'comm': derives the shared memory sub-communicator,
+  // picks the matching globally allocated buffer for each of its ranks, and
+  // records for every rank of 'comm' its rank inside that sub-communicator.
+  int rank, size;
+  MPI_Comm_rank(comm,&rank);
+  MPI_Comm_size(comm,&size);
+  ShmRanks.resize(size);
+
+  /////////////////////////////////////////////////////////////////////
+  // Split into groups that can share memory
+  /////////////////////////////////////////////////////////////////////
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
+  MPI_Comm_rank(ShmComm     ,&ShmRank);
+  MPI_Comm_size(ShmComm     ,&ShmSize);
+  ShmCommBufs.resize(ShmSize);
+
+  //////////////////////////////////////////////////////////////////////
+  // Map ShmRank to WorldShmRank and use the right buffer
+  //////////////////////////////////////////////////////////////////////
+  assert (GlobalSharedMemory::ShmAlloc()==1);
+  heap_size = GlobalSharedMemory::ShmAllocBytes();
+  for(int r=0;r<ShmSize;r++){
+
+    // Only rank r contributes a non-zero term, so after the sum every rank
+    // holds the WorldShmRank of ShmComm rank r.
+    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
+
+    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
+
+    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
+  }
+  ShmBufferFreeAll();
+
+  /////////////////////////////////////////////////////////////////////
+  // find comm ranks in our SHM group (i.e. which ranks are on our node)
+  /////////////////////////////////////////////////////////////////////
+  MPI_Group FullGroup, ShmGroup;
+  MPI_Comm_group (comm   , &FullGroup); 
+  MPI_Comm_group (ShmComm, &ShmGroup);
+
+  // Ranks of 'comm' with no counterpart in ShmGroup come back as MPI_UNDEFINED
+  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
+  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
+
+  // MPI_Comm_group hands out new group handles; release them now that the
+  // translation is done so that repeated calls do not accumulate groups.
+  MPI_Group_free(&FullGroup);
+  MPI_Group_free(&ShmGroup);
+}
+//////////////////////////////////////////////////////////////////
+// On node barrier
+//////////////////////////////////////////////////////////////////
+void SharedMemory::ShmBarrier(void)
+{
+  MPI_Barrier  (ShmComm); // barrier over ShmComm, the communicator SetCommunicator obtains from MPI_Comm_split_type
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Test the shared memory is working
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+void SharedMemory::SharedMemoryTest(void)
+{
+  // Marker stored in the third word of every buffer
+  const uint64_t marker = 0x5A5A5A;
+
+  ShmBarrier();
+
+  // Write phase: rank 0 of the group stamps node, buffer index and marker
+  // into the first three 64 bit words of each buffer.
+  if ( ShmRank == 0 ) {
+    for(int peer=0;peer<ShmSize;peer++){
+      uint64_t * words = (uint64_t *) ShmCommBufs[peer];
+      words[0] = GlobalSharedMemory::WorldNode;
+      words[1] = peer;
+      words[2] = marker;
+    }
+  }
+
+  ShmBarrier();
+
+  // Read phase: every rank verifies the stamp in every buffer
+  for(int peer=0;peer<ShmSize;peer++){
+    const uint64_t * words = (const uint64_t *) ShmCommBufs[peer];
+    assert(words[0]==GlobalSharedMemory::WorldNode);
+    assert(words[1]==peer);
+    assert(words[2]==marker);
+  }
+
+  ShmBarrier();
+}
+
+void *SharedMemory::ShmBuffer(int rank)
+{
+  // ShmRanks[rank] is the rank's index inside the shared memory group, or
+  // MPI_UNDEFINED when that rank is not a member of the group.
+  const int shm_peer = ShmRanks[rank];
+  if (shm_peer != MPI_UNDEFINED) return ShmCommBufs[shm_peer];
+  return NULL;
+}
+void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
+{
+  // Converts local_p, an address inside this rank's buffer, into the address
+  // at the same byte offset inside the buffer of 'rank'.
+  // Returns NULL when 'rank' is not part of the shared memory group.
+  int gpeer = ShmRanks[rank];
+  assert(gpeer!=ShmRank); // never send to self
+  if (gpeer == MPI_UNDEFINED){
+    return NULL;
+  } else { 
+    // Offset of local_p from the start of our own buffer ...
+    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
+    // ... applied to the start of the peer's buffer
+    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    return (void *) remote;
+  }
+}
+SharedMemory::~SharedMemory()
+{
+  int MPI_is_finalised;  MPI_Finalized(&MPI_is_finalised); // flag is set by MPI_Finalized
+  if ( !MPI_is_finalised ) { // only touch MPI while it has not been finalised
+    MPI_Comm_free(&ShmComm); // release the communicator obtained from MPI_Comm_split_type in SetCommunicator
+  }
+};
+
+}
@@ -0,0 +1,128 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/SharedMemory.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/GridCore.h>
+
+namespace Grid { 
+
+/* Set up the global state for a single rank; the communicator argument is not consulted */
+void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
+{
+  assert(_ShmSetup==0);
+
+  // World view: one rank, numbered zero
+  WorldSize = 1;
+  WorldRank = 0;
+  WorldComm = 0;
+
+  // Shared memory group view: the same single rank
+  WorldShmSize = 1 ;
+  WorldShmRank = 0 ;
+  WorldShmComm = 0 ;
+
+  // Node view: one node, numbered zero
+  WorldNodes   = 1 ;
+  WorldNode    = 0 ;
+
+  // One entry in the rank table and one slot for the buffer pointer
+  WorldShmRanks.resize(WorldSize);
+  WorldShmRanks[0] = 0;
+  WorldShmCommBufs.resize(1);
+
+  _ShmSetup=1;
+}
+
+void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
+{
+  optimal_comm = WorldComm; // 'processors' is not used: the world communicator is handed back as is
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hugetlbfs mapping intended, use anonymous mmap
+////////////////////////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+
+  // Assemble the mmap flags from whichever spellings this platform defines
+  int map_opts = 0;
+#ifdef MAP_ANONYMOUS
+  map_opts |= MAP_SHARED | MAP_ANONYMOUS;
+#endif
+#ifdef MAP_ANON
+  map_opts |= MAP_SHARED | MAP_ANON;
+#endif
+#ifdef MAP_HUGETLB
+  if ( flags ) map_opts |= MAP_HUGETLB;
+#endif
+
+  // No backing file: descriptor -1, offset 0
+  void * const region = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, map_opts, -1, 0);
+  if (region == (void *)MAP_FAILED) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);
+  }
+#ifdef MADV_HUGEPAGE
+  if (!Hugepages ) madvise(region,bytes,MADV_HUGEPAGE);
+#endif
+  bzero(region,bytes);
+
+  // Publish the buffer and record that the allocation has happened
+  WorldShmCommBufs[0] = region;
+  _ShmAllocBytes=bytes;
+  _ShmAlloc=1;
+}
+
+  ////////////////////////////////////////////////////////
+  // Global shared functionality finished
+  // Now move to per communicator functionality
+  ////////////////////////////////////////////////////////
+void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
+{
+  assert(GlobalSharedMemory::ShmAlloc()==1);
+
+  // A group of exactly one rank, numbered zero
+  ShmSize = 1;
+  ShmRank = 0;
+  ShmRanks.resize(1);
+  ShmRanks[0] = 0;
+
+  // Adopt the single globally allocated buffer together with its size
+  ShmCommBufs.resize(1);
+  ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
+  heap_size      = GlobalSharedMemory::ShmAllocBytes();
+
+  ShmBufferFreeAll();
+}
+//////////////////////////////////////////////////////////////////
+// On node barrier
+//////////////////////////////////////////////////////////////////
+void SharedMemory::ShmBarrier(void){ return ; } // no-op in this implementation
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Test the shared memory is working
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+void SharedMemory::SharedMemoryTest(void) { return; } // no-op in this implementation: nothing is written or checked
+
+void *SharedMemory::ShmBuffer(int rank)
+{
+  return NULL; // 'rank' is ignored: this implementation never hands out a peer buffer
+}
+void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
+{
+  return NULL; // both arguments are ignored: no address translation is performed here
+}
+SharedMemory::~SharedMemory()
+{}; // empty body: this implementation's destructor performs no clean up
+
+}
@@ -0,0 +1,52 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Cshift.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_CSHIFT_H_
+#define _GRID_CSHIFT_H_
+
+#include <Grid/cshift/Cshift_common.h>
+
+#ifdef GRID_COMMS_NONE
+#include <Grid/cshift/Cshift_none.h>
+#endif
+
+#ifdef GRID_COMMS_MPI
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
+#ifdef GRID_COMMS_MPI3
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
+#ifdef GRID_COMMS_MPIT
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
+#ifdef GRID_COMMS_SHMEM
+#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif 
+#endif
@@ -1,23 +1,40 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cshift/Cshift_common.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_COMMON_H_
 #define _GRID_CSHIFT_COMMON_H_

 namespace Grid {

-template<class vobj>
-class SimpleCompressor {
-public:
-  void Point(int) {};
-
-  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
-    return arg;
-  }
-};
-
 ///////////////////////////////////////////////////////////////////
-// Gather for when there is no need to SIMD split with compression
+// Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
-template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
+template<class vobj> void 
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -25,40 +42,44 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
    cbmask = 0x3;
  }
  
-  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  
+  int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+  int ent = 0;

+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+
+  int stride=rhs._grid->_slice_stride[dimension];
  if ( cbmask == 0x3 ) { 
-PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-	int o  = n*rhs._grid->_slice_stride[dimension];
-	int bo = n*rhs._grid->_slice_block[dimension];
-	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	int o  = n*stride;
+	int bo = n*e2;
+	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
      }
    }
  } else { 
     int bo=0;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
-	 int o  = n*rhs._grid->_slice_stride[dimension];
-	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	 int o  = n*stride;
+	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	 if ( ocb &cbmask ) {
-	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 	 }
       }
     }
  }
+  parallel_for(int i=0;i<ent;i++){
+    buffer[table[i].first]=rhs._odata[table[i].second];
+  }
 }

-
 ///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
+// Gather for when there *is* need to SIMD split 
 ///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor> void 
-Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress)
+template<class vobj> void 
+Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -70,60 +91,45 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  
+  int n1=rhs._grid->_slice_stride[dimension];
+
  if ( cbmask ==0x3){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

-	int o=n*rhs._grid->_slice_stride[dimension];
-	int offset = b+n*rhs._grid->_slice_block[dimension];
-
-	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
-	extract<cobj>(temp,pointers,offset);
+	int o      =   n*n1;
+	int offset = b+n*e2;
+	
+	vobj temp =rhs._odata[so+o+b];
+	extract<vobj>(temp,pointers,offset);

      }
    }
  } else { 

-    assert(0); //Fixme think this is buggy
-    for(int n=0;n<e1;n++){
+    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    // Test_cshift_red_black code.
+    std::cout << " Dense packed buffer WARNING " <<std::endl;
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-	int o=n*rhs._grid->_slice_stride[dimension];
+
+	int o=n*n1;
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-	int offset = b+n*rhs._grid->_slice_block[dimension];
+	int offset = b+n*e2;

 	if ( ocb & cbmask ) {
-	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
-	  extract<cobj>(temp,pointers,offset);
+	  vobj temp =rhs._odata[so+o+b];
+	  extract<vobj>(temp,pointers,offset);
 	}
      }
    }
  }
 }

-//////////////////////////////////////////////////////
-// Gather for when there is no need to SIMD split
-//////////////////////////////////////////////////////
-template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
-{
-  SimpleCompressor<vobj> dontcompress;
-  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
-}
-
-//////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split
-//////////////////////////////////////////////////////
-template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
-{
-  SimpleCompressor<vobj> dontcompress;
-  Gather_plane_extract<vobj,vobj,decltype(dontcompress)>(rhs,pointers,dimension,plane,cbmask,dontcompress);
-}
-
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -135,35 +141,43 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  
+  int stride=rhs._grid->_slice_stride[dimension];
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent    =0;
+
  if ( cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
+
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
-	rhs._odata[so+o+b]=buffer[bo+b];
+	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
      }
    }
+
  } else { 
    int bo=0;
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
-	int bo  =n*rhs._grid->_slice_block[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  rhs._odata[so+o+b]=buffer[bo++];
+	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
      }
    }
  }
+
+  parallel_for(int i=0;i<ent;i++){
+    rhs._odata[table[i].first]=buffer[table[i].second];
+  }
 }

 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
- template<class vobj,class cobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<cobj *> pointers,int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -177,8 +191,7 @@ PARALLEL_NESTED_LOOP2
  int e2=rhs._grid->_slice_block[dimension];

  if(cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
 	int offset = b+n*rhs._grid->_slice_block[dimension];
@@ -186,7 +199,11 @@ PARALLEL_NESTED_LOOP2
      }
    }
  } else { 
-    assert(0); // think this is buggy FIXME
+
+    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    // Test_cshift_red_black code.
+    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
@@ -216,32 +233,33 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
+  int stride = rhs._grid->_slice_stride[dimension];
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent=0;

  if(cbmask == 0x3 ){
-PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
- 
-        int o =n*rhs._grid->_slice_stride[dimension]+b;
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+        int o =n*stride+b;
+	table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
  } else { 
-PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
- 
-        int o =n*rhs._grid->_slice_stride[dimension]+b;
+        int o =n*stride+b;
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	}
      }
    }
  }
-  
+
+  parallel_for(int i=0;i<ent;i++){
+    lhs._odata[table[i].first]=rhs._odata[table[i].second];
+  }
+
 }

 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@@ -258,17 +276,30 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block [dimension];
-PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-  for(int b=0;b<e2;b++){
+  int stride = rhs._grid->_slice_stride[dimension];

-      int o  =n*rhs._grid->_slice_stride[dimension];
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  int ent=0;
+
+  double t_tab,t_perm;
+  if ( cbmask == 0x3 ) {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      int o  =n*stride;
+      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  } else {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      int o  =n*stride;
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
-	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
-      }
+      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  }

-  }}
+  parallel_for(int i=0;i<ent;i++){
+    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
+  }
 }

 //////////////////////////////////////////////////////
@@ -281,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  double t_local;
+  
  if ( sshift[0] == sshift[1] ) {
    Cshift_local(ret,rhs,dimension,shift,0x3);
  } else {
@@ -289,46 +322,70 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  }
 }

-template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) // fills ret one plane at a time from rhs; nothing is returned
 {
   GridBase *grid = rhs._grid;
   int fd = grid->_fdimensions[dimension];
   int rd = grid->_rdimensions[dimension];
   int ld = grid->_ldimensions[dimension];
   int gd = grid->_gdimensions[dimension];
+  int ly = grid->_simd_layout[dimension]; // _simd_layout entry for this dimension; the modulus for wrap and permute_slice below

   // Map to always positive shift modulo global full dimension.
   shift = (shift+fd)%fd;

-  ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
   // the permute type
+  ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); // result checkerboard as reported by the grid for this shift
   int permute_dim =grid->PermuteDim(dimension);
   int permute_type=grid->PermuteType(dimension);
+  int permute_type_dist; // permute type handed to Copy_plane_permute; assigned per plane, only when permute_dim is non-zero

   for(int x=0;x<rd;x++){       

     int o   = 0;
     int bo  = x * grid->_ostride[dimension];
-    
     int cb= (cbmask==0x2)? Odd : Even;

     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
     int sx     = (x+sshift)%rd;
-
+    
+    // wrap is whether sshift > rd.
+    //  num is sshift mod rd.
+    // 
+    //  shift 7
+    //
+    //  XoXo YcYc 
+    //  oXoX cYcY
+    //  XoXo YcYc
+    //  oXoX cYcY
+    //
+    //  sshift -- 
+    //
+    //  XX YY ; 3
+    //  XX YY ; 0
+    //  XX YY ; 3
+    //  XX YY ; 0
+    //
     int permute_slice=0;
     if(permute_dim){
-      int wrap = sshift/rd;
+      int wrap = sshift/rd; wrap=wrap % ly; // number of whole wraps of rd, reduced modulo ly
       int  num = sshift%rd;
+
       if ( x< rd-num ) permute_slice=wrap;
-      else permute_slice = 1-wrap;
+      else permute_slice = (wrap+1)%ly; // planes at or beyond rd-num use the next slice, again modulo ly
+
+      if ( (ly>2) && (permute_slice) ) { // ly above 2 with a non-zero slice: the slice is OR-ed into the permute type
+	assert(permute_type & RotateBit); // that combination is only accepted when RotateBit is set in permute_type
+	permute_type_dist = permute_type|permute_slice;
+      } else {
+	permute_type_dist = permute_type; // otherwise the grid's permute type is passed on unchanged
+      }
     }

-    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
+    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); // non-zero slice: permuting copy of source plane sx into plane x
     else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
   
   }
-  return ret;
 }
 }
 #endif
@@ -1,3 +1,31 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cshift/Cshift_mpi.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_

@@ -26,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension


  if ( !comm_dim ) {
-    //    std::cout << "Cshift_local" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
-    //    std::cout << "Cshift_comms_simd" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
-    //    std::cout << "Cshift_comms" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  return ret;
@@ -46,7 +74,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
-
  if ( sshift[0] == sshift[1] ) {
    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x3);
@@ -64,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
  if ( sshift[0] == sshift[1] ) {
+    //std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
  } else {
+    //std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@@ -91,8 +121,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
-  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
+  commVector<vobj> send_buf(buffer_size);
+  commVector<vobj> recv_buf(buffer_size);

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@@ -126,10 +156,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+      grid->Barrier();

-      //      for(int i=0;i<words;i++){
-      //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl;
-      //      }
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
    }
  }
@@ -150,6 +178,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;

+  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+
  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
@@ -163,11 +195,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
-  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+
  int bytes = buffer_size*sizeof(scalar_object);

-  std::vector<scalar_object *>  pointers(Nsimd);  // 
+  std::vector<scalar_object *>  pointers(Nsimd); // 
  std::vector<scalar_object *> rpointers(Nsimd); // received pointers

  ///////////////////////////////////////////
@@ -214,7 +247,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     (void *)&recv_buf_extract[i][0],
 			     recv_from_rank,
 			     bytes);
-
+	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
@@ -0,0 +1,39 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cshift/Cshift_none.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_CSHIFT_NONE_H_
+#define _GRID_CSHIFT_NONE_H_
+namespace Grid {
+// Circular shift of rhs by `shift` sites along `dimension`, returned as a new
+// lattice on the same grid. The destination checkerboard is obtained from the
+// grid and all data movement is delegated to Cshift_local.
+template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  auto *grid = rhs._grid;
+
+  Lattice<vobj> shifted(grid);
+  shifted.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
+
+  Cshift_local(shifted,rhs,dimension,shift);
+  return shifted;
+}
+}
+#endif
@@ -0,0 +1,33 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Lattice.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_H
+#define GRID_LATTICE_H
+
+#include <Grid/lattice/Lattice_base.h>
+
+#endif
@@ -0,0 +1,466 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_ET.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_ET_H
+#define GRID_LATTICE_ET_H
+
+#include <iostream>
+#include <tuple>
+#include <typeinfo>
+#include <vector>
+
+namespace Grid {
+
+////////////////////////////////////////////////////
+// Predicated where support
+////////////////////////////////////////////////////
+// Lane-wise select between two vector objects.
+//   predicate : mask object; after TensorRemove it is extracted as one Integer
+//               per SIMD lane
+//   iftrue    : supplies the lanes whose mask entry is non-zero
+//   iffalse   : supplies every other lane
+// Returns the merged result as a (non-const) vobj.
+template <class iobj, class vobj, class robj>
+inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                            const robj &iffalse) {
+  typename std::remove_const<vobj>::type ret;
+
+  typedef typename vobj::scalar_object scalar_object;
+
+  const int Nsimd = vobj::vector_type::Nsimd();
+
+  std::vector<Integer> mask(Nsimd);
+  std::vector<scalar_object> truevals(Nsimd);
+  std::vector<scalar_object> falsevals(Nsimd);
+
+  // Unpack both operands and the mask into per-lane scalars.
+  extract(iftrue, truevals);
+  extract(iffalse, falsevals);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);
+
+  // falsevals doubles as the output buffer: overwrite the selected lanes.
+  for (int s = 0; s < Nsimd; s++) {
+    if (mask[s]) falsevals[s] = truevals[s];
+  }
+
+  merge(ret, falsevals);
+  return ret;
+}
+
+////////////////////////////////////////////
+// recursive evaluation of expressions; Could
+// switch to generic approach with variadics, a la
+// Antonin's Lat Sim but the repack to variadic with popped
+// from tuple is hideous; C++14 introduces std::make_index_sequence for this
+////////////////////////////////////////////
+
+// leaf eval of lattice ; should enable if protect using traits
+
+// Trait: true when T derives from LatticeBase.
+template <typename T>
+using is_lattice = std::is_base_of<LatticeBase, T>;
+
+// Trait: true when T derives from LatticeExpressionBase.
+// (Declared once; an identical second declaration was redundant.)
+template <typename T>
+using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
+
+// getVectorType specialised for Lattice<T>: `type` names the lattice's
+// vector_object.
+template<typename T>
+struct getVectorType<Lattice<T> >{
+  using type = typename Lattice<T>::vector_object;
+};
+ 
+// Leaf evaluation of a non-lattice operand: the argument is returned
+// unchanged (by value); the site index ss is not used.
+template<class sobj>
+inline sobj eval(const unsigned int ss, const sobj &arg)
+{
+  return arg;
+}
+// Leaf evaluation of a lattice operand: returns a const reference to the
+// element of arg._odata at index ss.
+template <class lobj>
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
+  return arg._odata[ss];
+}
+
+// handle nodes in syntax tree
+// Each overload below evaluates its operand sub-expressions at index ss
+// (recursively, through eval) and passes the results to the node's operation,
+// expr.first.func. expr.second is the tuple holding the operands. The
+// trailing return type repeats the body expression so the result type is
+// deduced from it.
+template <typename Op, typename T1>
+auto inline eval(
+    const unsigned int ss,
+    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)));
+}
+
+template <typename Op, typename T1, typename T2>
+auto inline eval(
+    const unsigned int ss,
+    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)));
+}
+
+template <typename Op, typename T1, typename T2, typename T3>
+auto inline eval(const unsigned int ss,
+                 const LatticeTrinaryExpression<Op, T1, T2, T3>
+                     &expr)  // eval three operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)),
+                                eval(ss, std::get<2>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)),
+                         eval(ss, std::get<2>(expr.second)));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the grid from an expression, ensuring conformable. This must follow a
+// tree recursion
+//////////////////////////////////////////////////////////////////////////
+// Lattice leaf: if a grid has already been recorded, check it is conformable
+// with this lattice's grid; then record this lattice's grid in `grid`.
+// Selected only when T1 satisfies is_lattice.
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
+{
+  if (grid) {
+    conformable(grid, lat._grid);
+  }
+  grid = lat._grid;
+}
+// Non-lattice leaf: contributes no grid, so `grid` is left untouched.
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid,
+                               const T1 &notlat)  // non-lattice leaf
+{}
+// Unary node: visit the single operand.
+template <typename Op, typename T1>
+inline void GridFromExpression(GridBase *&grid,
+                               const LatticeUnaryExpression<Op, T1> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+}
+
+// Binary node: visit both operands in order.
+template <typename Op, typename T1, typename T2>
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid, std::get<1>(expr.second));
+}
+// Trinary node: visit all three operands in order.
+template <typename Op, typename T1, typename T2, typename T3>
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid, std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<2>(expr.second));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the CB from an expression, ensuring conformable. This must follow a
+// tree recursion
+//////////////////////////////////////////////////////////////////////////
+// Lattice leaf: if cb already holds Odd or Even, assert that this lattice has
+// the same checkerboard; then record this lattice's checkerboard in cb.
+// Selected only when T1 satisfies is_lattice.
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
+{
+  if ((cb == Odd) || (cb == Even)) {
+    assert(cb == lat.checkerboard);
+  }
+  cb = lat.checkerboard;
+  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
+}
+// Non-lattice leaf: carries no checkerboard, so cb is left untouched.
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
+{
+  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
+}
+// Unary node: visit the single operand.
+template <typename Op, typename T1>
+inline void CBFromExpression(int &cb,
+                             const LatticeUnaryExpression<Op, T1> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
+}
+
+// Binary node: visit both operands in order.
+template <typename Op, typename T1, typename T2>
+inline void CBFromExpression(int &cb,
+                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb, std::get<1>(expr.second));
+  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
+}
+// Trinary node: visit all three operands in order.
+template <typename Op, typename T1, typename T2, typename T3>
+inline void CBFromExpression(
+    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb, std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<2>(expr.second));
+  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
+}
+
+////////////////////////////////////////////
+// Unary operators and funcs
+////////////////////////////////////////////
+// GridUnopClass(name, ret) defines a class template `name<arg>` with a static
+// member func(a) that returns the expression `ret`, written in terms of the
+// parameter `a`. The return type is deduced from that expression.
+#define GridUnopClass(name, ret)                                          \
+  template <class arg>                                                    \
+  struct name {                                                           \
+    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
+  };
+
+// One functor class per supported unary operation.
+GridUnopClass(UnarySub, -a);
+GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryConj, conjugate(a));
+GridUnopClass(UnaryTrace, trace(a));
+GridUnopClass(UnaryTranspose, transpose(a));
+GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
+GridUnopClass(UnaryTimesI, timesI(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
+GridUnopClass(UnaryAbs, abs(a));
+GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
+GridUnopClass(UnarySin, sin(a));
+GridUnopClass(UnaryCos, cos(a));
+GridUnopClass(UnaryAsin, asin(a));
+GridUnopClass(UnaryAcos, acos(a));
+GridUnopClass(UnaryLog, log(a));
+GridUnopClass(UnaryExp, exp(a));
+
+////////////////////////////////////////////
+// Binary operators
+////////////////////////////////////////////
+// GridBinOpClass(name, combination) defines a class template
+// `name<left, right>` with a static member func(lhs, rhs) that returns the
+// expression `combination`, written in terms of `lhs` and `rhs`.
+#define GridBinOpClass(name, combination)                      \
+  template <class left, class right>                           \
+  struct name {                                                \
+    static auto inline func(const left &lhs, const right &rhs) \
+        -> decltype(combination) const {                       \
+      return combination;                                      \
+    }                                                          \
+  }
+// Arithmetic functors.
+GridBinOpClass(BinaryAdd, lhs + rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);
+GridBinOpClass(BinaryDiv, lhs /rhs);
+
+// Bitwise and logical functors.
+GridBinOpClass(BinaryAnd, lhs &rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);
+
+////////////////////////////////////////////////////
+// Trinary conditional op
+////////////////////////////////////////////////////
+// GridTrinOpClass(name, combination) defines a class template
+// `name<predicate, left, right>` with a static member func(pred, lhs, rhs)
+// that returns the expression `combination`.
+#define GridTrinOpClass(name, combination)                                     \
+  template <class predicate, class left, class right>                          \
+  struct name {                                                                \
+    static auto inline func(const predicate &pred, const left &lhs,            \
+                            const right &rhs) -> decltype(combination) const { \
+      return combination;                                                      \
+    }                                                                          \
+  }
+
+// TrinaryWhere forwards to predicatedWhere, with any reference stripped from
+// the left/right operand types before they are used as template arguments.
+GridTrinOpClass(
+    TrinaryWhere,
+    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
+                     typename std::remove_reference<right>::type>(pred, lhs,
+                                                                  rhs)));
+
+////////////////////////////////////////////
+// Operator syntactical glue
+////////////////////////////////////////////
+
+// Helpers that instantiate a functor class on the types produced by
+// evaluating the operands (named arg / lhs / rhs / pred at the point of use)
+// at index 0.
+#define GRID_UNOP(name) name<decltype(eval(0, arg))>
+#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) \
+  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+
+// GRID_DEF_UNOP(op, name) defines a function `op(arg)`, enabled only when the
+// argument is a lattice or a lattice expression. It returns a
+// LatticeUnaryExpression that pairs the functor `name` with a tuple holding a
+// reference to the argument (std::forward_as_tuple); nothing is evaluated
+// here.
+#define GRID_DEF_UNOP(op, name)                                             \
+  template <typename T1,                                                    \
+            typename std::enable_if<is_lattice<T1>::value ||                \
+                                        is_lattice_expr<T1>::value,         \
+                                    T1>::type * = nullptr>                  \
+  inline auto op(const T1 &arg)                                             \
+      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
+          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
+    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
+        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
+  }
+
+// GRID_BINOP_LEFT(op, name) defines `op(lhs, rhs)` for the case where the
+// LEFT operand is a lattice or lattice expression (the right operand is
+// unconstrained). It returns a LatticeBinaryExpression holding references to
+// both operands.
+#define GRID_BINOP_LEFT(op, name)                                             \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<is_lattice<T1>::value ||                  \
+                                        is_lattice_expr<T1>::value,           \
+                                    T1>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }
+
+// GRID_BINOP_RIGHT(op, name) defines `op(lhs, rhs)` for the case where the
+// left operand is NEITHER a lattice nor a lattice expression and the right
+// operand is one. The two enable_if conditions keep this overload disjoint
+// from the left-operand overload.
+#define GRID_BINOP_RIGHT(op, name)                                            \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<!is_lattice<T1>::value &&                 \
+                                        !is_lattice_expr<T1>::value,          \
+                                    T1>::type * = nullptr,                    \
+            typename std::enable_if<is_lattice<T2>::value ||                  \
+                                        is_lattice_expr<T2>::value,           \
+                                    T2>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }
+
+// GRID_DEF_BINOP(op, name) emits both the left-operand and the right-operand
+// overloads of `op` for the functor `name`.
+#define GRID_DEF_BINOP(op, name) \
+  GRID_BINOP_LEFT(op, name);     \
+  GRID_BINOP_RIGHT(op, name);
+
+// GRID_DEF_TRINOP(op, name) defines `op(pred, lhs, rhs)` returning a
+// LatticeTrinaryExpression that holds references to the three operands.
+// No enable_if constraint is placed on the operand types.
+#define GRID_DEF_TRINOP(op, name)                                              \
+  template <typename T1, typename T2, typename T3>                             \
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
+      ->decltype(                                                              \
+          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
+                                   const T3 &>(std::make_pair(                 \
+              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
+    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
+                                    const T3 &>(std::make_pair(                \
+        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
+  }
+////////////////////////
+// Operator definitions
+////////////////////////
+
+// Unary operators and functions; each builds a LatticeUnaryExpression node.
+// Note that both Not and operator! map to UnaryNot.
+GRID_DEF_UNOP(operator-, UnarySub);
+GRID_DEF_UNOP(Not, UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
+GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(trace, UnaryTrace);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
+GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
+GRID_DEF_UNOP(timesI, UnaryTimesI);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
+                               // abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
+GRID_DEF_UNOP(sin, UnarySin);
+GRID_DEF_UNOP(cos, UnaryCos);
+GRID_DEF_UNOP(asin, UnaryAsin);
+GRID_DEF_UNOP(acos, UnaryAcos);
+GRID_DEF_UNOP(log, UnaryLog);
+GRID_DEF_UNOP(exp, UnaryExp);
+
+// Arithmetic binary operators (left- and right-operand overloads each).
+GRID_DEF_BINOP(operator+, BinaryAdd);
+GRID_DEF_BINOP(operator-, BinarySub);
+GRID_DEF_BINOP(operator*, BinaryMul);
+GRID_DEF_BINOP(operator/, BinaryDiv);
+
+// Bitwise and logical binary operators.
+GRID_DEF_BINOP(operator&, BinaryAnd);
+GRID_DEF_BINOP(operator|, BinaryOr);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
+GRID_DEF_BINOP(operator||, BinaryOrOr);
+
+// where(pred, lhs, rhs): trinary select node.
+GRID_DEF_TRINOP(where, TrinaryWhere);
+
+/////////////////////////////////////////////////////////////
+// Closure convenience to force expression to evaluate
+/////////////////////////////////////////////////////////////
+// closure(expr): force an expression node to be evaluated by constructing a
+// Lattice from it. The element type of the returned lattice is whatever the
+// node's operation yields for its evaluated operands.
+template <class Op, class T1>
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
+  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second))))
+      site_object;
+  Lattice<site_object> evaluated(expr);
+  return evaluated;
+}
+// Binary-node overload of closure.
+template <class Op, class T1, class T2>
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second))))> {
+  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second))))
+      site_object;
+  Lattice<site_object> evaluated(expr);
+  return evaluated;
+}
+// Trinary-node overload of closure.
+template <class Op, class T1, class T2, class T3>
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second)),
+                                        eval(0, std::get<2>(expr.second))))> {
+  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second)),
+                                   eval(0, std::get<2>(expr.second))))
+      site_object;
+  Lattice<site_object> evaluated(expr);
+  return evaluated;
+}
+
+// Remove the helper macros that exist only to generate the operators above,
+// so they do not leak into files that include this header. This includes
+// GRID_BINOP_LEFT / GRID_BINOP_RIGHT, the building blocks of GRID_DEF_BINOP.
+#undef GRID_UNOP
+#undef GRID_BINOP
+#undef GRID_TRINOP
+
+#undef GRID_DEF_UNOP
+#undef GRID_BINOP_LEFT
+#undef GRID_BINOP_RIGHT
+#undef GRID_DEF_BINOP
+#undef GRID_DEF_TRINOP
+}
+
+// Disabled scratch driver: everything between #if 0 and #endif is removed by
+// the preprocessor and never compiled. It sketches how the functor classes,
+// expression nodes and overloaded operators above fit together.
+#if 0
+using namespace Grid;
+        
+ int main(int argc,char **argv){
+   
+   Lattice<double> v1(16);
+   Lattice<double> v2(16);
+   Lattice<double> v3(16);
+
+   BinaryAdd<double,double> tmp;
+   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
+     expr(std::make_pair(tmp,
+    std::forward_as_tuple(v1,v2)));
+   tmp.func(eval(0,v1),eval(0,v2));
+
+   auto var = v1+v2;
+   std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
+
+   v3=v1+v2;
+   v3=v1+v2+v1*v2;
+ };
+
+void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
+{
+   v3=v1+v2+v1*v2;
+}
+#endif
+
+#endif
@@ -1,3 +1,30 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_arith.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_LATTICE_ARITH_H
 #define GRID_LATTICE_ARITH_H

@@ -12,8 +39,7 @@ namespace Grid {
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -29,8 +55,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -46,8 +71,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -62,8 +86,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -81,8 +104,7 @@ PARALLEL_FOR_LOOP
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      obj1 tmp;
      mult(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
@@ -93,8 +115,7 @@ PARALLEL_FOR_LOOP
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,lhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
@@ -105,8 +126,7 @@ PARALLEL_FOR_LOOP
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,lhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs);
@@ -120,8 +140,7 @@ PARALLEL_FOR_LOOP
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs);
@@ -139,8 +158,7 @@ PARALLEL_FOR_LOOP
    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs,&rhs._odata[ss]);
@@ -155,8 +173,7 @@ PARALLEL_FOR_LOOP
    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs,&rhs._odata[ss]);
@@ -171,8 +188,7 @@ PARALLEL_FOR_LOOP
    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs,&rhs._odata[ss]);
@@ -186,8 +202,7 @@ PARALLEL_FOR_LOOP
    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs,&rhs._odata[ss]);
@@ -203,8 +218,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = x.checkerboard;
    conformable(ret,x);
    conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+y._odata[ss];
      vstream(ret._odata[ss],tmp);
@@ -218,8 +232,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = x.checkerboard;
    conformable(ret,x);
    conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vstream(ret._odata[ss],tmp);
@@ -231,19 +244,11 @@ PARALLEL_FOR_LOOP

  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpy(ret,a,x,y);
-    return norm2(ret);
+    return axpy_norm_fast(ret,a,x,y);
  }
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpby(ret,a,b,x,y);
-    return norm2(ret); // FIXME implement parallel norm in ss loop
+    return axpby_norm_fast(ret,a,b,x,y);
  }

 }
@@ -1,3 +1,33 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_base.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H

@@ -26,12 +56,15 @@ extern int GridCshiftPermuteMap[4][16];
 // Basic expressions used in Expression Template
 ////////////////////////////////////////////////

-class LatticeBase {};
+class LatticeBase
+{
+public:
+    virtual ~LatticeBase(void) = default; // virtual so derived lattices are destroyed correctly through a LatticeBase pointer
+    GridBase *_grid = nullptr; // grid this lattice lives on; null until a derived constructor assigns it
+};
+    
 class LatticeExpressionBase {};

-template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
-template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
-
 template <typename Op, typename T1>                           
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 public:
@@ -59,8 +92,6 @@ template<class vobj>
 class Lattice : public LatticeBase
 {
 public:
-
-    GridBase *_grid;
    int checkerboard;
    Vector<vobj> _odata;
    
@@ -68,12 +99,12 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
+    const vobj & operator[](int i) const { return _odata[i]; };

 public:
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
    typedef vobj vector_object;
- 
   
  ////////////////////////////////////////////////////////////////////////////////
  // Expression Template closure support
@@ -90,8 +121,7 @@ public:
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -113,8 +143,7 @@ PARALLEL_FOR_LOOP
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -136,8 +165,7 @@ PARALLEL_FOR_LOOP
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      //vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,eval(ss,expr));
@@ -149,8 +177,8 @@ PARALLEL_FOR_LOOP
  }
  //GridFromExpression is tricky to do
  template<class Op,class T1>
-    Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
-
+    Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
+    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);

@@ -160,8 +188,7 @@ PARALLEL_FOR_LOOP
    checkerboard=cb;

    _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -171,7 +198,8 @@ PARALLEL_FOR_LOOP
    }
  };
  template<class Op,class T1, class T2>
-  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
+  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
+    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);

@@ -181,8 +209,7 @@ PARALLEL_FOR_LOOP
    checkerboard=cb;

    _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -192,7 +219,8 @@ PARALLEL_FOR_LOOP
    }
  };
  template<class Op,class T1, class T2, class T3>
-  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
+  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
+    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);

@@ -202,69 +230,103 @@ PARALLEL_FOR_LOOP
    checkerboard=cb;

    _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      vstream(_odata[ss] ,eval(ss,expr));
    }
  };

-    //////////////////////////////////////////////////////////////////
-    // Constructor requires "grid" passed.
-    // what about a default grid?
-    //////////////////////////////////////////////////////////////////
-    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
-      //        _odata.reserve(_grid->oSites());
-      //        _odata.resize(_grid->oSites());
+  //////////////////////////////////////////////////////////////////
+  // Constructor requires "grid" passed.
+  // what about a default grid?
+  //////////////////////////////////////////////////////////////////
+  Lattice(GridBase *grid) : _odata(grid->oSites()) { // allocate one vobj per outer site of grid
+    _grid = grid; // _grid is a LatticeBase member, so it is assigned here rather than in the initializer list
+    //        _odata.reserve(_grid->oSites());
+    //        _odata.resize(_grid->oSites());
     //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
+    assert((((uint64_t)&_odata[0])&0xF) ==0); // site storage must start on a 16-byte boundary
+    checkerboard=0; // new lattices start on checkerboard 0
+  }
+  
+  Lattice(const Lattice& r){ // copy constructor: copies the grid pointer, checkerboard and every site of r
+    _grid = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential: _odata starts empty here, so size it before the site-by-site copy
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+  }

-    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
-            this->_odata[ss]=r;
-        }
-        return *this;
-    }
-    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
-      this->checkerboard = r.checkerboard;
-      conformable(*this,r);
-      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
-            this->_odata[ss]=r._odata[ss];
-        }
-        return *this;
-    }
+  Lattice(Lattice&& r){ // move constructor: takes over r's site storage instead of copying it
+    _grid = r._grid; // r keeps its own _grid and checkerboard values; only _odata is moved from
+    checkerboard = r.checkerboard;
+    _odata=std::move(r._odata);
+  }
+  
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r) // move assignment: adopts r's grid, checkerboard and site storage
+  {
+    _grid        = r._grid; // no conformable() check: the destination simply becomes a lattice on r's grid
+    checkerboard = r.checkerboard;
+    _odata       =std::move(r._odata); // r's _odata is left in a moved-from state
+    return *this;
+  }

-    // *=,+=,-= operators inherit behvour from correspond */+/- operation
-    template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
-        *this = (*this)*r;
-        return *this;
-    }
-
-    template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
-        *this = (*this)-r;
-        return *this;
-    }
-    template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
-        *this = (*this)+r;
-        return *this;
-    }
+  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential
    
-    strong_inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-        conformable(lhs,rhs);
-        Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = lhs._odata[ss]*pow(rhs._odata[ss],-1.0);
-        }
-        return ret;
-    };
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+    return *this;
+  }

- }; // class Lattice
+  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ // site-by-site converting assignment from a lattice of another site type
+    this->checkerboard = r.checkerboard;
+    conformable(*this,r); // NOTE(review): checkerboard was just overwritten above, so only the grid comparison can fail here
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      this->_odata[ss]=r._odata[ss]; // relies on vobj being assignable from robj
+    }
+    return *this;
+  }

+  virtual ~Lattice(void) = default;
+    
+  void reset(GridBase* grid) { // re-target this lattice onto grid; does nothing if it is already on that grid
+    if (_grid != grid) {
+      _grid = grid;
+      _odata.resize(grid->oSites()); // resize only; sites are not explicitly re-initialised
+      checkerboard = 0;
+    }
+  }
+  
+
+  template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ // broadcast assignment: every outer site is set to r
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      this->_odata[ss]=r; // relies on vobj being assignable from sobj
+    }
+    return *this;
+  }
+  
+  
+  // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
+  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
+    *this = (*this)*r; // evaluate (*this)*r, then assign the result back to this lattice
+    return *this;
+  }
+  
+  template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
+    *this = (*this)-r; // evaluate (*this)-r, then assign the result back to this lattice
+    return *this;
+  }
+  template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
+    *this = (*this)+r; // evaluate (*this)+r, then assign the result back to this lattice
+    return *this;
+  }
+}; // class Lattice
+  
  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
    std::vector<int> gcoor;
    typedef typename vobj::scalar_object sobj;
@@ -282,32 +344,32 @@ PARALLEL_FOR_LOOP
    }
    return stream;
  }
-
+  
 }



-#include <lattice/Lattice_conformable.h>
+#include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include <lattice/Lattice_ET.h>
+#include "Lattice_ET.h"
 #else 
-#include <lattice/Lattice_overload.h>
+#include "Lattice_overload.h"
 #endif
-#include <lattice/Lattice_arith.h>
-#include <lattice/Lattice_trace.h>
-#include <lattice/Lattice_transpose.h>
-#include <lattice/Lattice_local.h>
-#include <lattice/Lattice_reduction.h>
-#include <lattice/Lattice_peekpoke.h>
-#include <lattice/Lattice_reality.h>
-#include <lattice/Lattice_comparison_utils.h>
-#include <lattice/Lattice_comparison.h>
-#include <lattice/Lattice_coordinate.h>
-#include <lattice/Lattice_where.h>
-#include <lattice/Lattice_rng.h>
-#include <lattice/Lattice_unary.h>
-#include <lattice/Lattice_transfer.h>
+#include "Lattice_arith.h"
+#include "Lattice_trace.h"
+#include "Lattice_transpose.h"
+#include "Lattice_local.h"
+#include "Lattice_reduction.h"
+#include "Lattice_peekpoke.h"
+#include "Lattice_reality.h"
+#include "Lattice_comparison_utils.h"
+#include "Lattice_comparison.h"
+#include "Lattice_coordinate.h"
+#include "Lattice_where.h"
+#include "Lattice_rng.h"
+#include "Lattice_unary.h"
+#include "Lattice_transfer.h"


 #endif
@@ -0,0 +1,169 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_comparison.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_COMPARISON_H
+#define GRID_LATTICE_COMPARISON_H
+
+namespace Grid {
+
+    //////////////////////////////////////////////////////////////////////////
+    // relational operators
+    // 
+    // Support <,>,<=,>=,==,!=
+    //
+    //Query supporting bitwise &, |, ^, !
+    //Query supporting logical &&, ||, 
+    //////////////////////////////////////////////////////////////////////////
+
+  //////////////////////////////////////////////////////////////////////////
+  // compare lattice to lattice: ret[ss] = op(lhs[ss],rhs[ss]) for every outer site ss
+  //////////////////////////////////////////////////////////////////////////
+  template<class vfunctor,class lobj,class robj>  
+    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
+  {
+    Lattice<vInteger> ret(rhs._grid); // result is allocated on rhs's grid
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ // NOTE(review): lhs is indexed over rhs's range with no conformable(lhs,rhs) check
+      ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
+  }
+  //////////////////////////////////////////////////////////////////////////
+  // compare lattice to scalar: ret[ss] = op(lhs[ss],rhs) for every outer site ss
+  //////////////////////////////////////////////////////////////////////////
+  template<class vfunctor,class lobj,class robj> 
+    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
+  {
+    Lattice<vInteger> ret(lhs._grid); // result is allocated on lhs's grid
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs._odata[ss],rhs); // the same scalar rhs is used at every site
+    }
+    return ret;
+  }
+  //////////////////////////////////////////////////////////////////////////
+  // compare scalar to lattice: ret[ss] = op(lhs,rhs[ss]) for every outer site ss
+  //////////////////////////////////////////////////////////////////////////
+  template<class vfunctor,class lobj,class robj> 
+    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
+  {
+    Lattice<vInteger> ret(rhs._grid); // result is allocated on rhs's grid
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs,rhs._odata[ss]); // lhs is the scalar operand; only the lattice rhs is indexed per site
+    }
+    return ret;
+  }
+  
+  //////////////////////////////////////////////////////////////////////////
+  // Map to functors
+  //////////////////////////////////////////////////////////////////////////
+  // Less than: lattice<lattice, lattice<scalar, scalar<lattice; each passes a vlt functor to the matching *Comparison helper
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
+    return SLComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  
+  // Less than or equal: the same three overloads as above, using the vle functor
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
+    return SLComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  
+  // Greater than: lattice>lattice, lattice>scalar, scalar>lattice; each passes a vgt functor to the matching helper
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
+     return SLComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  
+  
+  // Greater than or equal: lattice/lattice, lattice/scalar and scalar/lattice overloads using the vge functor
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     return LLComparison(vge<lobj,robj>(),lhs,rhs);
+   }
+   template<class lobj,class robj>
+   inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
+     return LSComparison(vge<lobj,robj>(),lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
+     return SLComparison(vge<lobj,robj>(),lhs,rhs);
+   }
+   
+   // Equal: per-site comparison returning Lattice<vInteger> (not a single bool), using the veq functor
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     return LLComparison(veq<lobj,robj>(),lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
+     return LSComparison(veq<lobj,robj>(),lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
+     return SLComparison(veq<lobj,robj>(),lhs,rhs);
+   }
+   
+   
+   // Not equal: per-site comparison returning Lattice<vInteger> (not a single bool), using the vne functor
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     return LLComparison(vne<lobj,robj>(),lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
+     return LSComparison(vne<lobj,robj>(),lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
+     return SLComparison(vne<lobj,robj>(),lhs,rhs);
+   }
+}
+#endif
@@ -1,3 +1,31 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_comparison_utils.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_COMPARISON_H
 #define GRID_COMPARISON_H

@@ -151,7 +179,7 @@ namespace Grid {
      return ret;
    }

-#define DECLARE_RELATIONAL(op,functor) \
+#define DECLARE_RELATIONAL_EQ(op,functor) \
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
    {\
@@ -170,11 +198,6 @@ namespace Grid {
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
-  template<class vsimd>\
-    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
-    {									\
-      return lhs._internal op rhs._internal;				\
-    }									\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
    {									\
@@ -184,14 +207,21 @@ namespace Grid {
    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
    {									\
      return lhs op rhs._internal;					\
-    }									
+    }									\

+#define DECLARE_RELATIONAL(op,functor) \
+  DECLARE_RELATIONAL_EQ(op,functor)    \
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+    {									\
+      return lhs._internal op rhs._internal;				\
+    }									

 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
-DECLARE_RELATIONAL(==,seq);
+DECLARE_RELATIONAL_EQ(==,seq);
 DECLARE_RELATIONAL(!=,sne);

 #undef DECLARE_RELATIONAL
@@ -0,0 +1,40 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_conformable.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_CONFORMABLE_H
+#define GRID_LATTICE_CONFORMABLE_H
+
+namespace Grid {
+
+    template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
+    { // Assert-only check that two lattices can be combined site by site; compiles to nothing when NDEBUG disables assert
+        assert(lhs._grid == rhs._grid); // must point at the very same grid object (pointer equality)
+        assert(lhs.checkerboard == rhs.checkerboard); // and carry the same checkerboard value
+    }
+
+}
+#endif
@@ -0,0 +1,56 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_coordinate.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_COORDINATE_H
+#define GRID_LATTICE_COORDINATE_H
+
+namespace Grid {
+
+    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
+    { // Overwrite every site of l with the mu-th entry of that site's global coordinate
+      typedef typename iobj::scalar_type scalar_type;
+      typedef typename iobj::vector_type vector_type;
+
+      GridBase *grid = l._grid;
+      int Nsimd = grid->iSites(); // inner-site count; sizes mergebuf (the inner loop below re-queries iSites())
+
+      std::vector<int> gcoor; // output buffer filled by RankIndexToGlobalCoor
+      std::vector<scalar_type> mergebuf(Nsimd); // one scalar per inner site of the current outer site
+
+      vector_type vI;
+      for(int o=0;o<grid->oSites();o++){
+	for(int i=0;i<grid->iSites();i++){
+	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); // presumably the global coordinate of (outer o, inner i) on this rank -- confirm against GridBase
+	  mergebuf[i]=(Integer)gcoor[mu];
+	}
+	merge<vector_type,scalar_type>(vI,mergebuf); // presumably packs the per-inner-site scalars into vI -- confirm against merge()
+	l._odata[o]=vI;
+      }
+    };
+
+}
+#endif
@@ -0,0 +1,75 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_local.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_LOCALREDUCTION_H
+#define GRID_LATTICE_LOCALREDUCTION_H
+
+///////////////////////////////////////////////
+// localInner, localNorm, outerProduct
+///////////////////////////////////////////////
+
+namespace Grid {
+
+  /////////////////////////////////////////////////////
+  // Site-local routines: reduced within each site, not across lattice sites
+  /////////////////////////////////////////////////////
+  
+  // localNorm2: ret[ss] = innerProduct(rhs[ss],rhs[ss]); returns a lattice, not a single number
+  template<class vobj>
+    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+    {
+      Lattice<typename vobj::tensor_reduced> ret(rhs._grid); // result is allocated on rhs's grid
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
+      }
+      return ret;
+    }
+  
+  // localInnerProduct: ret[ss] = innerProduct(lhs[ss],rhs[ss]); returns a lattice, not a single number
+  template<class vobj>
+    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
+    {
+      Lattice<typename vobj::tensor_reduced> ret(rhs._grid); // result is allocated on rhs's grid
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ // NOTE(review): lhs is indexed over rhs's range with no conformable(lhs,rhs) check
+	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
+      }
+      return ret;
+    }
+  
+  // outerProduct (per site): Scalar x Scalar -> Scalar
+  //                          Vector x Vector -> Matrix
+  template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+  {
+    Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); // site type is whatever the site-level outerProduct returns
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ // NOTE(review): lhs is indexed over rhs's range with no conformable(lhs,rhs) check
+      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
+  }
+}
+#endif
@@ -1,3 +1,30 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_overload.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_LATTICE_OVERLOAD_H
 #define GRID_LATTICE_OVERLOAD_H

@@ -10,8 +37,7 @@ namespace Grid {
  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
  {
    Lattice<vobj> ret(r._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<r._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
      vstream(ret._odata[ss], -r._odata[ss]);
    }
    return ret;
@@ -47,8 +73,7 @@ PARALLEL_FOR_LOOP
  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
  {
    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
      decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss]; 
      vstream(ret._odata[ss],tmp);
 	   //      ret._odata[ss]=lhs*rhs._odata[ss];
@@ -59,8 +84,7 @@ PARALLEL_FOR_LOOP
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
    {
      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 	decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];  
 	vstream(ret._odata[ss],tmp);
 	//	ret._odata[ss]=lhs+rhs._odata[ss];
@@ -71,11 +95,9 @@ PARALLEL_FOR_LOOP
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
  {
    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
      decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];  
      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs-rhs._odata[ss];
    }
    return ret;
  }
@@ -83,8 +105,7 @@ PARALLEL_FOR_LOOP
      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
    {
      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
 	vstream(ret._odata[ss],tmp);
 	//            ret._odata[ss]=lhs._odata[ss]*rhs;
@@ -95,8 +116,7 @@ PARALLEL_FOR_LOOP
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
    {
        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; 
 	  vstream(ret._odata[ss],tmp);
 	  //	  ret._odata[ss]=lhs._odata[ss]+rhs;
@@ -107,15 +127,12 @@ PARALLEL_FOR_LOOP
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
    {
      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ // site count comes from the lattice operand; rhs is the scalar
 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
 	  vstream(ret._odata[ss],tmp);
 	  //	ret._odata[ss]=lhs._odata[ss]-rhs;
      }
      return ret;
    }
-
-
 }
 #endif
@@ -1,3 +1,32 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_peekpoke.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_LATTICE_PEEK_H
 #define GRID_LATTICE_PEEK_H

@@ -15,22 +44,20 @@ namespace Grid {
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
      ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
+      }
+      return ret;
    };
+    // Two-index peek: builds a new lattice on lhs's grid, with lhs's checkerboard,
+    // whose every outer site is peekIndex<Index>(site,i,j) of the matching lhs site.
    template<int Index,class vobj>
-       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+      auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
      ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
+      }
+      return ret;
    };

    ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -39,25 +66,23 @@ PARALLEL_FOR_LOOP
+    // One-index poke: for every outer site, pokeIndex<Index> inserts the rhs site
+    // into position i of the matching lhs site. lhs is modified in place.
    template<int Index,class vobj> 
    void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
    {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
-	}      
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
+      }      
    }
+    // Two-index poke: for every outer site, pokeIndex<Index> inserts the rhs site
+    // into position (i,j) of the matching lhs site. lhs is modified in place.
    template<int Index,class vobj>
      void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
    {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
-	}      
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
+      }      
    }

    //////////////////////////////////////////////////////
    // Poke a scalar object into the SIMD array
    //////////////////////////////////////////////////////
    template<class vobj,class sobj>
-    void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+    void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){

      GridBase *grid=l._grid;

@@ -91,7 +116,7 @@ PARALLEL_FOR_LOOP
    // Peek a scalar object from the SIMD array
    //////////////////////////////////////////////////////////
    template<class vobj,class sobj>
-      void peekSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
+      void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
        
      GridBase *grid=l._grid;

@@ -102,9 +127,6 @@ PARALLEL_FOR_LOOP

      assert( l.checkerboard == l._grid->CheckerBoard(site));

-      // FIXME
-      //      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
-
      int rank,odx,idx;
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);

@@ -123,9 +145,9 @@ PARALLEL_FOR_LOOP
    // Peek a scalar object from the SIMD array
    //////////////////////////////////////////////////////////
    template<class vobj,class sobj>
-    void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
        
-      GridBase *grid=l._grid;
+      GridBase *grid = l._grid;

      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
@@ -135,16 +157,18 @@ PARALLEL_FOR_LOOP
      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));

+      static const int words=sizeof(vobj)/sizeof(vector_type);
      int odx,idx;
      idx= grid->iIndex(site);
      odx= grid->oIndex(site);

-      std::vector<sobj> buf(Nsimd);
-
-      extract(l._odata[odx],buf);
+      scalar_type * vp = (scalar_type *)&l._odata[odx];
+      scalar_type * pt = (scalar_type *)&s;
+      
+      for(int w=0;w<words;w++){
+        pt[w] = vp[idx+w*Nsimd];
+      }
      
-      s = buf[idx];
-
      return;
    };

@@ -161,18 +185,17 @@ PARALLEL_FOR_LOOP
      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));

+      static const int words=sizeof(vobj)/sizeof(vector_type);
      int odx,idx;
      idx= grid->iIndex(site);
      odx= grid->oIndex(site);

-      std::vector<sobj> buf(Nsimd);
-
-      // extract-modify-merge cycle is easiest way and this is not perf critical
-      extract(l._odata[odx],buf);
+      scalar_type * vp = (scalar_type *)&l._odata[odx];
+      scalar_type * pt = (scalar_type *)&s;
      
-      buf[idx] = s;
-
-      merge(l._odata[odx],buf);
+      for(int w=0;w<words;w++){
+        vp[idx+w*Nsimd] = pt[w];
+      }

      return;
    };
@@ -0,0 +1,57 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_reality.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_REALITY_H
+#define GRID_LATTICE_REALITY_H
+
+
+// FIXME: this is the part of the code whose design direction
+// I am most worried about.
+// The choice of burying complex numbers inside the SIMD types
+// makes the use of "real" and "imag" very cumbersome.
+
+namespace Grid {
+
+    // Site-wise adjoint: returns a new lattice on lhs's grid whose every
+    // outer site is adj() of the corresponding site of lhs.
+    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
+        Lattice<vobj> result(lhs._grid);
+        const int nsites = lhs._grid->oSites();
+        parallel_for(int site=0;site<nsites;site++){
+            result._odata[site] = adj(lhs._odata[site]);
+        }
+        return result;
+    };
+
+    // Site-wise complex conjugate: returns a new lattice on lhs's grid whose
+    // every outer site is conjugate() of the corresponding site of lhs.
+    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
+        Lattice<vobj> result(lhs._grid);
+        const int nsites = lhs._grid->oSites();
+        parallel_for(int site=0;site<nsites;site++){
+            result._odata[site] = conjugate(lhs._odata[site]);
+        }
+        return result;
+    };
+}
+#endif
@@ -0,0 +1,733 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./lib/lattice/Lattice_reduction.h
+    Copyright (C) 2015
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_REDUCTION_H
+#define GRID_LATTICE_REDUCTION_H
+
+#include <Grid/Grid_Eigen_Dense.h>
+
+namespace Grid {
+#ifdef GRID_WARN_SUBOPTIMAL
+#warning "Optimisation alert all these reduction loops are NOT threaded "
+#endif     
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Deterministic Reduction operations
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+// Squared norm of a lattice: the real part of innerProduct(arg,arg).
+template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
+  const auto selfProduct = innerProduct(arg,arg);
+  return std::real(selfProduct);
+}
+
+// Double-precision inner product of two lattices.
+// The outer sites are split into grid->SumArraySize() work slices; each slice
+// accumulates innerProductD over its sites, reduces across the SIMD lanes and
+// stores one partial sum. The partial sums are then added serially and
+// combined across nodes with GlobalSum.
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  GridBase *grid = left._grid;
+  const int pad = 8; // stride between the partial sums stored in sumarray
+
+  Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int mywork, myoff; // number of sites in this slice and index of its first site
+    GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
+
+    decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
+    }
+    // All threads sum across SIMD; reduce serial work at end
+    // one write per cacheline with streaming store
+    ComplexD tmp = Reduce(TensorRemove(vinner)) ;
+    vstream(sumarray[thr*pad],tmp);
+  }
+
+  // Serial sum over the per-slice partial results.
+  ComplexD inner(0.0);
+  for(int i=0;i<grid->SumArraySize();i++){
+    inner = inner+sumarray[i*pad];
+  }
+  right._grid->GlobalSum(inner);
+  return inner;
+}
+
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return the squared norm of z
+/////////////////////////
+// z = a*x + y: forwards to axpby_norm_fast with b = 1 and returns its result.
+template<class sobj,class vobj> strong_inline RealD 
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  return axpby_norm_fast(z,a,sobj(1.0),x,y);
+}
+
+// Computes z = a*x + b*y site by site and returns the squared norm of z,
+// summed over all sites and, via GlobalSum, over all nodes.
+// z takes x's checkerboard; conformable() is checked for (z,x) and (x,y).
+template<class sobj,class vobj> strong_inline RealD 
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  const int pad = 8; // stride between the partial sums stored in sumarray
+  z.checkerboard = x.checkerboard;
+  conformable(z,x);
+  conformable(x,y);
+
+  GridBase *grid = x._grid;
+
+  Vector<RealD> sumarray(grid->SumArraySize()*pad);
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int mywork, myoff; // number of sites in this slice and index of its first site
+    GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
+
+    // private to thread; sub summation
+    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vnrm = vnrm + innerProductD(tmp,tmp);
+      vstream(z._odata[ss],tmp);
+    }
+    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
+  }
+
+  RealD nrm = 0.0; // sum across threads; linear in thread count but fast
+  for(int i=0;i<grid->SumArraySize();i++){
+    nrm = nrm+sumarray[i*pad];
+  }
+  z._grid->GlobalSum(nrm);
+  return nrm;
+}
+
+ 
+// Sum of a unary lattice expression: evaluate it with closure(), then reduce the result.
+template<class Op,class T1>
+inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
+{
+  const auto &evaluated = closure(expr);
+  return sum(evaluated);
+}
+
+// Sum of a binary lattice expression: evaluate it with closure(), then reduce the result.
+template<class Op,class T1,class T2>
+inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
+{
+  const auto &evaluated = closure(expr);
+  return sum(evaluated);
+}
+
+
+// Sum of a trinary lattice expression: evaluate it with closure(), then reduce the result.
+template<class Op,class T1,class T2,class T3>
+inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				      eval(0,std::get<1>(expr.second)),
+				      eval(0,std::get<2>(expr.second))
+				      ))::scalar_object
+{
+  const auto &evaluated = closure(expr);
+  return sum(evaluated);
+}
+
+// Sums a lattice over all sites and returns the scalar object.
+// Each of the grid->SumArraySize() work slices adds up its outer sites; the
+// partial sums are added serially, the SIMD lanes of the total are extracted
+// and added, and GlobalSum combines the result across nodes.
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  GridBase *grid=arg._grid;
+  int Nsimd = grid->Nsimd();
+
+  std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
+  for(int i=0;i<grid->SumArraySize();i++){
+    sumarray[i]=zero;
+  }
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int mywork, myoff; // number of sites in this slice and index of its first site
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+
+    vobj vvsum=zero;
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vvsum = vvsum + arg._odata[ss];
+    }
+    sumarray[thr]=vvsum;
+  }
+
+  vobj vsum=zero;  // sum across threads
+  for(int i=0;i<grid->SumArraySize();i++){
+    vsum = vsum+sumarray[i];
+  }
+
+  typedef typename vobj::scalar_object sobj;
+  sobj ssum=zero;
+
+  // Split the vector total into its SIMD lanes and add them up.
+  std::vector<sobj>               buf(Nsimd);
+  extract(vsum,buf);
+
+  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
+  arg._grid->GlobalSum(ssum);
+
+  return ssum;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Sums Data over every slice orthogonal to orthogdim.
+// result is resized to the full extent along orthogdim; after the global sums
+// every node holds the same vector.
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+{
+  ///////////////////////////////////////////////////////
+  // FIXME precision promoted summation
+  // may be important for correlation functions
+  // But easily avoided by using double precision fields
+  ///////////////////////////////////////////////////////
+  typedef typename vobj::scalar_object sobj;
+  GridBase  *grid = Data._grid;
+  assert(grid!=NULL);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  const int fullExtent    = grid->_fdimensions[orthogdim];
+  const int localExtent   = grid->_ldimensions[orthogdim];
+  const int reducedExtent = grid->_rdimensions[orthogdim];
+
+  std::vector<vobj,alignedAllocator<vobj> > planeVec(reducedExtent); // per-plane vector sums
+  std::vector<sobj> planeScalar(localExtent,zero);                   // per local coordinate scalar sums
+  std::vector<sobj> lanes(Nsimd);                                    // one entry per SIMD lane
+
+  result.resize(fullExtent);
+  for(int plane=0;plane<reducedExtent;plane++){
+    planeVec[plane]=zero;
+  }
+
+  const int nblock = grid->_slice_nblock[orthogdim];
+  const int block  = grid->_slice_block [orthogdim];
+  const int stride = grid->_slice_stride[orthogdim];
+
+  // Step 1: vector sum over each reduced plane, parallel over planes.
+  parallel_for(int plane=0;plane<reducedExtent;plane++){
+    const int base = plane*grid->_ostride[orthogdim]; // first site of this plane
+    vobj &acc = planeVec[plane];
+    for(int n=0;n<nblock;n++){
+      const int row = base+n*stride;
+      for(int b=0;b<block;b++){
+        acc = acc+Data._odata[row+b];
+      }
+    }
+  }
+
+  // Step 2: split the SIMD lanes and add each lane to its local coordinate.
+  std::vector<int> laneCoor(Nd);
+  for(int plane=0;plane<reducedExtent;plane++){
+    extract(planeVec[plane],lanes);
+    for(int lane=0;lane<Nsimd;lane++){
+      grid->iCoorFromIindex(laneCoor,lane);
+      const int local = plane+laneCoor[orthogdim]*reducedExtent;
+      planeScalar[local] = planeScalar[local]+lanes[lane];
+    }
+  }
+
+  // Step 3: one global sum per coordinate; only the owning processor plane contributes.
+  for(int t=0;t<fullExtent;t++){
+    const int owner = t/localExtent;
+    const int local = t%localExtent;
+    sobj contribution;
+    if ( owner == grid->_processor_coor[orthogdim] ) {
+      contribution = planeScalar[local];
+    } else {
+      contribution = zero;
+    }
+    grid->GlobalSum(contribution);
+    result[t] = contribution;
+  }
+}
+
+// Slice inner product in two stages: node-local sums first, then the sum over nodes.
+template<class vobj>
+static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
+{
+  typedef typename vobj::scalar_type scalar_type;
+
+  std::vector<scalar_type> localSums; // filled by the local stage, consumed by the global stage
+  localSliceInnerProductVector(result, lhs, rhs, localSums, orthogdim);
+  globalSliceInnerProductVector(result, lhs, localSums, orthogdim);
+}
+
+// Node-local stage of the slice inner product.
+// For every local coordinate along orthogdim, sums innerProduct(lhs,rhs) over
+// the sites of that slice and stores the value in lsSum, which is set to the
+// local extent and zeroed first. result is only resized to the full extent
+// along orthogdim; its entries are not written here.
+template <class vobj>
+static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs._grid;
+  assert(grid!=NULL);
+  conformable(grid,rhs._grid);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+
+  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
+  // assign, not resize: resize would keep stale values of a non-empty lsSum
+  // and the accumulation below would add on top of them.
+  lsSum.assign(ld,scalar_type(0.0));                    // sum across these down to scalars
+  std::vector<iScalar<scalar_type>> extracted(Nsimd);   // splitting the SIMD
+
+  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
+  for(int r=0;r<rd;r++){
+    lvSum[r]=zero;
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  parallel_for(int r=0;r<rd;r++)
+  {
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int ss = so + n * stride + b;
+        // Declared per iteration: one variable outside the parallel loop would be
+        // shared and written by every thread at once (data race).
+        vector_type vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
+        lvSum[r] = lvSum[r] + vv;
+      }
+    }
+  }
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  std::vector<int> icoor(Nd);
+  for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> temp; 
+    temp._internal = lvSum[rt];
+    extract(temp,extracted);
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx);
+
+      int ldx =rt+icoor[orthogdim]*rd;
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
+
+    }
+  }
+}
+// Global stage of the slice inner product.
+// result.size() is taken as the full extent along orthogdim and lsSum.size() as
+// the local extent. Each node places its local sums at the global positions of
+// its own processor plane (zeros elsewhere), GlobalSumVector adds across nodes,
+// and the totals are written to result.
+template <class vobj>
+static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  GridBase *grid = lhs._grid;
+  int fd = result.size();
+  int ld = lsSum.size();
+  // sum over nodes.
+  std::vector<scalar_type> gsum(fd, scalar_type(0.0));
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld;
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum[t]=lsSum[lt];
+    }
+  }
+  grid->GlobalSumVector(&gsum[0], fd);
+
+  // Element-wise copy, as in sliceInnerProductVector: whole-vector assignment
+  // only compiles when scalar_type is exactly ComplexD.
+  for(int t=0;t<fd;t++){
+    result[t]=gsum[t];
+  }
+}
+// Inner product of lhs and rhs on every slice orthogonal to orthogdim.
+// result is resized to the full extent along orthogdim; after the global sums
+// every node holds the same vector.
+template<class vobj>
+static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
+{
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs._grid;
+  assert(grid!=NULL);
+  conformable(grid,rhs._grid);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  const int fullExtent    = grid->_fdimensions[orthogdim];
+  const int localExtent   = grid->_ldimensions[orthogdim];
+  const int reducedExtent = grid->_rdimensions[orthogdim];
+
+  std::vector<vector_type,alignedAllocator<vector_type> > planeVec(reducedExtent); // per-plane vector sums
+  std::vector<scalar_type > planeScalar(localExtent,scalar_type(0.0));             // per local coordinate scalar sums
+  std::vector<iScalar<scalar_type> > lanes(Nsimd);                                 // one entry per SIMD lane
+
+  result.resize(fullExtent);
+  for(int plane=0;plane<reducedExtent;plane++){
+    planeVec[plane]=zero;
+  }
+
+  const int nblock = grid->_slice_nblock[orthogdim];
+  const int block  = grid->_slice_block [orthogdim];
+  const int stride = grid->_slice_stride[orthogdim];
+
+  // Step 1: vector sum of the site inner products on each reduced plane.
+  parallel_for(int plane=0;plane<reducedExtent;plane++){
+    const int base = plane*grid->_ostride[orthogdim]; // first site of this plane
+    vector_type &acc = planeVec[plane];
+    for(int n=0;n<nblock;n++){
+      const int row = base+n*stride;
+      for(int b=0;b<block;b++){
+        const int site = row+b;
+        vector_type siteProduct = TensorRemove(innerProduct(lhs._odata[site],rhs._odata[site]));
+        acc = acc+siteProduct;
+      }
+    }
+  }
+
+  // Step 2: split the SIMD lanes and add each lane to its local coordinate.
+  std::vector<int> laneCoor(Nd);
+  for(int plane=0;plane<reducedExtent;plane++){
+    iScalar<vector_type> wrapped;
+    wrapped._internal = planeVec[plane];
+    extract(wrapped,lanes);
+    for(int lane=0;lane<Nsimd;lane++){
+      grid->iCoorFromIindex(laneCoor,lane);
+      const int local = plane+laneCoor[orthogdim]*reducedExtent;
+      planeScalar[local] = planeScalar[local]+lanes[lane]._internal;
+    }
+  }
+
+  // Step 3: one global sum per coordinate; only the owning processor plane contributes.
+  for(int t=0;t<fullExtent;t++){
+    const int owner = t/localExtent;
+    const int local = t%localExtent;
+    scalar_type contribution;
+    if ( owner == grid->_processor_coor[orthogdim] ) {
+      contribution = planeScalar[local];
+    } else {
+      contribution = scalar_type(0.0);
+    }
+    grid->GlobalSum(contribution);
+    result[t] = contribution;
+  }
+}
+// Squared norm of rhs on every slice orthogonal to Orthog.
+// sn is resized to the global extent along Orthog and receives the real part
+// of sliceInnerProductVector(rhs,rhs) for each slice.
+template<class vobj>
+static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
+{
+  int Nblock = rhs._grid->GlobalDimensions()[Orthog];
+  std::vector<ComplexD> ip(Nblock);
+  sn.resize(Nblock);
+
+  sliceInnerProductVector(ip,rhs,rhs,Orthog);
+  for(int ss=0;ss<Nblock;ss++){
+    sn[ss] = real(ip[ss]);
+  }
+};
+
+
+// R = (scale*a[ldx]) * X + Y, with one coefficient per slice orthogonal to orthogdim.
+// For reduced plane r and SIMD lane l the coefficient index is
+// ldx = r + icoor[orthogdim]*rd, the same local-coordinate formula the slice
+// reductions in this file use, so a is read at local coordinates.
+template<class vobj>
+static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
+			    int orthogdim,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::tensor_reduced tensor_reduced;
+
+  scalar_type zscale(scale);
+
+  GridBase *grid  = X._grid;
+
+  int Nsimd  =grid->Nsimd();
+  int rd     =grid->_rdimensions[orthogdim];
+
+  int e1     =grid->_slice_nblock[orthogdim];
+  int e2     =grid->_slice_block [orthogdim];
+  int stride =grid->_slice_stride[orthogdim];
+
+  // Sized up front, like the coordinate vectors in the other slice routines.
+  std::vector<int> icoor(grid->_ndimension);
+
+  for(int r=0;r<rd;r++){
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    // Build a SIMD word holding the coefficient of each lane's slice.
+    vector_type    av;
+    scalar_type *as =(scalar_type *)&av;
+
+    for(int l=0;l<Nsimd;l++){
+      grid->iCoorFromIindex(icoor,l);
+      int ldx =r+icoor[orthogdim]*rd;
+      as[l] = scalar_type(a[ldx])*zscale;
+    }
+
+    tensor_reduced at; at=av;
+
+    parallel_for_nest2(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
+      }
+    }
+  }
+};
+
+/*
+inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
+{
+  int NN    = BlockSolverGrid->_ndimension;
+  int nsimd = BlockSolverGrid->Nsimd();
+  
+  std::vector<int> latt_phys(0);
+  std::vector<int> simd_phys(0);
+  std::vector<int>  mpi_phys(0);
+  
+  for(int d=0;d<NN;d++){
+    if( d!=Orthog ) { 
+      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+    }
+  }
+  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
+}
+*/
+
+// For every line of sites running along direction Orthog:
+//   R[i] = Y[i] + sum_j X[j]*(scale*aa(j,i)),   i,j = 0 .. Nblock-1,
+// where Nblock is the global extent along Orthog.
+// Orthog must have SIMD layout 1 (asserted below).
+template<class vobj>
+static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+{    
+  int Nblock = X._grid->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X._grid;
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    // Per-thread copy of the X sites on the current line, taken before R is written.
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b; // offset of this line within the orthogonal plane
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = Y[o+i*ostride];
+	for(int j=0;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
+  }
+};
+
+// For every line of sites running along direction Orthog:
+//   R[i] = sum_j X[j]*(scale*aa(j,i)),   i,j = 0 .. Nblock-1,
+// where Nblock is the global extent along Orthog.
+// Orthog must have SIMD layout 1 (asserted below).
+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  int Nblock = X._grid->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X._grid;
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    // Per-thread copy of the X sites on the current line, taken before R is written.
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b; // offset of this line within the orthogonal plane
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	// the j=0 term initialises the sum; the remaining terms are accumulated
+	dot = s_x[0]*(scale*aa(0,i));
+	for(int j=1;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
+  }
+
+};
+
+
+// Slice-by-slice inner product matrix along direction Orthog:
+//   mat(i,j) = sum over the orthogonal plane of innerProduct(lhs at slice i, rhs at slice j),
+// with i,j = 0 .. Nblock-1 and Nblock the global extent along Orthog.
+// Each thread accumulates a private matrix, the private matrices are added
+// under a critical section, and every entry is then summed across nodes.
+// Orthog must have SIMD layout 1 (asserted below).
+template<class vobj>
+static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+{
+  GridBase *FullGrid  = lhs._grid;
+
+  int Nblock = FullGrid->GlobalDimensions()[Orthog];
+
+  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+
+#pragma omp parallel 
+  {
+    std::vector<vobj> Left(Nblock);
+    std::vector<vobj> Right(Nblock);
+    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); // private to this thread
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+
+      int o  = n*stride + b; // offset of this line within the orthogonal plane
+
+      for(int i=0;i<Nblock;i++){
+	Left [i] = lhs[o+i*ostride];
+	Right[i] = rhs[o+i*ostride];
+      }
+
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+	auto tmp = innerProduct(Left[i],Right[j]);
+	auto rtmp = TensorRemove(tmp);
+	mat_thread(i,j) += Reduce(rtmp);
+      }}
+    }}
+#pragma omp critical
+    {
+      mat += mat_thread;
+    }  
+  }
+
+  // Sum every matrix entry across nodes.
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum);
+    mat(i,j)=sum;
+  }}
+
+  return;
+}
+
+} /*END NAMESPACE GRID*/
+#endif
+
+
+
@@ -0,0 +1,516 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_rng.h
+
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_RNG_H
+#define GRID_LATTICE_RNG_H
+
+#include <random>
+
+#ifdef RNG_SITMO
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#endif 
+
+#if defined(RNG_SITMO)
+#define RNG_FAST_DISCARD
+#else 
+#undef  RNG_FAST_DISCARD
+#endif
+
+namespace Grid {
+
+  //////////////////////////////////////////////////////////////
+  // Allow the RNG state to be less dense than the fine grid
+  //////////////////////////////////////////////////////////////
+  // Returns the number of fine-grid reduced sites served by each coarse-grid
+  // (RNG) reduced site. The fine grid may have extra leading dimensions; these
+  // must have SIMD layout 1 and a single processor. The remaining dimensions
+  // must share the coarse grid's processor and SIMD layout and be an exact
+  // multiple of the coarse reduced dimensions. All conditions are asserted.
+  inline int RNGfillable(GridBase *coarse,GridBase *fine)
+  {
+    const int coarse_nd = coarse->_ndimension;
+
+    // number of leading dimensions present only on the fine grid
+    const int extra_nd = fine->_ndimension - coarse_nd;
+    assert(extra_nd >= 0);
+
+    int multiplicity=1;
+
+    // extra dimensions: purely local, every site along them shares one RNG
+    for(int d=0;d<extra_nd;d++){
+      assert(fine->_simd_layout[d]==1);
+      assert(fine->_processors[d]==1);
+      multiplicity = multiplicity*fine->_rdimensions[d];
+    }
+
+    // shared dimensions: layouts agree and volumes subdivide cleanly
+    for(int d=0;d<coarse_nd;d++){
+      const int fd = d+extra_nd;
+      assert(coarse->_processors[d]  == fine->_processors[fd]);
+      assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+      assert((fine->_rdimensions[fd] % coarse->_rdimensions[d]) == 0);
+
+      multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; 
+    }
+    return multiplicity;
+  }
+
+  
+// merge of April 11 2017
+  // this function is necessary for the LS vectorised field
+  // Returns fine->lSites() / coarse->lSites(), i.e. how many fine local sites
+  // correspond to one coarse local site. Unlike RNGfillable, only the processor
+  // decomposition and the total SIMD width have to agree between the two grids.
+  inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
+  {
+    const int coarse_nd = coarse->_ndimension;
+
+    // number of leading dimensions present only on the fine grid
+    const int extra_nd = fine->_ndimension - coarse_nd;
+    assert(extra_nd >= 0);
+
+    // the extra dimensions must not be spread over processors ...
+    for(int d=0;d<extra_nd;d++){
+      assert(fine->_processors[d]==1);
+    }
+    // ... and the shared dimensions must use the same processor decomposition
+    for(int d=0;d<coarse_nd;d++){
+      assert(coarse->_processors[d] == fine->_processors[d+extra_nd]);
+    }
+
+    // the total number of SIMD lanes must agree, so the inner sites are the same
+    assert(fine->Nsimd() == coarse->Nsimd());
+
+    // the local volumes must divide cleanly
+    assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+
+    return fine->lSites() / coarse->lSites();
+  }
+  
+  // real scalars are one component
+  // Generic overload: assigns a single draw dist(gen) to s.
+  template<class scalar,class distribution,class generator> 
+  void fillScalar(scalar &s,distribution &dist,generator & gen)
+  {
+    s=dist(gen);
+  }
+  // ComplexF overload: consumes two draws, passed as the two constructor arguments
+  // (presumably real and imaginary part).
+  // NOTE(review): both draws are arguments of one call, so the C++ standard does not
+  // fix which draw is taken first -- confirm whether cross-compiler reproducibility matters.
+  template<class distribution,class generator> 
+  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+  {
+    s=ComplexF(dist(gen),dist(gen));
+  }
+  // ComplexD overload: same as the ComplexF overload, two draws per value
+  // (the same evaluation-order caveat applies).
+  template<class distribution,class generator> 
+  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+  {
+    s=ComplexD(dist(gen),dist(gen));
+  }
+  
+  // Common base of the serial and parallel RNGs: owns a vector of engines plus
+  // one distribution object per engine, and provides helpers to reseed engines
+  // and to save / restore their state as a vector of integers.
+  class GridRNGbase {
+  public:
+    // One generator per site.
+    // Uniform and Gaussian distributions from these generators.
+    // The engine type, the integer type used to store its state and the number
+    // of state words are selected by exactly one of the RNG_* macros below.
+#ifdef RNG_RANLUX
+    typedef std::ranlux48 RngEngine;
+    typedef uint64_t      RngStateType;
+    static const int RngStateCount = 15;
+#endif 
+#ifdef RNG_MT19937 
+    typedef std::mt19937 RngEngine;
+    typedef uint32_t     RngStateType;
+    static const int     RngStateCount = std::mt19937::state_size;
+#endif
+#ifdef RNG_SITMO
+    typedef sitmo::prng_engine 	RngEngine;
+    typedef uint64_t    	RngStateType;
+    static const int    	RngStateCount = 13;
+#endif
+
+    // Parallel arrays: entry i of each distribution vector is used with _generators[i]
+    std::vector<RngEngine>                             _generators;
+    std::vector<std::uniform_real_distribution<RealD> > _uniform;
+    std::vector<std::normal_distribution<RealD> >       _gaussian;
+    std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+    std::vector<std::uniform_int_distribution<uint32_t> > _uid;
+
+    ///////////////////////
+    // support for parallel init
+    ///////////////////////
+#ifdef RNG_FAST_DISCARD
+    // Advances eng by site * 2^30 draws (asserts that the shift did not overflow).
+    static void Skip(RngEngine &eng,uint64_t site)
+    {
+      /////////////////////////////////////////////////////////////////////////////////////
+      // Skip by 2^40 elements between successive lattice sites
+      // This goes by 10^12.
+      // Consider quenched updating; likely never exceeding rate of 1000 sweeps
+      // per second on any machine. This gives us of order 10^9 seconds, or 100 years
+      // skip ahead.
+      // For HMC unlikely to go at faster than a solve per second, and 
+      // tens of seconds per trajectory so this is clean in all reasonable cases,
+      // and margin of safety is orders of magnitude.
+      // We could hack Sitmo to skip in the higher order words of state if necessary
+      //
+      // Replace with 2^30 ; avoid problem on large volumes
+      //
+      /////////////////////////////////////////////////////////////////////////////////////
+      //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
+      const int shift = 30;
+
+      uint64_t skip = site;
+
+      skip = skip<<shift;
+
+      assert((skip >> shift)==site); // check for overflow
+
+      eng.discard(skip);
+      //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
+    } 
+#endif
+    // Convenience overload: reseed using a local seed buffer and a local distribution.
+    static RngEngine Reseed(RngEngine &eng)
+    {
+      std::vector<uint32_t> newseed;
+      std::uniform_int_distribution<uint32_t> uid;
+      return Reseed(eng,newseed,uid);
+    }
+    // Draws four 32-bit integers from eng (advancing it), stores them in newseed,
+    // and returns a fresh engine seeded from them through std::seed_seq.
+    // newseed and uid are caller-provided so they can be reused across calls.
+    static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
+			    std::uniform_int_distribution<uint32_t> &uid)
+    {
+      const int reseeds=4;
+      
+      newseed.resize(reseeds);
+      for(int i=0;i<reseeds;i++){
+	newseed[i] = uid(eng);
+      }
+      std::seed_seq sseq(newseed.begin(),newseed.end());
+      return RngEngine(sseq);
+    }    
+
+    // Saves the state of eng: the engine is streamed to text with operator<<
+    // and RngStateCount integers are parsed back into saved.
+    void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
+      saved.resize(RngStateCount);
+      std::stringstream ss;
+      ss<<eng;
+      ss.seekg(0,ss.beg);
+      for(int i=0;i<RngStateCount;i++){
+        ss>>saved[i];
+      }
+    }
+    // Saves the state of the gen-th stored generator.
+    void GetState(std::vector<RngStateType> & saved,int gen) {
+      GetState(saved,_generators[gen]);
+    }
+    // Restores eng from saved (must hold exactly RngStateCount words): the words
+    // are written space-separated to text and read into the engine with operator>>.
+    void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
+      assert(saved.size()==RngStateCount);
+      std::stringstream ss;
+      for(int i=0;i<RngStateCount;i++){
+        ss<< saved[i]<<" ";
+      }
+      ss.seekg(0,ss.beg);
+      ss>>eng;
+    }
+    // Restores the gen-th stored generator from saved.
+    void SetState(std::vector<RngStateType> & saved,int gen){
+      SetState(saved,_generators[gen]);
+    }
+    // Overwrites the gen-th stored generator with a copy of Eng.
+    void SetEngine(RngEngine &Eng, int gen){
+      _generators[gen]=Eng;
+    }
+    // Copies the gen-th stored generator into Eng.
+    void GetEngine(RngEngine &Eng, int gen){
+      Eng=_generators[gen];
+    }
+    // Replaces the gen-th stored generator by a new engine constructed from src
+    // (e.g. a std::seed_seq).
+    template<class source> void Seed(source &src, int gen)
+    {
+      _generators[gen] = RngEngine(src);
+    }    
+  };
+
+  // Single-stream RNG: holds exactly one generator (index 0). Every fill()
+  // overload draws from that generator and then calls
+  // CartesianCommunicator::BroadcastWorld with root 0 on the filled object.
+  class GridSerialRNG : public GridRNGbase {
+  public:
+
+    // Creates the single generator and one instance of each distribution.
+    GridSerialRNG() : GridRNGbase() {
+      _generators.resize(1);
+      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
+      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+      _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
+    }
+
+    // Generic fill: treats l as a flat array of sobj::scalar_type and fills each
+    // element with fillScalar. The distribution is reset first so that no cached
+    // value from a previous draw is reused.
+    template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
+
+      typedef typename sobj::scalar_type scalar_type;
+ 
+      int words = sizeof(sobj)/sizeof(scalar_type);
+
+      scalar_type *buf = (scalar_type *) & l;
+
+      dist[0].reset();
+      for(int idx=0;idx<words;idx++){
+        fillScalar(buf[idx],dist[0],_generators[0]);
+      }
+
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+
+    }
+
+    // Scalar fills: one fillScalar call per value, then broadcast.
+    template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
+      dist[0].reset();
+      fillScalar(l,dist[0],_generators[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){
+      dist[0].reset();
+      fillScalar(l,dist[0],_generators[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){
+      dist[0].reset();
+      fillScalar(l,dist[0],_generators[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){
+      dist[0].reset();
+      fillScalar(l,dist[0],_generators[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+
+    // SIMD vector fills: the vector is viewed as an array of real words
+    // (2*Nsimd() for the complex types, Nsimd() for the real types) and each
+    // word receives its own draw.
+    template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){
+      RealF *pointer=(RealF *)&l;
+      dist[0].reset();
+      for(int i=0;i<2*vComplexF::Nsimd();i++){
+        fillScalar(pointer[i],dist[0],_generators[0]);
+      }
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){
+      RealD *pointer=(RealD *)&l;
+      dist[0].reset();
+      for(int i=0;i<2*vComplexD::Nsimd();i++){
+        fillScalar(pointer[i],dist[0],_generators[0]);
+      }
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){
+      RealF *pointer=(RealF *)&l;
+      dist[0].reset();
+      for(int i=0;i<vRealF::Nsimd();i++){
+        fillScalar(pointer[i],dist[0],_generators[0]);
+      }
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){
+      RealD *pointer=(RealD *)&l;
+      dist[0].reset();
+      for(int i=0;i<vRealD::Nsimd();i++){
+        fillScalar(pointer[i],dist[0],_generators[0]);
+      }
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    
+    // Seeds the generator from a list of integers. The seed buffer is first
+    // passed to BroadcastWorld with root 0, which writes through the const
+    // reference on the receiving ranks (hence the const_cast).
+    // data() is used instead of &seeds[0] so that an empty list is well defined.
+    void SeedFixedIntegers(const std::vector<int> &seeds){
+      CartesianCommunicator::BroadcastWorld(0,const_cast<int *>(seeds.data()),sizeof(int)*seeds.size());
+      std::seed_seq src(seeds.begin(),seeds.end());
+      Seed(src,0);
+    }
+
+    // Seeds the generator from the SHA256-derived seeds of the string s and logs
+    // both the string and the digest. The digest text comes from
+    // GridChecksum::sha256_string, the same helper GridParallelRNG uses.
+    void SeedUniqueString(const std::string &s){
+      std::vector<int> seeds;
+      seeds = GridChecksum::sha256_seeds(s);
+      std::cout << GridLogMessage << "Intialising serial RNG with unique string '" 
+                << s << "'" << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
+      SeedFixedIntegers(seeds);
+    }
+  };
+
+  // Parallel RNG: one generator (and one instance of each distribution) per
+  // local site of the grid passed to the constructor, i.e. iSites()*oSites()
+  // entries, indexed through generator_idx().
+  class GridParallelRNG : public GridRNGbase {
+
+    // Time accumulated inside fill(), in usecond() units; printed by Report()
+    double _time_counter;
+
+  public:
+    GridBase *_grid;
+    unsigned int _vol;
+
+    // Maps (outer site, inner/SIMD site) to the index of its generator.
+    int generator_idx(int os,int is) {
+      return is*_grid->oSites()+os;
+    }
+
+    // Allocates one generator and one of each distribution per local site.
+    // The fill() timer starts at zero so that Report() never reads an
+    // indeterminate value.
+    GridParallelRNG(GridBase *grid)
+      : GridRNGbase(),
+        _time_counter(0.0),
+        _grid(grid),
+        _vol(grid->iSites()*grid->oSites())
+    {
+      _generators.resize(_vol);
+      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
+      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+      _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
+    }
+
+    // Fills every site of l with draws from dist. l may live on a finer grid
+    // than the RNG grid (checked by RNGfillable_general); in that case each
+    // generator is drawn from "multiplicity" times, for consecutive fine sites.
+    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
+
+      typedef typename vobj::scalar_object scalar_object;
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      double inner_time_counter = usecond();
+
+      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
+      int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
+      int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
+      int words  = sizeof(scalar_object) / sizeof(scalar_type);
+
+      parallel_for(int ss=0;ss<osites;ss++){
+        std::vector<scalar_object> buf(Nsimd);
+        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
+
+          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
+
+          for (int si = 0; si < Nsimd; si++) {
+            
+            int gdx = generator_idx(ss, si);  // index of generator state
+            scalar_type *pointer = (scalar_type *)&buf[si];
+            dist[gdx].reset();
+            for (int idx = 0; idx < words; idx++) 
+              fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
+          }
+          // merge into SIMD lanes, FIXME suboptimal implementation
+          merge(l._odata[sm], buf);
+        }
+      }
+
+      _time_counter += usecond()- inner_time_counter;
+    }
+
+    // Seeds all generators from the SHA256-derived seeds of the string s and
+    // logs both the string and the digest.
+    void SeedUniqueString(const std::string &s){
+      std::vector<int> seeds;
+      seeds = GridChecksum::sha256_seeds(s);
+      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
+                << s << "'" << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
+      SeedFixedIntegers(seeds);
+    }
+
+    // Seeds all generators from a list of integers. The seed buffer is first
+    // passed to BroadcastWorld with root 0, which writes through the const
+    // reference on the receiving ranks (hence the const_cast).
+    // data() is used instead of &seeds[0] so that an empty list is well defined.
+    void SeedFixedIntegers(const std::vector<int> &seeds){
+
+      // Everyone generates the same seed_seq based on input seeds
+      CartesianCommunicator::BroadcastWorld(0,const_cast<int *>(seeds.data()),sizeof(int)*seeds.size());
+
+      std::seed_seq source(seeds.begin(),seeds.end());
+
+      RngEngine master_engine(source);
+
+#ifdef RNG_FAST_DISCARD
+      ////////////////////////////////////////////////
+      // Skip ahead through a single stream.
+      // Applicable to SITMO and other hash based/crypto RNGs
+      // Should be applicable to Mersenne Twister, but the C++11
+      // MT implementation does not implement fast discard even though
+      // in principle this is possible
+      ////////////////////////////////////////////////
+
+      // Everybody loops over global volume.
+      parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
+
+        // Where is it?
+        int rank,o_idx,i_idx;
+        std::vector<int> gcoor;
+
+        _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
+        _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+        // If this is one of mine we take it
+        if( rank == _grid->ThisRank() ){
+          int l_idx=generator_idx(o_idx,i_idx);
+          _generators[l_idx] = master_engine;
+          Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
+        }
+
+      }
+#else 
+      ////////////////////////////////////////////////////////////////
+      // Machine and thread decomposition dependent seeding is efficient
+      // and maximally parallel; but NOT reproducible from machine to machine. 
+      // Not ideal, but fastest way to reseed all nodes.
+      ////////////////////////////////////////////////////////////////
+      {
+        // Obtain one Reseed per processor
+        int Nproc = _grid->ProcessorCount();
+        std::vector<RngEngine> seeders(Nproc);
+        int me= _grid->ThisRank();
+        for(int p=0;p<Nproc;p++){
+          seeders[p] = Reseed(master_engine);
+        }
+        master_engine = seeders[me];
+      }
+
+      {
+        // Obtain one reseeded generator per thread
+        int Nthread = GridThread::GetThreads();
+        std::vector<RngEngine> seeders(Nthread);
+        for(int t=0;t<Nthread;t++){
+          seeders[t] = Reseed(master_engine);
+        }
+
+        parallel_for(int t=0;t<Nthread;t++) {
+          // set up one per local site in threaded fashion
+          std::vector<uint32_t> newseeds;
+          std::uniform_int_distribution<uint32_t> uid;
+          // Thread t owns the local sites t, t+Nthread, t+2*Nthread, ...
+          // (the same sites, in the same order, as testing l%Nthread==t for every l)
+          for(int l=t;l<_grid->lSites();l+=Nthread) {
+            _generators[l] = Reseed(seeders[t],newseeds,uid);
+          }
+        }
+      }
+#endif
+    }
+
+    // Logs the total time spent in fill(), converted from microseconds to milliseconds.
+    void Report(){
+      std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////
+    // Support for rigorous test of RNG's
+    // Return uniform random uint32_t from requested site generator
+    ////////////////////////////////////////////////////////////////////////
+    uint32_t GlobalU01(int gsite){
+
+      // Defined on every rank; the owning rank overwrites it before the broadcast
+      uint32_t the_number = 0;
+      // who
+      std::vector<int> gcoor;
+      int rank,o_idx,i_idx;
+      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
+      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+      // draw
+      int l_idx=generator_idx(o_idx,i_idx);
+      if( rank == _grid->ThisRank() ){
+        the_number = _uid[l_idx](_generators[l_idx]);
+      }
+      
+      // share & return
+      _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
+      return the_number;
+    }
+
+  };
+
+  // Lattice fills: forward to GridParallelRNG::fill with the matching distribution table.
+  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)
+  {
+    rng.fill(l,rng._uniform);
+  }
+  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l)
+  {
+    rng.fill(l,rng._gaussian);
+  }
+  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l)
+  {
+    rng.fill(l,rng._bernoulli);
+  }
+
+  // Single-object fills: forward to GridSerialRNG::fill with the matching distribution table.
+  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)
+  {
+    rng.fill(l,rng._uniform);
+  }
+  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l)
+  {
+    rng.fill(l,rng._gaussian);
+  }
+  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l)
+  {
+    rng.fill(l,rng._bernoulli);
+  }
+
+}
+#endif
@@ -0,0 +1,67 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_trace.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_TRACE_H
+#define GRID_LATTICE_TRACE_H
+
+///////////////////////////////////////////////
+// Tracing, transposing, peeking, poking
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Site-wise trace: applies trace() to every outer site of lhs and returns
+    // the results as a new lattice on the same grid.
+    template<class vobj>
+    inline auto trace(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(trace(lhs._odata[0]))>
+    {
+      typedef decltype(trace(lhs._odata[0])) traced_type;
+
+      Lattice<traced_type> result(lhs._grid);
+      parallel_for(int site=0;site<lhs._grid->oSites();site++){
+        result._odata[site] = trace(lhs._odata[site]);
+      }
+      return result;
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace Index level dependent operation
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Site-wise trace over the tensor level selected by Index: applies
+    // traceIndex<Index>() to every outer site of lhs.
+    template<int Index,class vobj>
+    inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    {
+      typedef decltype(traceIndex<Index>(lhs._odata[0])) traced_type;
+
+      Lattice<traced_type> result(lhs._grid);
+      parallel_for(int site=0;site<lhs._grid->oSites();site++){
+        result._odata[site] = traceIndex<Index>(lhs._odata[site]);
+      }
+      return result;
+    }
+
+
+}
+#endif
+
@@ -0,0 +1,63 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_transpose.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_TRANSPOSE_H
+#define GRID_LATTICE_TRANSPOSE_H
+
+///////////////////////////////////////////////
+// Transpose
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Transpose
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site-wise transpose: applies transpose() to every outer site of lhs and
+  // returns the results as a new lattice on the same grid.
+  template<class vobj>
+    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
+    Lattice<vobj> result(lhs._grid);
+    parallel_for(int site=0;site<lhs._grid->oSites();site++){
+      result._odata[site] = transpose(lhs._odata[site]);
+    }
+    return result;
+  }
+    
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Index level dependent transpose
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site-wise transpose of the tensor level selected by Index: applies
+  // transposeIndex<Index>() to every outer site of lhs.
+  template<int Index,class vobj>
+    inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
+  {
+    typedef decltype(transposeIndex<Index>(lhs._odata[0])) transposed_type;
+
+    Lattice<transposed_type> result(lhs._grid);
+    parallel_for(int site=0;site<lhs._grid->oSites();site++){
+      result._odata[site] = transposeIndex<Index>(lhs._odata[site]);
+    }
+    return result;
+  }
+}
+#endif
@@ -0,0 +1,84 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_unary.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_UNARY_H
+#define GRID_LATTICE_UNARY_H
+
+namespace Grid {
+
+  // Site-wise power: result[s] = pow(rhs[s], y), on the grid and checkerboard of rhs.
+  template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
+    Lattice<obj> result(rhs._grid);
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+    parallel_for(int site=0;site<rhs._grid->oSites();site++){
+      result._odata[site] = pow(rhs._odata[site],y);
+    }
+    return result;
+  }
+  // Site-wise modulus: result[s] = mod(rhs[s], y), on the grid and checkerboard of rhs.
+  template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){
+    Lattice<obj> result(rhs._grid);
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+    parallel_for(int site=0;site<rhs._grid->oSites();site++){
+      result._odata[site] = mod(rhs._odata[site],y);
+    }
+    return result;
+  }
+
+  // Site-wise integer division: result[s] = div(rhs[s], y), on the grid and checkerboard of rhs.
+  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
+    Lattice<obj> result(rhs._grid);
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+    parallel_for(int site=0;site<rhs._grid->oSites();site++){
+      result._odata[site] = div(rhs._odata[site],y);
+    }
+    return result;
+  }
+
+  // Site-wise matrix exponential: result[s] = Exponentiate(rhs[s], alpha, Nexp),
+  // on the grid and checkerboard of rhs. Nexp defaults to DEFAULT_MAT_EXP.
+  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
+    Lattice<obj> result(rhs._grid);
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+    parallel_for(int site=0;site<rhs._grid->oSites();site++){
+      result._odata[site] = Exponentiate(rhs._odata[site],alpha, Nexp);
+    }
+    return result;
+  }
+
+
+
+}
+#endif
@@ -1,3 +1,32 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_where.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_LATTICE_WHERE_H
 #define GRID_LATTICE_WHERE_H
 namespace Grid {
@@ -27,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  std::vector<scalar_object> truevals (Nsimd);
  std::vector<scalar_object> falsevals(Nsimd);

-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){

    extract(iftrue._odata[ss]   ,truevals);
    extract(iffalse._odata[ss]  ,falsevals);
@@ -0,0 +1,116 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/Log.cc
+
+Copyright (C) 2015
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/util/CompilerCompatible.h>
+
+#include <cxxabi.h>
+#include <memory>
+
+namespace Grid {
+
+  // Turns a mangled C++ symbol name (such as typeid(T).name()) into its
+  // readable form; the input is returned unchanged if demangling fails.
+  std::string demangle(const char* name) {
+
+    int status = -4; // arbitrary start value, silences the compiler warning
+
+    // __cxa_demangle hands back a malloc'ed buffer: give it to a unique_ptr
+    // with std::free as deleter so it is released on every return path
+    char *raw = abi::__cxa_demangle(name, NULL, NULL, &status);
+    std::unique_ptr<char, void(*)(void*)> owned(raw, std::free);
+
+    if (status != 0) {
+      return name;
+    }
+    return owned.get();
+  }
+  
+// Definitions of the static data members of Logger.
+GridStopWatch Logger::GlobalStopWatch;
+int Logger::timestamp;
+std::ostream Logger::devnull(0); // stream constructed on a null stream buffer
+
+// Forwards the on/off flag to Logger::Timestamp.
+void GridLogTimestamp(int on){
+  Logger::Timestamp(on);
+}
+
+// Shared colour table, constructed with colouring switched off. It is defined
+// ahead of the loggers below because each of them takes it as a constructor argument.
+Colours GridLogColours(0);
+// Global log streams.
+// NOTE(review): the leading 1 is presumably the initial "active" flag and the
+// last string a colour name -- confirm against the GridLogger constructor.
+GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
+GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
+GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
+GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
+GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
+GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
+GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
+GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
+GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
+
+// Applies a list of stream names to the global loggers. Everything except
+// Message is switched off first; each recognised name then switches its stream
+// on ("NoMessage" switches Message off, "Colours" enables colouring).
+// Names that match none of the entries below are ignored.
+void GridLogConfigure(std::vector<std::string> &logstreams) {
+  // Baseline state
+  GridLogError.Active(0);
+  GridLogWarning.Active(0);
+  GridLogMessage.Active(1); // at least the messages should be always on
+  GridLogIterative.Active(0);
+  GridLogDebug.Active(0);
+  GridLogPerformance.Active(0);
+  GridLogIntegrator.Active(0);
+  GridLogColours.Active(0);
+
+  // The names are mutually exclusive, so at most one branch fires per entry
+  for (const std::string &stream : logstreams) {
+    if (stream == "Error") {
+      GridLogError.Active(1);
+    } else if (stream == "Warning") {
+      GridLogWarning.Active(1);
+    } else if (stream == "NoMessage") {
+      GridLogMessage.Active(0);
+    } else if (stream == "Iterative") {
+      GridLogIterative.Active(1);
+    } else if (stream == "Debug") {
+      GridLogDebug.Active(1);
+    } else if (stream == "Performance") {
+      GridLogPerformance.Active(1);
+    } else if (stream == "Integrator") {
+      GridLogIntegrator.Active(1);
+    } else if (stream == "Colours") {
+      GridLogColours.Active(1);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////
+// Verbose limiter on MPI tasks
+////////////////////////////////////////////////////////////
+// Silences std::cout on every task whose rank is non-zero by putting the stream
+// into the bad state. Without a comms macro defined the rank stays 0 and
+// nothing is silenced.
+void Grid_quiesce_nodes(void) {
+  int rank = 0;
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#endif
+#ifdef GRID_COMMS_SHMEM
+  rank = shmem_my_pe();
+#endif
+  if (rank != 0) {
+    std::cout.setstate(std::ios::badbit);
+  }
+}
+
+// Reverses Grid_quiesce_nodes(): clears the error state of std::cout so that
+// output resumes. The guard lists every comms backend under which
+// Grid_quiesce_nodes() can silence a task; guarding on GRID_COMMS_MPI alone
+// left MPI3, MPIT and SHMEM builds permanently silenced.
+void Grid_unquiesce_nodes(void) {
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) || defined(GRID_COMMS_SHMEM)
+  std::cout.clear();
+#endif
+}
+}
@@ -0,0 +1,218 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Log.h
+
+    Copyright (C) 2015
+
+    Author: Antonin Portelli <antonin.portelli@me.com>
+    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <cstdlib>
+#include <map>
+#ifndef GRID_LOG_H
+#define GRID_LOG_H
+
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+
+namespace Grid {
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Dress the output; use std::chrono for time stamping via the StopWatch class
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+class Colours{
+protected:
+  bool is_active;   // last value passed to Active()
+public:
+  // Colour name -> string to stream for it (every value is "" while inactive)
+  std::map<std::string, std::string> colour;
+
+  Colours(bool activate=false){
+    Active(activate);
+  };
+
+  // Rewrite all nine entries of the table: terminal escape sequences when
+  // activate is true, empty strings otherwise, so that colour[...] can be
+  // streamed without checking whether colouring is enabled.
+  void Active(bool activate){
+    // name / escape-sequence pairs
+    static const char *const palette[9][2] = {
+      {"BLACK" , "\033[30m"},
+      {"RED"   , "\033[31m"},
+      {"GREEN" , "\033[32m"},
+      {"YELLOW", "\033[33m"},
+      {"BLUE"  , "\033[34m"},
+      {"PURPLE", "\033[35m"},
+      {"CYAN"  , "\033[36m"},
+      {"WHITE" , "\033[37m"},
+      {"NORMAL", "\033[0;39m"},
+    };
+
+    is_active=activate;
+    // Existing keys are overwritten, missing ones are created
+    for (int c = 0; c < 9; c++) {
+      colour[palette[c][0]] = is_active ? palette[c][1] : "";
+    }
+  };
+};
+
+
+class Logger {
+protected:
+  Colours &Painter;                // colour table consulted for every prefix
+  int active;                      // non-zero: prefix is written; zero: operator<< yields devnull
+  int timing_mode;                 // value last given to TimingMode()
+  int topWidth{-1}, chanWidth{-1}; // setw() widths for the two labels; <= 0 means no padding
+  static int timestamp;            // shared by all loggers: print the elapsed time or not
+  std::string name, topName;       // channel label and top-level label
+  std::string COLOUR;              // key into Painter.colour for this channel
+
+public:
+  static GridStopWatch GlobalStopWatch;
+  GridStopWatch         LocalStopWatch;
+  GridStopWatch *StopWatch;        // watch currently used for time stamps
+  static std::ostream devnull;
+
+  std::string background() {return Painter.colour["NORMAL"];}
+  std::string evidence() {return Painter.colour["YELLOW"];}
+  std::string colour() {return Painter.colour[COLOUR];}
+
+  // topNm and nm are the two labels printed in front of each message; col is
+  // the key looked up in col_class. Time stamps start on the global watch.
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
+    : Painter(col_class), active(on), timing_mode(0),
+      name(nm), topName(topNm), COLOUR(col),
+      StopWatch(&GlobalStopWatch)
+  {}
+
+  void Active(int on) {active = on;}
+  int  isActive(void) {return active;}
+  static void Timestamp(int on) {timestamp = on;}
+
+  // Reset, then start, whichever stopwatch is currently selected.
+  void Reset(void) { 
+    StopWatch->Reset(); 
+    StopWatch->Start(); 
+  }
+  // Record the mode; a non-zero mode also moves this logger onto its private
+  // stopwatch and resets it (a zero mode leaves the selection untouched).
+  void TimingMode(int on) { 
+    timing_mode = on; 
+    if (!on) return;
+    StopWatch = &LocalStopWatch;
+    Reset(); 
+  }
+  void setTopWidth(const int w) {topWidth = w;}
+  void setChanWidth(const int w) {chanWidth = w;}
+
+  // Write the "<top> : <channel> : [<elapsed> : ]" prefix and return the stream
+  // to keep writing to; an inactive logger returns devnull instead.
+  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
+    if ( !log.active ) {
+      return devnull;
+    }
+
+    // Top-level label, left-justified and padded when a width was set
+    stream << log.background()<<  std::left;
+    if (log.topWidth > 0) stream << std::setw(log.topWidth);
+    stream << log.topName << log.background()<< " : ";
+
+    // Channel label in the channel colour
+    stream << log.colour() <<  std::left;
+    if (log.chanWidth > 0) stream << std::setw(log.chanWidth);
+    stream << log.name << log.background() << " : ";
+
+    if ( log.timestamp ) {
+      // Stop the watch to read it, then set it running again
+      log.StopWatch->Stop();
+      GridTime elapsed = log.StopWatch->Elapsed();
+      if ( log.timing_mode==1 ) log.StopWatch->Reset(); // mode 1: Reset() after every stamp
+      log.StopWatch->Start();
+      stream << log.evidence() << elapsed << log.background() << " : " ;
+    }
+
+    // Leave the channel colour selected for the message body
+    return stream << log.colour();
+  }
+
+};
+
+class GridLogger: public Logger { // Logger whose top-level label is always "Grid"
+public:
+  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL")
+    : Logger("Grid", on, nm, col_class, col_key) {}
+};
+
+void GridLogConfigure(std::vector<std::string> &logstreams); // switch streams on/off by name
+// Logger instances declared here for all users of this header (definitions live elsewhere)
+extern GridLogger GridLogIRL;
+extern GridLogger GridLogSolver;
+extern GridLogger GridLogError;
+extern GridLogger GridLogWarning;
+extern GridLogger GridLogMessage;
+extern GridLogger GridLogDebug  ;
+extern GridLogger GridLogPerformance;
+extern GridLogger GridLogIterative  ;
+extern GridLogger GridLogIntegrator  ;
+extern Colours    GridLogColours; // colour table handed to the loggers
+// Applied by BACKTRACEFP to each backtrace_symbols() string before it is printed
+ std::string demangle(const char* name) ;
+// Capacity of, and storage for, the frame addresses collected by backtrace()
+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
+#define BACKTRACEFILE() { /* write this rank's backtrace to the file "backtrace.<world rank>" */ \
+char string[32]; /* "backtrace." plus the widest int needs 22 bytes; the old 20 could overflow */ \
+std::snprintf(string,sizeof(string),"backtrace.%d",CartesianCommunicator::RankWorld()); \
+std::FILE * fp = std::fopen(string,"w");				\
+if (fp) { /* fopen can fail; skip the dump rather than write through a null FILE* */ \
+BACKTRACEFP(fp) std::fclose(fp); }	    \
+}
+
+
+#ifdef HAVE_EXECINFO_H
+#define BACKTRACEFP(fp) { /* print the demangled call stack, one frame per line, to FILE* fp */ \
+int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
+char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); /* malloc'ed array, NULL on failure */ \
+for (int i = 0; strings && i < symbols; i++){\
+  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
+} std::free(strings); /* the caller owns the array; it was leaked before */ \
+}
+#else 
+#define BACKTRACEFP(fp) { /* no execinfo: print raw return addresses; cast so the argument matches %lx */ \
+std::fprintf (fp,"BT %d %lx\n",0, (unsigned long)__builtin_return_address(0)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",1, (unsigned long)__builtin_return_address(1)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",2, (unsigned long)__builtin_return_address(2)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",3, (unsigned long)__builtin_return_address(3)); std::fflush(fp); \
+}
+#endif
+
+#define BACKTRACE() BACKTRACEFP(stdout) 
+
+
+}
+#endif
@@ -0,0 +1,729 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/parallelIO/BinaryIO.h
+
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu<guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_BINARY_IO_H
+#define GRID_BINARY_IO_H
+
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
+#define USE_MPI_IO
+#else
+#undef  USE_MPI_IO
+#endif
+
+#ifdef HAVE_ENDIAN_H
+#include <endian.h>
+#endif
+
+#include <arpa/inet.h>
+#include <algorithm>
+
+namespace Grid { 
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// Byte reversal garbage
+/////////////////////////////////////////////////////////////////////////////////
+inline uint32_t byte_reverse32(uint32_t f) { // reverse the byte order: 0xAABBCCDD -> 0xDDCCBBAA
+  const uint32_t low_half  = (f >> 24) | ((f >> 8) & 0xFF00u);
+  return low_half | ((f << 8) & 0xFF0000u) | (f << 24);
+}
+inline uint64_t byte_reverse64(uint64_t f) { 
+  // Move bytes one at a time from the low end of f to the low end of the result.
+  uint64_t reversed = 0;
+  for (int byte = 0; byte < 8; byte++, f >>= 8) {
+    reversed = (reversed << 8) | (f & 0xFF);
+  }
+  return reversed;
+}
+
+#if BYTE_ORDER == BIG_ENDIAN // NOTE(review): if neither macro is defined both evaluate to 0 and this branch is taken - confirm they are always defined here
+inline uint64_t Grid_ntohll(uint64_t A) { return A; } // this branch: the value is returned unchanged
+#else
+inline uint64_t Grid_ntohll(uint64_t A) { // this branch: all eight bytes are reversed
+  return byte_reverse64(A);
+}
+#endif
+
+// A little helper
+inline void removeWhitespace(std::string &key)
+{
+  key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Static class holding the parallel IO code
+// Could just use a namespace
+///////////////////////////////////////////////////////////////////////////////////////////////////
+class BinaryIO {
+ public:
+
+  /////////////////////////////////////////////////////////////////////////////
+  // more byte manipulation helpers
+  /////////////////////////////////////////////////////////////////////////////
+
+  template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
+  {
+    typedef typename vobj::scalar_object sobj;
+    GridBase *grid = lat._grid;
+
+    // One scalar object per local site, filled from the lattice
+    std::vector<sobj> sitedata(grid->lSites()); 
+    unvectorizeToLexOrdArray(sitedata,lat);    
+
+    // NerscChecksum adds to nersc_csum; it is not zeroed here
+    NerscChecksum(grid,sitedata,nersc_csum);
+  }
+
+  template <class fobj>
+  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
+  {
+    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t); // whole 32-bit words per object
+    // Adds the wrapping uint32 sum of every 32-bit word in fbuf to nersc_csum (which is not reset here).
+    uint64_t lsites = grid->lSites();
+    if (fbuf.size() == 1) // one-element buffer: sum only that element
+    {
+      lsites = 1;
+    }
+
+PARALLEL_REGION
+    {
+      uint32_t nersc_csum_thr = 0; // partial sum local to this region block
+
+PARALLEL_FOR_LOOP_INTERN
+      for (uint64_t local_site = 0; local_site < lsites; local_site++)
+      {
+        uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; // view the object as raw 32-bit words
+        for (uint64_t j = 0; j < size32; j++)
+        {
+          nersc_csum_thr = nersc_csum_thr + site_buf[j];
+        }
+      }
+
+PARALLEL_CRITICAL
+      {
+        nersc_csum += nersc_csum_thr; // fold the partial sum into the caller's total
+      }
+    }
+  }
+
+  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
+  {
+    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+    // XORs into scidac_csuma / scidac_csumb the crc32 of each local object, rotated left
+    // by (global site index % 29) and (% 31) respectively. Neither output is reset here.
+    int nd = grid->_ndimension;
+
+    uint64_t lsites              =grid->lSites();
+    if (fbuf.size()==1) {
+      lsites=1;
+    }
+    std::vector<int> local_vol   =grid->LocalDimensions();
+    std::vector<int> local_start =grid->LocalStarts();
+    std::vector<int> global_vol  =grid->FullDimensions();
+
+PARALLEL_REGION
+    { 
+      std::vector<int> coor(nd);
+      uint32_t scidac_csuma_thr=0;
+      uint32_t scidac_csumb_thr=0;
+      uint32_t site_crc=0;
+
+PARALLEL_FOR_LOOP_INTERN
+      for(uint64_t local_site=0;local_site<lsites;local_site++){
+
+	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
+
+	/* 
+	 * Scidac csum  is rather more heavyweight
+	 * FIXME -- 128^3 x 256 x 16 will overflow.
+	 */
+	
+	int global_site;
+
+	Lexicographic::CoorFromIndex(coor,local_site,local_vol);
+
+	for(int d=0;d<nd;d++) {
+	  coor[d] = coor[d]+local_start[d];
+	}
+
+	Lexicographic::IndexFromCoor(coor,global_site,global_vol);
+
+	uint32_t gsite29   = global_site%29;
+	uint32_t gsite31   = global_site%31;
+	// Rotate-left by r. The right shift is (32-r)&31 so that r==0 shifts by 0, not by 32 (undefined for a 32-bit value); both halves then equal site_crc.
+	site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
+	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
+	//	std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
+	scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>((32-gsite29)&31);
+	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>((32-gsite31)&31);
+      }
+
+PARALLEL_CRITICAL
+      {
+	scidac_csuma^= scidac_csuma_thr;
+	scidac_csumb^= scidac_csumb_thr;
+      }
+    }
+  }
+
+  // Host -> file wrappers forward to the file -> host routines. The byte count is uint64_t like theirs: the former uint32_t silently truncated buffers of 4 GiB or more.
+  static inline void htobe32_v(void *file_object,uint64_t bytes){ be32toh_v(file_object,bytes);} 
+  static inline void htobe64_v(void *file_object,uint64_t bytes){ be64toh_v(file_object,bytes);} 
+  static inline void htole32_v(void *file_object,uint64_t bytes){ le32toh_v(file_object,bytes);} 
+  static inline void htole64_v(void *file_object,uint64_t bytes){ le64toh_v(file_object,bytes);} 
+
+  static inline void be32toh_v(void *file_object,uint64_t bytes)
+  {
+    uint32_t * words = (uint32_t *)file_object;     // convert in place, one 32-bit word at a time
+    const uint64_t nwords = bytes/sizeof(uint32_t); // a trailing partial word is left untouched
+    parallel_for(uint64_t w=0;w<nwords;w++){  
+      words[w] = ntohl(words[w]);
+    }
+  }
+  // Little-endian file words -> host order, in place: reverse the bytes, then ntohl.
+  static inline void le32toh_v(void *file_object,uint64_t bytes)
+  {
+    uint32_t *fp = (uint32_t *)file_object;
+    uint64_t count = bytes/sizeof(uint32_t);
+    parallel_for(uint64_t i=0;i<count;i++){  
+      // The scratch word is declared inside the body: declared outside, one variable was
+      // shared by all iterations, a data race if parallel_for runs them on several threads.
+      uint32_t f = fp[i];
+      // got network order and the network to host
+      f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
+      fp[i] = ntohl(f);
+    }
+  }
+
+  // Big-endian 64-bit file words -> host order, in place (one Grid_ntohll per word)
+  static inline void be64toh_v(void *file_object,uint64_t bytes)
+  {
+    uint64_t * words = (uint64_t *)file_object;
+    const uint64_t nwords = bytes/sizeof(uint64_t); // a trailing partial word is left untouched
+    parallel_for(uint64_t w=0;w<nwords;w++){  
+      words[w] = Grid_ntohll(words[w]);
+    }
+  }
+  
+  // Little-endian 64-bit file words -> host order, in place: reverse the bytes, then Grid_ntohll.
+  static inline void le64toh_v(void *file_object,uint64_t bytes)
+  {
+    uint64_t *fp = (uint64_t *)file_object;
+    uint64_t count = bytes/sizeof(uint64_t);
+    parallel_for(uint64_t i=0;i<count;i++){  
+      // Both scratch words are declared inside the body: declared outside, they were shared
+      // by all iterations, a data race if parallel_for runs them on several threads.
+      uint64_t f = fp[i];
+      // got network order and the network to host
+      uint64_t g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
+      g = g << 32;
+      f = f >> 32;
+      g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
+      fp[i] = Grid_ntohll(g);
+    }
+  }
+  /////////////////////////////////////////////////////////////////////////////
+  // Real action:
+  // Read or Write distributed lexico array of ANY object to a specific location in file 
+  //////////////////////////////////////////////////////////////////////////////////////
+
+  static const int BINARYIO_MASTER_APPEND = 0x10; // IOobject: single-element buffer read from / written at the end of the file
+  static const int BINARYIO_UNORDERED     = 0x08; // not tested by any code visible in this class
+  static const int BINARYIO_LEXICOGRAPHIC = 0x04; // IOobject: with more than one rank, take the MPI I/O path
+  static const int BINARYIO_READ          = 0x02; // IOobject: read the file into the buffer
+  static const int BINARYIO_WRITE         = 0x01; // IOobject: write the buffer to the file
+
+  template<class word,class fobj>
+  static inline void IOobject(word w, // value unused; the type word selects MPI_FLOAT or MPI_DOUBLE
+			      GridBase *grid,
+			      std::vector<fobj> &iodata, // this rank's objects; byte-swapped in place on write
+			      std::string file,
+			      uint64_t& offset, // in: byte offset of the data; updated after a write
+			      const std::string &format, int control, // control: OR of the BINARYIO_* flags
+			      uint32_t &nersc_csum, // the three checksums are zeroed, then computed here
+			      uint32_t &scidac_csuma,
+			      uint32_t &scidac_csumb)
+  {
+    grid->Barrier();
+    GridStopWatch timer;   // times the file I/O itself
+    GridStopWatch bstimer; // times byte swapping and checksumming
+    
+    nersc_csum=0;
+    scidac_csuma=0;
+    scidac_csumb=0;
+
+    int ndim                 = grid->Dimensions();
+    int nrank                = grid->ProcessorCount();
+    int myrank               = grid->ThisRank();
+
+    std::vector<int>  psizes = grid->ProcessorGrid(); 
+    std::vector<int>  pcoor  = grid->ThisProcessorCoor();
+    std::vector<int> gLattice= grid->GlobalDimensions();
+    std::vector<int> lLattice= grid->LocalDimensions();
+
+    std::vector<int> lStart(ndim);
+    std::vector<int> gStart(ndim);
+
+    // Buffer must hold one object per local site, or exactly one object for MASTER_APPEND
+    uint64_t lsites = grid->lSites();
+    if ( control & BINARYIO_MASTER_APPEND )  {
+      assert(iodata.size()==1);
+    } else {
+      assert(lsites==iodata.size());
+    }
+    for(int d=0;d<ndim;d++){
+      gStart[d] = lLattice[d]*pcoor[d]; // origin of this rank's block within the global lattice
+      lStart[d] = 0;
+    }
+
+#ifdef USE_MPI_IO
+    std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
+    std::vector<int> dargs   (ndim,MPI_DISTRIBUTE_DFLT_DARG);
+    MPI_Datatype mpiObject;
+    MPI_Datatype fileArray;
+    MPI_Datatype localArray;
+    MPI_Datatype mpiword;
+    MPI_Offset disp = offset;
+    MPI_File fh ;
+    MPI_Status status;
+    int numword;
+
+    if ( sizeof( word ) == sizeof(float ) ) {
+      numword = sizeof(fobj)/sizeof(float);
+      mpiword = MPI_FLOAT;
+    } else {
+      numword = sizeof(fobj)/sizeof(double);
+      mpiword = MPI_DOUBLE;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Sobj in MPI phrasing
+    // NOTE(review): mpiObject is never passed to MPI_Type_free in this function, and
+    // fileArray/localArray are freed only inside the two MPI I/O branches - confirm.
+    int ierr;
+    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    assert(ierr==0);
+    ierr = MPI_Type_commit(&mpiObject); // NOTE(review): result not checked, unlike the other commits
+
+    //////////////////////////////////////////////////////////////////////////////
+    // File global array data type
+    //////////////////////////////////////////////////////////////////////////////
+    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    assert(ierr==0);
+    ierr=MPI_Type_commit(&fileArray);    assert(ierr==0);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // local lattice array
+    //////////////////////////////////////////////////////////////////////////////
+    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
+    ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Byte order: at most one flag is set; any other format string means no swapping
+    //////////////////////////////////////////////////////////////////////////////
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64"));
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Do the I/O
+    //////////////////////////////////////////////////////////////////////////////
+    if ( control & BINARYIO_READ ) { 
+
+      timer.Start();
+
+      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
+#ifdef USE_MPI_IO
+	std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
+	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
+	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
+	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+	MPI_File_close(&fh);
+	MPI_Type_free(&fileArray);
+	MPI_Type_free(&localArray);
+#else 
+	assert(0); // lexicographic multi-rank I/O is only implemented through MPI I/O
+#endif
+      } else {
+	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
+                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+        std::ifstream fin;
+	fin.open(file, std::ios::binary | std::ios::in);
+        if (control & BINARYIO_MASTER_APPEND)
+        {
+          fin.seekg(-sizeof(fobj), fin.end); // intended: the last sizeof(fobj) bytes of the file (relies on the unsigned negation converting to a negative offset)
+        }
+        else
+        {
+          fin.seekg(offset + myrank * lsites * sizeof(fobj)); // rank-major slab after offset
+        }
+        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
+        assert(fin.fail() == 0); // also trips when the open above failed
+        fin.close();
+      }
+      timer.Stop();
+
+      grid->Barrier();
+
+      bstimer.Start();
+      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb); // on the bytes exactly as read from the file
+      if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee32)    le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64)    le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      NerscChecksum(grid,iodata,nersc_csum); // on the data after conversion to host order
+      bstimer.Stop();
+    }
+    
+    if ( control & BINARYIO_WRITE ) { 
+
+      bstimer.Start();
+      NerscChecksum(grid,iodata,nersc_csum); // on host-order data, before any swapping
+      if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee32)    htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      if (ieee64)    htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
+      ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb); // on the bytes exactly as they go to the file
+      bstimer.Stop();
+
+      grid->Barrier();
+
+      timer.Start();
+      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
+#ifdef USE_MPI_IO
+        std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
+        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
+	//        std::cout << GridLogMessage << "Checking for errors" << std::endl;
+        if (ierr != MPI_SUCCESS)
+        {
+          char error_string[BUFSIZ];
+          int length_of_error_string, error_class;
+
+          MPI_Error_class(ierr, &error_class);
+          MPI_Error_string(error_class, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Error_string(ierr, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
+        }
+
+        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
+        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
+        assert(ierr == 0);
+
+        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
+        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
+        assert(ierr == 0);
+
+        MPI_Offset os;
+        MPI_File_get_position(fh, &os);
+        MPI_File_get_byte_offset(fh, os, &disp);
+        offset = disp; // report the byte position reached in the file to the caller
+
+
+        MPI_File_close(&fh);
+        MPI_Type_free(&fileArray);
+        MPI_Type_free(&localArray);
+#else 
+	assert(0); // lexicographic multi-rank I/O is only implemented through MPI I/O
+#endif
+      } else { 
+
+        std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
+                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
+        
+	std::ofstream fout; 
+	fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); // stream failures below throw
+	try {
+	  if (offset) { // Must already exist and contain data
+	    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+	  } else {     // Allow create
+	    fout.open(file,std::ios::binary|std::ios::out);
+	  }
+	} catch (const std::fstream::failure& exc) {
+	  std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
+	  std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
+	  //	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
+#ifdef USE_MPI_IO
+	  MPI_Abort(MPI_COMM_WORLD,1);
+#else
+	  exit(1);
+#endif
+	}
+	// NOTE(review): no rank test guards the append below, so every rank that reaches this branch appends - confirm intended
+	if ( control & BINARYIO_MASTER_APPEND )  {
+	  try {
+	    fout.seekp(0,fout.end);
+	  } catch (const std::fstream::failure& exc) {
+	    std::cout << "Exception in seeking file end " << file << std::endl; // reported only; the write is still attempted
+	  }
+	} else {
+	  try { 
+	    fout.seekp(offset+myrank*lsites*sizeof(fobj)); // rank-major slab after offset
+	  } catch (const std::fstream::failure& exc) {
+	    std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl; // reported only
+	  }
+	}
+
+	try {
+	  fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
+	}
+	catch (const std::fstream::failure& exc) {
+	  std::cout << "Exception in writing file " << file << std::endl;
+	  std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
+#ifdef USE_MPI_IO
+	  MPI_Abort(MPI_COMM_WORLD,1);
+#else
+	  exit(1);
+#endif
+	}
+  offset  = fout.tellp(); // report the position after the write to the caller
+	fout.close();
+      }
+      timer.Stop();
+    }
+    
+    std::cout<<GridLogMessage<<"IOobject: ";
+    if ( control & BINARYIO_READ) std::cout << " read  ";
+    else                          std::cout << " write ";
+    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank; // this rank's byte count times the number of ranks
+    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl; // bytes per microsecond
+
+    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Safety check
+    //////////////////////////////////////////////////////////////////////////////
+    // if the data size is 1 we do not want to sum over the MPI ranks
+    if (iodata.size() != 1){
+      grid->Barrier();
+      grid->GlobalSum(nersc_csum);
+      grid->GlobalXOR(scidac_csuma);
+      grid->GlobalXOR(scidac_csumb);
+      grid->Barrier();
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Read a Lattice of object
+  //////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj,class fobj,class munger>
+  static inline void readLatticeObject(Lattice<vobj> &Umu,
+				       std::string file,
+				       munger munge,
+				       uint64_t offset,
+				       const std::string &format,
+				       uint32_t &nersc_csum,
+				       uint32_t &scidac_csuma,
+				       uint32_t &scidac_csumb)
+  {
+    // File objects (fobj) are read through IOobject, converted one by one with
+    // munge into scalar objects, and those are then loaded into the lattice Umu.
+    typedef typename vobj::scalar_object sobj;
+    typedef typename vobj::Realified::scalar_type word;
+    word w=0; // passed to IOobject, which only looks at its type
+
+    GridBase *grid = Umu._grid;
+    const uint64_t nsites = grid->lSites();
+    const int control = BINARYIO_READ|BINARYIO_LEXICOGRAPHIC;
+
+    std::vector<sobj> sitedata(nsites); 
+    std::vector<fobj> filedata(nsites); // checksums and byte order are handled on this buffer
+    IOobject(w,grid,filedata,file,offset,format,control,nersc_csum,scidac_csuma,scidac_csumb);
+
+    GridStopWatch mungeTimer; 
+    mungeTimer.Start();
+    parallel_for(uint64_t s=0;s<nsites;s++) munge(filedata[s], sitedata[s]);
+    vectorizeFromLexOrdArray(sitedata,Umu);    
+    grid->Barrier();
+    mungeTimer.Stop();
+
+    std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<mungeTimer.Elapsed()  <<std::endl;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Write a Lattice of object
+  //////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj,class fobj,class munger>
+    static inline void writeLatticeObject(Lattice<vobj> &Umu,
+					  std::string file,
+					  munger munge,
+					  uint64_t offset,
+					  const std::string &format,
+					  uint32_t &nersc_csum,
+					  uint32_t &scidac_csuma,
+					  uint32_t &scidac_csumb)
+  {
+    // The lattice Umu is unpacked into scalar objects, each is converted with
+    // munge into a file object (fobj), and the result is written through IOobject.
+    typedef typename vobj::scalar_object sobj;
+    typedef typename vobj::Realified::scalar_type word;
+    word w=0; // passed to IOobject, which only looks at its type
+
+    GridBase *grid = Umu._grid;
+    const uint64_t nsites = grid->lSites();
+    const int control = BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC;
+
+    std::vector<sobj> sitedata(nsites); 
+    std::vector<fobj> filedata(nsites); // checksums and byte order are handled on this buffer
+    // Munge [ .e.g 3rd row recon ]; only this stage is covered by the timer
+    GridStopWatch mungeTimer;
+    mungeTimer.Start();
+    unvectorizeToLexOrdArray(sitedata,Umu);    
+    parallel_for(uint64_t s=0;s<nsites;s++) munge(sitedata[s],filedata[s]);
+    grid->Barrier();
+    mungeTimer.Stop();
+
+    IOobject(w,grid,filedata,file,offset,format,control,nersc_csum,scidac_csuma,scidac_csumb);
+
+    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<mungeTimer.Elapsed()  <<std::endl;
+  }
+  
+  /////////////////////////////////////////////////////////////////////////////
+  // Read a RNG;  use IOobject and lexico map to an array of state 
+  //////////////////////////////////////////////////////////////////////////////////////
+  static inline void readRNG(GridSerialRNG &serial,
+			     GridParallelRNG &parallel,
+			     std::string file,
+			     uint64_t offset,
+			     uint32_t &nersc_csum,
+			     uint32_t &scidac_csuma,
+			     uint32_t &scidac_csumb)
+  {
+    // Two reads: one RNGstate per local site for the parallel generator, then
+    // the single RNGstate at the end of the file for the serial generator.
+    typedef typename GridSerialRNG::RngStateType RngStateType;
+    const int RngStateCount = GridSerialRNG::RngStateCount;
+    typedef std::array<RngStateType,RngStateCount> RNGstate;
+    typedef RngStateType word;
+    word w=0;
+
+    const std::string format = "IEEE32BIG";
+
+    GridBase *grid = parallel._grid;
+    uint64_t gsites = grid->gSites();
+    const uint64_t nsites = grid->lSites();
+
+    // Checksums of the serial-state read, merged into the outputs at the end
+    uint32_t serial_nersc   = 0;
+    uint32_t serial_scidaca = 0;
+    uint32_t serial_scidacb = 0;
+
+    GridStopWatch stateTimer;
+
+    std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
+
+    // Parallel generator: per-site states
+    std::vector<RNGstate> states(nsites);
+    IOobject(w,grid,states,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+	     nersc_csum,scidac_csuma,scidac_csumb);
+
+    stateTimer.Start();
+    parallel_for(uint64_t site=0;site<nsites;site++){
+      std::vector<RngStateType> site_state(states[site].begin(),states[site].end());
+      parallel.SetState(site_state,site);
+    }
+    stateTimer.Stop();
+
+    // Serial generator: one state, read with MASTER_APPEND
+    states.resize(1);
+    IOobject(w,grid,states,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
+	     serial_nersc,serial_scidaca,serial_scidacb);
+    std::vector<RngStateType> serial_state(states[0].begin(),states[0].end());
+    serial.SetState(serial_state,0);
+
+    nersc_csum   += serial_nersc;
+    scidac_csuma ^= serial_scidaca;
+    scidac_csumb ^= serial_scidacb;
+
+    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG state overhead " << stateTimer.Elapsed() << std::endl;
+  }
+  /////////////////////////////////////////////////////////////////////////////
+  // Write a RNG; lexico map to an array of state and use IOobject
+  //////////////////////////////////////////////////////////////////////////////////////
+  static inline void writeRNG(GridSerialRNG &serial,
+			      GridParallelRNG &parallel,
+			      std::string file,
+			      uint64_t offset,
+			      uint32_t &nersc_csum,
+			      uint32_t &scidac_csuma,
+			      uint32_t &scidac_csumb)
+  {
+    // Two writes: one RNGstate per local site from the parallel generator, then
+    // the serial generator's single RNGstate with MASTER_APPEND.
+    typedef typename GridSerialRNG::RngStateType RngStateType;
+    typedef RngStateType word;
+    word w=0;
+    const int RngStateCount = GridSerialRNG::RngStateCount;
+    typedef std::array<RngStateType,RngStateCount> RNGstate;
+
+    GridBase *grid = parallel._grid;
+    uint64_t gsites = grid->gSites();
+    const uint64_t nsites = grid->lSites();
+
+    // Filled in by the second IOobject call, which zeroes them before use
+    uint32_t serial_nersc;
+    uint32_t serial_scidaca;
+    uint32_t serial_scidacb;
+
+    GridStopWatch stateTimer;
+    const std::string format = "IEEE32BIG";
+
+    std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
+
+    stateTimer.Start();
+    std::vector<RNGstate> states(nsites);
+    parallel_for(uint64_t site=0;site<nsites;site++){
+      std::vector<RngStateType> site_state(RngStateCount);
+      parallel.GetState(site_state,site);
+      std::copy(site_state.begin(),site_state.end(),states[site].begin());
+    }
+    stateTimer.Stop();
+
+    IOobject(w,grid,states,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+	     nersc_csum,scidac_csuma,scidac_csumb);
+
+    states.resize(1);
+    std::vector<RngStateType> serial_state(RngStateCount);
+    serial.GetState(serial_state,0);
+    std::copy(serial_state.begin(),serial_state.end(),states[0].begin());
+    IOobject(w,grid,states,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
+	     serial_nersc,serial_scidaca,serial_scidacb);
+    nersc_csum   += serial_nersc;
+    scidac_csuma ^= serial_scidaca;
+    scidac_csumb ^= serial_scidacb;
+    std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum    << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG state overhead " << stateTimer.Elapsed() << std::endl;
+  }
+};
+}
+#endif
@@ -0,0 +1,876 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/parallelIO/IldgIO.h
+
+Copyright (C) 2015
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_ILDG_IO_H
+#define GRID_ILDG_IO_H
+
+#ifdef HAVE_LIME
+#include <algorithm>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+
+#include <pwd.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+
+//C-Lime is a must have for this functionality
+extern "C" {  
+#include "lime.h"
+}
+
+namespace Grid {
+namespace QCD {
+
+  /////////////////////////////////
+  // Encode word types as strings
+  /////////////////////////////////
+ // Short mnemonic string for a scalar word type.  Types without a dedicated
+ // specialisation below fall back to "unknown".
+ template<class word> inline std::string ScidacWordMnemonic(void)
+ {
+   return "unknown";
+ }
+ // Floating point words
+ template<> inline std::string ScidacWordMnemonic<double>(void)   { return "D"; }
+ template<> inline std::string ScidacWordMnemonic<float>(void)    { return "F"; }
+ // 32 bit integer words
+ template<> inline std::string ScidacWordMnemonic<int32_t>(void)  { return "I32_t"; }
+ template<> inline std::string ScidacWordMnemonic<uint32_t>(void) { return "U32_t"; }
+ // 64 bit integer words
+ template<> inline std::string ScidacWordMnemonic<int64_t>(void)  { return "I64_t"; }
+ template<> inline std::string ScidacWordMnemonic<uint64_t>(void) { return "U64_t"; }
+
+  /////////////////////////////////////////
+  // Encode a generic tensor as a string
+  /////////////////////////////////////////
+ // Build the datatype string for tensor type vobj and report its layout
+ // through the output arguments:
+ //   colors    <- indexRank<ColourIndex,vobj>()
+ //   spins     <- indexRank<SpinIndex,vobj>()
+ //   typesize  <- sizeof(vobj::scalar_type) scaled by the colour and spin
+ //                extents (extent squared when that index is a matrix)
+ //   datacount <- indexRank<LorentzIndex,vobj>()
+ // The returned string is "GRID_" + word mnemonic, then one suffix for each
+ // vector/matrix index (Lorentz, spin, colour in that order), or "_Complex"
+ // when colour, spin and Lorentz are all scalar.
+ template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
+
+   typedef typename getPrecision<vobj>::real_scalar_type stype;
+
+   // Colour index traits
+   const int nColour         = indexRank<ColourIndex,vobj>();
+   const int colourIsScalar  =  isScalar<ColourIndex,vobj>();
+   const int colourIsVector  =  isVector<ColourIndex,vobj>();
+   const int colourIsMatrix  =  isMatrix<ColourIndex,vobj>();
+
+   // Spin index traits
+   const int nSpin           = indexRank<SpinIndex,vobj>();
+   const int spinIsScalar    =  isScalar<SpinIndex,vobj>();
+   const int spinIsVector    =  isVector<SpinIndex,vobj>();
+   const int spinIsMatrix    =  isMatrix<SpinIndex,vobj>();
+
+   // Lorentz index traits
+   const int nLorentz        = indexRank<LorentzIndex,vobj>();
+   const int lorentzIsScalar =  isScalar<LorentzIndex,vobj>();
+   const int lorentzIsVector =  isVector<LorentzIndex,vobj>();
+   const int lorentzIsMatrix =  isMatrix<LorentzIndex,vobj>();
+
+   std::stringstream name;
+   name << "GRID_" << ScidacWordMnemonic<stype>();
+
+   if ( lorentzIsVector ) name << "_LorentzVector" << nLorentz;
+   if ( lorentzIsMatrix ) name << "_LorentzMatrix" << nLorentz;
+
+   if ( spinIsVector )    name << "_SpinVector"    << nSpin;
+   if ( spinIsMatrix )    name << "_SpinMatrix"    << nSpin;
+
+   if ( colourIsVector )  name << "_ColourVector"  << nColour;
+   if ( colourIsMatrix )  name << "_ColourMatrix"  << nColour;
+
+   if ( colourIsScalar && lorentzIsScalar && spinIsScalar ) name << "_Complex";
+
+   // Bytes per colour-spin element; the Lorentz extent is reported
+   // separately through datacount.
+   typesize  = sizeof(typename vobj::scalar_type);
+   typesize *= colourIsMatrix ? nColour*nColour : nColour;
+   typesize *= spinIsMatrix   ? nSpin*nSpin     : nSpin;
+
+   colors    = nColour;
+   spins     = nSpin;
+   datacount = nLorentz;
+
+   return name.str();
+ }
+ 
+ // Convenience overload: the lattice argument only serves to deduce vobj (its
+ // contents are not inspected); all work is done by the type-only version.
+ template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) { 
+   std::string datatype = ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
+   return datatype;
+ }
+
+
+ ////////////////////////////////////////////////////////////
+ // Helper to fill out metadata
+ ////////////////////////////////////////////////////////////
+ // Fill every metadata structure that accompanies a lattice field:
+ //   header        <- PrepareMetaData(field,header)
+ //   _scidacFile   <- scidacFile constructed from the field's grid
+ //   _scidacRecord <- datatype string and layout counts, creation date taken
+ //                    from header, precision mnemonic, record type GRID_IO_FIELD
+ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+                                          FieldMetaData &header,
+                                          scidacRecord & _scidacRecord,
+                                          scidacFile   & _scidacFile) 
+ {
+   typedef typename getPrecision<vobj>::real_scalar_type stype;
+
+   // Grid's own metadata comes first: the record date below is copied from it.
+   PrepareMetaData(field,header);
+
+   // Scidac private file structure
+   _scidacFile = scidacFile(field._grid);
+
+   // Scidac private record structure, assembled in a scratch object and
+   // handed to the caller in a single assignment.
+   scidacRecord record;
+   record.recordtype = GRID_IO_FIELD;
+   record.precision  = ScidacWordMnemonic<stype>();
+   record.date       = header.creation_date;
+   record.datatype   = ScidacRecordTypeString(field,record.colors,record.spins,record.typesize,record.datacount);
+
+   _scidacRecord = record;
+ }
+ 
+ ///////////////////////////////////////////////////////
+ // Scidac checksum
+ ///////////////////////////////////////////////////////
+ // Compare the hexadecimal checksum strings held in a scidacChecksum record
+ // with two freshly computed 32 bit checksums.
+ //   scidacChecksum_ : record whose suma / sumb members hold hex strings
+ //   scidac_csuma/b  : checksums computed from the data
+ //   returns         : 1 when both agree, 0 otherwise
+ // The parsed values are compared at full width.  Narrowing them to 32 bits
+ // before comparing would accept an over-long string such as "1deadbeef" as a
+ // match for 0xdeadbeef.
+ // std::stoull throws when a string cannot be parsed; that is not caught here.
+ static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
+ {
+   const unsigned long long file_checksuma = std::stoull(scidacChecksum_.suma,0,16);
+   const unsigned long long file_checksumb = std::stoull(scidacChecksum_.sumb,0,16);
+   if ( file_checksuma != scidac_csuma ) return 0;
+   if ( file_checksumb != scidac_csumb ) return 0;
+   return 1;
+ }
+
+////////////////////////////////////////////////////////////////////////////////////
+// Lime, ILDG and Scidac I/O classes
+////////////////////////////////////////////////////////////////////////////////////
+// Reads records from a LIME container file.  A record is selected by comparing
+// the leading characters of its LIME type with a requested name; payloads are
+// either XML text or a binary lattice field that must be followed by a scidac
+// checksum record.
+class GridLimeReader : public BinaryIO {
+ public:
+   ///////////////////////////////////////////////////
+   // FIXME: format for RNG? Now just binary out instead
+   ///////////////////////////////////////////////////
+
+   // Handles start out null so that close() is harmless before open() and
+   // when called twice.
+   FILE       *File  = nullptr; // stdio stream, valid between open() and close()
+   LimeReader *LimeR = nullptr; // LIME reader created on top of File
+   std::string filename;        // passed to BinaryIO::readLatticeObject with a byte offset
+
+   /////////////////////////////////////////////
+   // Open the file; aborts if it cannot be opened
+   /////////////////////////////////////////////
+   void open(const std::string &_filename) 
+   {
+     filename= _filename;
+     File = fopen(filename.c_str(), "r");
+     if (File == nullptr)
+     {
+       std::cerr << "cannot open file '" << filename << "'" << std::endl;
+       abort();
+     }
+     LimeR = limeCreateReader(File);
+     assert(LimeR != NULL); // mirror the check applied to limeCreateWriter
+   }
+   /////////////////////////////////////////////
+   // Close the file
+   /////////////////////////////////////////////
+   void close(void){
+     if (File != nullptr) {
+       fclose(File);
+       File = nullptr;
+     }
+     // The LIME reader object is deliberately left alone, as before:
+     //     limeDestroyReader(LimeR);
+   }
+
+  ////////////////////////////////////////////
+  // Read a generic lattice field and verify checksum
+  //
+  // Advances through the file until a record whose type starts with
+  // record_name is found, reads the payload into field, then reads the
+  // scidac checksum record and asserts that both checksums agree.
+  // If the file runs out of records first, the routine returns without
+  // touching field and without reporting an error.
+  ////////////////////////////////////////////
+  template<class vobj>
+  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  {
+    typedef typename vobj::scalar_object sobj;
+    scidacChecksum scidacChecksum_;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+
+    std::string format = getFormatString<vobj>();
+
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+
+      uint64_t file_bytes =limeReaderBytes(LimeR);
+
+      // Not the record we are looking for: move on to the next one
+      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) != 0 ) continue;
+
+      uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
+      assert(PayloadSize == file_bytes);// Must match or user error
+
+      // BinaryIO reads the payload by file name, starting at the current
+      // position of our own stream.
+      uint64_t offset= ftello(File);
+      BinarySimpleMunger<sobj,sobj> munge;
+      BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+      std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
+      std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
+
+      /////////////////////////////////////////////
+      // Insist checksum is next record
+      /////////////////////////////////////////////
+      readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
+
+      /////////////////////////////////////////////
+      // Verify checksums
+      /////////////////////////////////////////////
+      int verified = scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
+      assert(verified==1);
+      return;
+    }
+  }
+  ////////////////////////////////////////////
+  // Read a generic serialisable object
+  //
+  // Copies the payload of the next record whose type starts with record_name
+  // into xmlstring (up to the first NUL character).  Asserts if no such
+  // record is left in the file.
+  ////////////////////////////////////////////
+  void readLimeObject(std::string &xmlstring,std::string record_name)
+  {
+    // should this be a do while; can we miss a first record??
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+
+      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
+
+      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) != 0 ) continue;
+
+      // One spare byte keeps the buffer NUL terminated
+      std::vector<char> xmlc(nbytes+1,'\0');
+      limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
+
+      xmlstring = std::string(&xmlc[0]);
+      return;
+    }  
+    assert(0);
+  }
+
+  // Read the record named record_name as XML and deserialise the element
+  // object_name from it into object.
+  template<class serialisable_object>
+  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  {
+    std::string xmlstring;
+
+    readLimeObject(xmlstring, record_name);
+    XmlReader RD(xmlstring, true, "");
+    read(RD,object_name,object);
+  }
+};
+
+// Writes records into a LIME container file.  Only the node constructed with
+// isboss == true opens the file and touches the LIME writer; the lattice
+// payload itself is written by BinaryIO.
+class GridLimeWriter : public BinaryIO 
+{
+ public:
+
+   ///////////////////////////////////////////////////
+   // FIXME: format for RNG? Now just binary out instead
+   // FIXME: collective calls or not ?
+   //      : must know if I am the I/O boss
+   ///////////////////////////////////////////////////
+   FILE       *File  = nullptr; // stdio stream; only set on the boss node
+   LimeWriter *LimeW = nullptr; // LIME writer; only set on the boss node
+   std::string filename;
+   bool        boss_node;
+   GridLimeWriter( bool isboss = true) {
+     boss_node = isboss;
+   }
+   /////////////////////////////////////////////
+   // Open the file (boss node only); aborts if it cannot be opened
+   /////////////////////////////////////////////
+   void open(const std::string &_filename) { 
+     filename= _filename;
+     if ( boss_node ) {
+       File = fopen(filename.c_str(), "w");
+       if (File == nullptr)
+       {
+         // Same failure handling as GridLimeReader::open
+         std::cerr << "cannot open file '" << filename << "'" << std::endl;
+         abort();
+       }
+       LimeW = limeCreateWriter(File); assert(LimeW != NULL );
+     }
+   }
+   /////////////////////////////////////////////
+   // Close the file
+   /////////////////////////////////////////////
+   void close(void) {
+     if ( boss_node && (File != nullptr) ) {
+       fclose(File);
+       File = nullptr;
+     }
+     //  limeDestroyWriter(LimeW);
+   }
+  ///////////////////////////////////////////////////////
+  // Lime utility functions
+  ///////////////////////////////////////////////////////
+  // Write a record header of type `message` announcing PayloadSize bytes.
+  // Does nothing on non-boss nodes.  Always returns LIME_SUCCESS.
+  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
+  {
+    if ( boss_node ) {
+      LimeRecordHeader *h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
+      assert(h != NULL);
+      // The write must not live inside assert(): with NDEBUG defined the
+      // whole expression would be removed and no header would be written.
+      int err = limeWriteRecordHeader(h, LimeW);
+      assert(err >= 0);
+      limeDestroyHeader(h);
+    }
+    return LIME_SUCCESS;
+  }
+  ////////////////////////////////////////////
+  // Write a generic serialisable object
+  //
+  // Emits the document held by writer as one complete record of type
+  // record_name.  Does nothing on non-boss nodes.
+  ////////////////////////////////////////////
+  void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
+  {
+    if ( boss_node ) {
+      std::string xmlstring = writer.docString();
+
+      uint64_t nbytes = xmlstring.size();
+      int err;
+      LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
+      assert(h!= NULL);
+      
+      err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
+      err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
+      err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
+      limeDestroyHeader(h);
+    }
+  }
+
+  // Serialise object as XML element object_name and write it as a record of
+  // type record_name.  A non-zero scientificPrec switches the XML writer to
+  // scientific format with that precision.
+  template<class serialisable_object>
+  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
+  {
+    XmlWriter WR("","");
+
+    if (scientificPrec)
+    {
+      WR.scientificFormat(true);
+      WR.setPrecision(scientificPrec);
+    }
+    write(WR,object_name,object);
+    writeLimeObject(MB, ME, WR, object_name, record_name);
+  }
+  ////////////////////////////////////////////////////
+  // Write a generic lattice field and csum
+  // This routine is Collectively called by all nodes
+  // in communicator used by the field._grid
+  ////////////////////////////////////////////////////
+  template<class vobj>
+  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  {
+    ////////////////////////////////////////////////////////////////////
+    // NB: FILE and iostream are jointly writing disjoint sequences in the
+    // the same file through different file handles (integer units).
+    // 
+    // These are both buffered, so why I think this code is right is as follows.
+    //
+    // i)  write record header to FILE *File, telegraphing the size; flush
+    // ii) ftello reads the offset from FILE *File . 
+    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
+    //      Closes iostream and flushes.
+    // iv) fseek on FILE * to end of this disjoint section.
+    //  v) Continue writing scidac record.
+    ////////////////////////////////////////////////////////////////////
+    
+    GridBase *grid = field._grid;
+    assert(boss_node == field._grid->IsBoss() );
+
+    ////////////////////////////////////////////
+    // Create record header
+    ////////////////////////////////////////////
+    typedef typename vobj::scalar_object sobj;
+    int err;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
+    if ( boss_node ) {
+      createLimeRecordHeader(record_name, 0, 0, PayloadSize);
+      fflush(File);
+    }
+
+    ////////////////////////////////////////////////
+    // Check all nodes agree on file position
+    ////////////////////////////////////////////////
+    uint64_t offset1 = 0; // defined on every node before the broadcast overwrites it
+    if ( boss_node ) {
+      offset1 = ftello(File);    
+    }
+    grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
+
+    ///////////////////////////////////////////
+    // The above is collective. Write by other means into the binary record
+    ///////////////////////////////////////////
+    std::string format = getFormatString<vobj>();
+    BinarySimpleMunger<sobj,sobj> munge;
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
+
+    ///////////////////////////////////////////
+    // Wind forward and check the payload written behind our back has
+    // exactly the size announced in the record header
+    ///////////////////////////////////////////
+    if ( boss_node ) {
+      int seek_err = fseek(File,0,SEEK_END);
+      assert(seek_err == 0);
+      uint64_t offset2 = ftello(File);
+      assert( (offset2-offset1) == PayloadSize);
+    }
+
+    /////////////////////////////////////////////////////////////
+    // Close the binary record
+    /////////////////////////////////////////////////////////////
+    if ( boss_node ) { 
+      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
+    }
+    ////////////////////////////////////////
+    // Write checksum element, propagaing forward from the BinaryIO
+    // Always pair a checksum with a binary object, and close message
+    ////////////////////////////////////////
+    scidacChecksum checksum;
+    std::stringstream streama; streama << std::hex << scidac_csuma;
+    std::stringstream streamb; streamb << std::hex << scidac_csumb;
+    checksum.suma= streama.str();
+    checksum.sumb= streamb.str();
+    if ( boss_node ) { 
+      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
+    }
+  }
+};
+
+// Writes the file level records and the per-field records of a SciDAC style
+// LIME file on top of GridLimeWriter.
+class ScidacWriter : public GridLimeWriter {
+ public:
+
+  ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss)  { }
+
+  // Write the private file record built from grid followed by the user
+  // supplied file record.  Only the boss node writes.
+  template<class SerialisableUserFile>
+  void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
+  {
+    scidacFile    _scidacFile(grid);
+    if ( !this->boss_node ) return;
+
+    writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+    writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+  }
+  ////////////////////////////////////////////////
+  // Write generic lattice field in scidac format
+  //
+  // Records emitted, in order: Grid field metadata, the user record, the
+  // private scidac record, then the binary payload with its checksum.
+  // recordScientificPrec is forwarded to the user record only.
+  ////////////////////////////////////////////////
+  template <class vobj, class userRecord>
+  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
+                              const unsigned int recordScientificPrec = 0) 
+  {
+    ////////////////////////////////////////
+    // fill the Grid header
+    ////////////////////////////////////////
+    FieldMetaData header;
+    scidacRecord  _scidacRecord;
+    scidacFile    _scidacFile;
+
+    ScidacMetaData(field,header,_scidacRecord,_scidacFile);
+
+    //////////////////////////////////////////////
+    // Fill the Lime file record by record
+    //////////////////////////////////////////////
+    const bool writer_node = this->boss_node;
+    if ( writer_node ) {
+      // Open message 
+      writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT));
+      writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
+      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    }
+    // Collective call; closes message with checksum
+    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
+  }
+};
+
+
+// Reads, or skips over, the records of a SciDAC style LIME file on top of
+// GridLimeReader.
+class ScidacReader : public GridLimeReader {
+ public:
+
+   // Read the private file record, then the user file record into _userFile.
+   template<class SerialisableUserFile>
+   void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
+   {
+     scidacFile    _scidacFile(grid);
+     readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+     readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+   }
+  ////////////////////////////////////////////////
+  // Read generic lattice field in scidac format
+  //
+  // Records consumed, in order: Grid field metadata, the user record (stored
+  // in _userRecord), the private scidac record, then the binary payload.
+  ////////////////////////////////////////////////
+  template <class vobj, class userRecord>
+  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
+  {
+    ////////////////////////////////////////
+    // Holders for the metadata records
+    ////////////////////////////////////////
+    FieldMetaData header;
+    scidacRecord  _scidacRecord;
+    scidacFile    _scidacFile;
+
+    //////////////////////////////////////////////
+    // Walk the Lime file record by record
+    //////////////////////////////////////////////
+    // Open message 
+    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT));
+    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
+    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
+  }
+  // Advance to the next binary data record and then on to the checksum record
+  // that follows it.  Returns quietly if the file runs out of records.
+  void skipPastBinaryRecord(void) {
+    const std::string binary_name(ILDG_BINARY_DATA);
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+      const bool is_binary = ( strncmp(limeReaderType(LimeR), binary_name.c_str(),strlen(binary_name.c_str()) ) == 0 );
+      if ( !is_binary ) continue;
+      skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
+      return;
+    }    
+  }
+  // Advance to the next record whose type starts with rec_name.  Returns
+  // quietly if the file runs out of records.
+  void skipPastObjectRecord(std::string rec_name) {
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+      const bool matches = ( strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) == 0 );
+      if ( matches ) return;
+    }
+  }
+  // Skip one complete field: the three metadata records, the binary payload
+  // and its checksum.
+  void skipScidacFieldRecord() {
+    skipPastObjectRecord(std::string(GRID_FORMAT));
+    skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
+    skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
+    skipPastBinaryRecord();
+  }
+};
+
+
+// Writes gauge configurations as ILDG flavoured LIME files on top of
+// ScidacWriter.
+class IldgWriter : public ScidacWriter {
+ public:
+  
+  IldgWriter(bool isboss) : ScidacWriter(isboss) {}
+
+  ///////////////////////////////////
+  // A little helper
+  //
+  // Writes LFN as the payload of one ILDG_DATA_LFN record.  The LIME writer
+  // only exists on the boss node, so every other node must not touch it.
+  ///////////////////////////////////
+  void writeLimeIldgLFN(std::string &LFN)
+  {
+    if ( !this->boss_node ) return;
+
+    uint64_t PayloadSize = LFN.size();
+    int err;
+    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
+    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
+    err=limeWriterCloseRecord(LimeW); assert(err>=0);
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Special ILDG operations ; gauge configs only.
+  // Don't require scidac records EXCEPT checksum
+  // Use Grid MetaData object if present.
+  //
+  //   Umu         : gauge field to write; must live on a 4d grid
+  //   sequence    : stored as header.sequence_number
+  //   LFN         : stored as header.ildg_lfn and in the LFN record
+  //   description : stored as both ensemble_id and ensemble_label
+  ////////////////////////////////////////////////////////////////
+  template <class vsimd>
+  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
+  {
+    ////////////////////////////////////////
+    // fill the Grid header
+    ////////////////////////////////////////
+    FieldMetaData header;
+    scidacRecord  _scidacRecord;
+    scidacFile    _scidacFile;
+
+    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
+
+    std::string format = header.floating_point;
+    header.ensemble_id    = description;
+    header.ensemble_label = description;
+    header.sequence_number = sequence;
+    header.ildg_lfn = LFN;
+
+    assert ( (format == std::string("IEEE32BIG"))  
+           ||(format == std::string("IEEE64BIG")) );
+
+    // Check the dimensionality before dimension[0..3] is indexed below
+    assert(header.nd==4);
+    assert(header.nd==header.dimension.size());
+
+    //////////////////////////////////////////////////////
+    // Fill ILDG header data struct
+    //////////////////////////////////////////////////////
+    ildgFormat ildgfmt ;
+    ildgfmt.field     = std::string("su3gauge");
+
+    if ( format == std::string("IEEE32BIG") ) { 
+      ildgfmt.precision = 32;
+    } else { 
+      ildgfmt.precision = 64;
+    }
+    ildgfmt.version = 1.0;
+    ildgfmt.lx = header.dimension[0];
+    ildgfmt.ly = header.dimension[1];
+    ildgfmt.lz = header.dimension[2];
+    ildgfmt.lt = header.dimension[3];
+
+    //////////////////////////////////////////////////////////////////////////////
+    // Fill the USQCD info field
+    //////////////////////////////////////////////////////////////////////////////
+    usqcdInfo info;
+    info.version=1.0;
+    info.plaq   = header.plaquette;
+    info.linktr = header.link_trace;
+
+    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
+    //////////////////////////////////////////////
+    // Fill the Lime file record by record
+    //////////////////////////////////////////////
+    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
+    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
+    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT)); // rec
+    writeLimeIldgLFN(header.ildg_lfn);                                                 // rec
+    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
+    //    limeDestroyWriter(LimeW);
+  }
+};
+
+// Reads gauge configurations from Grid, SciDAC or ILDG flavoured LIME files
+// on top of GridLimeReader.
+class IldgReader : public GridLimeReader {
+ public:
+
+  ////////////////////////////////////////////////////////////////
+  // Read either Grid/SciDAC/ILDG configuration
+  //
+  //   Umu            : receives the gauge field; its grid must be 4d
+  //   FieldMetaData_ : receives the metadata.  Taken from the Grid record
+  //                    when one is present, otherwise rebuilt from the ILDG
+  //                    format record and, if found, the USQCD info record.
+  //
+  // As implemented, the file must contain a binary data record, an ILDG
+  // format record and a scidac checksum record; each is asserted.
+  ////////////////////////////////////////////////////////////////
+  template <class vsimd>
+  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
+
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef typename GaugeField::vector_object  vobj;
+    typedef typename vobj::scalar_object sobj;
+
+    typedef LorentzColourMatrixF fobj;
+    typedef LorentzColourMatrixD dobj;
+
+    std::vector<int> dims = Umu._grid->FullDimensions();
+
+    assert(dims.size()==4);
+
+    // Metadata holders
+    ildgFormat     ildgFormat_    ;
+    scidacChecksum scidacChecksum_; 
+    usqcdInfo      usqcdInfo_     ;
+
+    // track what we read from file
+    int found_ildgFormat    =0;
+    int found_ildgLFN       =0;
+    int found_scidacChecksum=0;
+    int found_usqcdInfo     =0;
+    int found_ildgBinary =0;
+    int found_FieldMetaData =0;
+
+    uint32_t nersc_csum;
+    uint32_t scidac_csuma;
+    uint32_t scidac_csumb;
+
+    // Binary format
+    std::string format;
+
+    //////////////////////////////////////////////////////////////////////////
+    // Loop over all records
+    // -- Order is poorly guaranteed except ILDG header preceeds binary section.
+    // -- Run like an event loop.
+    // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
+    //    that Scidac. 
+    // -- Insist on Scidac checksum record.
+    //////////////////////////////////////////////////////////////////////////
+
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
+
+      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
+      
+      //////////////////////////////////////////////////////////////////
+      // If not BINARY_DATA read a string and parse
+      //////////////////////////////////////////////////////////////////
+      if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) )  ) {
+
+        // Copy out the string; one spare byte keeps it NUL terminated
+        std::vector<char> xmlc(nbytes+1,'\0');
+        limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
+
+        std::string xmlstring(&xmlc[0]);
+
+        //////////////////////////////////
+        // ILDG format record
+        //////////////////////////////////
+        if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 
+
+          XmlReader RD(xmlstring, true, "");
+          read(RD,"ildgFormat",ildgFormat_);
+
+          if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
+          if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
+
+          assert( ildgFormat_.lx == dims[0]);
+          assert( ildgFormat_.ly == dims[1]);
+          assert( ildgFormat_.lz == dims[2]);
+          assert( ildgFormat_.lt == dims[3]);
+
+          found_ildgFormat = 1;
+        }
+
+        //////////////////////////////////
+        // ILDG logical file name record
+        //////////////////////////////////
+        if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
+          FieldMetaData_.ildg_lfn = xmlstring;
+          found_ildgLFN = 1;
+        }
+
+        //////////////////////////////////
+        // Grid metadata record.  The compared length is that of the string
+        // actually being matched.
+        //////////////////////////////////
+        if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(GRID_FORMAT)) ) { 
+
+          XmlReader RD(xmlstring, true, "");
+          read(RD,"FieldMetaData",FieldMetaData_);
+
+          format = FieldMetaData_.floating_point;
+
+          assert(FieldMetaData_.dimension[0] == dims[0]);
+          assert(FieldMetaData_.dimension[1] == dims[1]);
+          assert(FieldMetaData_.dimension[2] == dims[2]);
+          assert(FieldMetaData_.dimension[3] == dims[3]);
+
+          found_FieldMetaData = 1;
+        }
+
+        //////////////////////////////////
+        // SciDAC record xml: only used when it carries a USQCD info field
+        //////////////////////////////////
+        if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
+          if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) { 
+            XmlReader RD(xmlstring, true, "");
+            read(RD,"usqcdInfo",usqcdInfo_);
+            found_usqcdInfo = 1;
+          }
+        }
+
+        //////////////////////////////////
+        // SciDAC checksum record
+        //////////////////////////////////
+        if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
+          XmlReader RD(xmlstring, true, "");
+          read(RD,"scidacChecksum",scidacChecksum_);
+          found_scidacChecksum = 1;
+        }
+
+      } else {  
+        /////////////////////////////////
+        // Binary data
+        /////////////////////////////////
+        std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
+
+        // The payload cannot be decoded unless an earlier record has told
+        // us its floating point format.
+        assert(!format.empty());
+
+        uint64_t offset= ftello(File);
+        if ( format == std::string("IEEE64BIG") ) {
+          GaugeSimpleMunger<dobj, sobj> munge;
+          BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+        } else { 
+          GaugeSimpleMunger<fobj, sobj> munge;
+          BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+        }
+
+        found_ildgBinary = 1;
+      }
+
+    }
+
+    //////////////////////////////////////////////////////
+    // Minimally must find binary segment and checksum
+    // Since this is an ILDG reader require ILDG format
+    //////////////////////////////////////////////////////
+    assert(found_ildgBinary);
+    assert(found_ildgFormat);
+    assert(found_scidacChecksum);
+
+    // Must find something with the lattice dimensions
+    assert(found_FieldMetaData||found_ildgFormat);
+
+    if ( found_FieldMetaData ) {
+
+      std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
+
+    } else { 
+
+      assert(found_ildgFormat);
+      assert ( ildgFormat_.field == std::string("su3gauge") );
+
+      ///////////////////////////////////////////////////////////////////////////////////////
+      // Populate our Grid metadata as best we can
+      ///////////////////////////////////////////////////////////////////////////////////////
+
+      std::ostringstream vers; vers << ildgFormat_.version;
+      FieldMetaData_.hdr_version = vers.str();
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
+
+      FieldMetaData_.nd=4;
+      FieldMetaData_.dimension.resize(4);
+
+      FieldMetaData_.dimension[0] = ildgFormat_.lx ;
+      FieldMetaData_.dimension[1] = ildgFormat_.ly ;
+      FieldMetaData_.dimension[2] = ildgFormat_.lz ;
+      FieldMetaData_.dimension[3] = ildgFormat_.lt ;
+
+      if ( found_usqcdInfo ) { 
+        FieldMetaData_.plaquette = usqcdInfo_.plaq;
+        FieldMetaData_.link_trace= usqcdInfo_.linktr;
+        std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
+        std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
+        std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
+      } else { 
+        FieldMetaData_.plaquette = 0.0;
+        FieldMetaData_.link_trace= 0.0;
+        std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
+      }
+    }
+
+    ////////////////////////////////////////////////////////////
+    // Really really want to mandate a scidac checksum
+    ////////////////////////////////////////////////////////////
+    if ( found_scidacChecksum ) {
+      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
+      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
+      // The verification result is checked rather than thrown away
+      int verified = scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
+      assert( verified == 1 );
+      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
+      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
+      std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
+    } else { 
+      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
+      assert(0); // Can I insist always checksum ?
+    }
+
+    // Cross check the field just read against the recorded plaquette and
+    // link trace whenever the file supplied them.
+    if ( found_FieldMetaData || found_usqcdInfo ) {
+      FieldMetaData checker;
+      GaugeStatistics(Umu,checker);
+      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
+      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
+      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
+    }
+  }
+};
+
+}}
+
+//HAVE_LIME
+#endif
+
+#endif
@@ -0,0 +1,237 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/parallelIO/IldgIO.h
+
+Copyright (C) 2015
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_ILDGTYPES_IO_H
+#define GRID_ILDGTYPES_IO_H
+
+#ifdef HAVE_LIME
+extern "C" { // for linkage
+#include "lime.h"
+}
+
+namespace Grid {
+
+/////////////////////////////////////////////////////////////////////////////////
+// Data representation of records that enter ILDG and SciDac formats
+/////////////////////////////////////////////////////////////////////////////////
+
+#define GRID_FORMAT      "grid-format"
+#define ILDG_FORMAT      "ildg-format"
+#define ILDG_BINARY_DATA "ildg-binary-data"
+#define ILDG_DATA_LFN    "ildg-data-lfn"
+#define SCIDAC_CHECKSUM           "scidac-checksum"
+#define SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
+#define SCIDAC_FILE_XML           "scidac-file-xml"
+#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
+#define SCIDAC_RECORD_XML         "scidac-record-xml"
+#define SCIDAC_BINARY_DATA        "scidac-binary-data"
+// Unused SCIDAC records names; could move to support this functionality
+#define SCIDAC_SITELIST           "scidac-sitelist"
+
+  ////////////////////////////////////////////////////////////
+  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
+  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
+  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
+  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
+  ////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////
+// QIO uses mandatory "private" records fixed format
+// Private is in principle "opaque"; however, it can't be changed now because that would break existing
+// file compatibility, so it should be correct to assume the undocumented but de facto file structure.
+/////////////////////////////////////////////////////////////////////////////////
+
+// Serializable record holding only a placeholder integer (`dummy`),
+// which is zero-initialised on construction.
+struct emptyUserRecord : Serializable { 
+  GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
+  emptyUserRecord() : dummy(0) {}
+};
+
+////////////////////////
+// Scidac private file xml
+// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
+////////////////////////
+struct scidacFile : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
+                                  double, version,
+                                  int, spacetime,
+                                  std::string, dims, // must convert to int
+                                  int, volfmt);
+
+  // Parse the whitespace-separated integers held in `dims`.
+  // Parsing stops at the first token that is not an integer.
+  std::vector<int> getDimensions(void) { 
+    std::stringstream stream(dims);
+    std::vector<int> dimensions;
+    int n;
+    while(stream >> n){
+      dimensions.push_back(n);
+    }
+    return dimensions;
+  }
+
+  // Store `dimensions` in `dims` as decimal values separated by a single
+  // space, e.g. "16 16 16 32". No newline is emitted between entries, so the
+  // string stays on one line as in the example record above.
+  // An empty vector yields an empty string.
+  void setDimensions(std::vector<int> dimensions) { 
+    const char delimiter = ' ';
+    std::stringstream stream;
+    for(size_t i=0;i<dimensions.size();i++){ 
+      if ( i != 0 ) {
+        stream << delimiter;
+      }
+      stream << dimensions[i];
+    }
+    dims = stream.str();
+  }
+
+  // Default constructor: numeric members are left to the compiler-generated
+  // initialisation (i.e. not explicitly set here).
+  scidacFile() =default; // default constructor
+  // Constructor provides Grid: takes the dimension count and the full
+  // dimensions from `grid` and marks the file as single-file volume format.
+  scidacFile(GridBase * grid){
+    version      = 1.0;
+    spacetime    = grid->_ndimension;
+    setDimensions(grid->FullDimensions()); 
+    volfmt       = GRID_IO_SINGLEFILE;
+  }
+
+};
+
+///////////////////////////////////////////////////////////////////////
+// scidac-private-record-xml : example
+// <scidacRecord>
+// <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
+// <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
+// <typesize>144</typesize><datacount>4</datacount>
+// </scidacRecord>
+///////////////////////////////////////////////////////////////////////
+
+// Serializable mirror of the <scidacRecord> XML element shown in the example
+// above; member names match the XML tag names.
+struct scidacRecord : Serializable {
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
+                                  double, version,
+                                  std::string, date,
+				  int, recordtype,
+				  std::string, datatype,
+				  std::string, precision,
+				  int, colors,
+				  int, spins,
+				  int, typesize,
+				  int, datacount);
+
+  // Numeric members get explicit defaults (version 1.0, everything else 0);
+  // the std::string members default-construct to empty.
+  scidacRecord()
+  : version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
+  {}
+};
+
+////////////////////////
+// ILDG format
+////////////////////////
+// Serializable record with the ILDG format fields: format version, field
+// name, precision and the four lattice extents lx, ly, lz, lt.
+struct ildgFormat : Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
+				  double, version,
+				  std::string, field,
+				  int, precision,
+				  int, lx,
+				  int, ly,
+				  int, lz,
+				  int, lt);
+  // Give every numeric member a defined value (version 1.0, the rest 0) so a
+  // default-constructed record never carries indeterminate integers, in line
+  // with scidacRecord. `field` default-constructs to an empty string.
+  ildgFormat() : version(1.0), precision(0), lx(0), ly(0), lz(0), lt(0) {}
+};
+////////////////////////
+// USQCD info
+////////////////////////
+// Serializable record with the USQCD info fields: version, plaquette (`plaq`),
+// link trace (`linktr`) and a free-form `info` string.
+struct usqcdInfo : Serializable { 
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
+				  double, version,
+				  double, plaq,
+				  double, linktr,
+				  std::string, info);
+  // Initialise all numeric members so a default-constructed record never
+  // exposes indeterminate doubles; `info` default-constructs to empty.
+  usqcdInfo() : version(1.0), plaq(0.0), linktr(0.0) {}
+};
+////////////////////////
+// Scidac Checksum
+////////////////////////
+// Serializable record with a version number and the two checksum strings
+// `suma` and `sumb`.
+struct scidacChecksum : Serializable { 
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
+                                  double, version,
+                                  std::string, suma,
+                                  std::string, sumb);
+  // Only `version` needs an explicit value; the strings start out empty.
+  scidacChecksum() : version(1.0) {}
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Type:           scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Type:           
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////
+// Scidac private file xml 
+// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile> 
+////////////////////////                                                                                                                                                                              
+
+#if 0
+////////////////////////////////////////////////////////////////////////////////////////
+// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
+////////////////////////////////////////////////////////////////////////////////////////
+struct usqcdPropFile : Serializable { 
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
+				  double, version,
+				  std::string, type,
+				  std::string, info);
+  usqcdPropFile() { 
+    version=1.0; 
+  };
+};
+struct usqcdSourceInfo : Serializable { 
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
+				  double, version,
+				  std::string, info);
+  usqcdSourceInfo() { 
+    version=1.0; 
+  };
+};
+struct usqcdPropInfo : Serializable { 
+ public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
+				  double, version,
+				  int, spin,
+				  int, color,
+				  std::string, info);
+  usqcdPropInfo() { 
+    version=1.0; 
+  };
+};
+#endif
+
+}
+#endif
+#endif
@@ -0,0 +1,327 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/parallelIO/NerscIO.h
+
+    Copyright (C) 2015
+
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <map>
+#include <unistd.h>
+#include <sys/utsname.h>
+#include <pwd.h>
+
+namespace Grid {
+
+  ///////////////////////////////////////////////////////
+  // Precision mapping
+  ///////////////////////////////////////////////////////
+  // Map the real scalar type of `vobj` to a format tag: "IEEE64BIG" when it
+  // has the size of double, "IEEE32BIG" when it has the size of float, and an
+  // empty string otherwise.
+  template<class vobj> static std::string getFormatString (void)
+  {
+    typedef typename getPrecision<vobj>::real_scalar_type real_t;
+    const size_t width = sizeof(real_t);
+    if ( width == sizeof(double) ) {
+      return std::string("IEEE64BIG");
+    }
+    if ( width == sizeof(float) ) {
+      return std::string("IEEE32BIG");
+    }
+    return std::string();
+  }
+  ////////////////////////////////////////////////////////////////////////////////
+  // header specification/interpretation
+  ////////////////////////////////////////////////////////////////////////////////
+    // Header metadata record: lattice geometry (nd, dimension, boundary),
+    // payload offset (data_start), format strings, gauge observables
+    // (link_trace, plaquette), checksums and provenance strings.
+    class FieldMetaData : Serializable {
+    public:
+
+      GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
+				      int, nd,
+				      std::vector<int>, dimension,
+				      std::vector<std::string>, boundary,
+				      int, data_start,
+				      std::string, hdr_version,
+				      std::string, storage_format,
+				      double, link_trace,
+				      double, plaquette,
+				      uint32_t, checksum,
+				      uint32_t, scidac_checksuma,
+				      uint32_t, scidac_checksumb,
+				      unsigned int, sequence_number,
+				      std::string, data_type,
+				      std::string, ensemble_id,
+				      std::string, ensemble_label,
+				      std::string, ildg_lfn,
+				      std::string, creator,
+				      std::string, creator_hardware,
+				      std::string, creation_date,
+				      std::string, archive_date,
+				      std::string, floating_point);
+      // WARNING: uninitialised values might lead to twisted parallel IO
+      // issues; std::string members are fine because they initialise to
+      // size 0 as per the C++ standard.
+      // Defaults: 4 dimensions of extent 0, 4 empty boundary strings, and all
+      // numeric members zero.
+      FieldMetaData(void) 
+      : nd(4), dimension(4,0), boundary(4, ""), data_start(0),
+      link_trace(0.), plaquette(0.), checksum(0),
+      scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
+      {}
+    };
+
+  namespace QCD {
+
+    using namespace Grid;
+
+
+    //////////////////////////////////////////////////////////////////////
+    // Bit and Physical Checksumming and QA of data
+    //////////////////////////////////////////////////////////////////////
+    // Fill the geometry part of `header` from `grid`: number of dimensions,
+    // the full extent in each dimension, and a "PERIODIC" boundary label for
+    // every dimension. Also resets data_start to 0.
+    inline void GridMetaData(GridBase *grid,FieldMetaData &header)
+    {
+      const int ndim = grid->_ndimension;
+
+      header.nd         = ndim;
+      header.data_start = 0;
+      header.dimension.resize(ndim);
+      header.boundary.resize(ndim);
+
+      for(int mu=0;mu<ndim;mu++) {
+        header.dimension[mu] = grid->_fdimensions[mu];
+        header.boundary[mu]  = std::string("PERIODIC");
+      }
+    }
+
+    // Fill the provenance fields of `header`:
+    //   creator          - login name of the current uid (left untouched if the lookup fails)
+    //   creation_date    - local time formatted as "%c %Z" (empty if it cannot be formatted)
+    //   archive_date     - copy of creation_date
+    //   creator_hardware - "<nodename>-<machine>-<sysname>-<release>" from uname()
+    //                      (left untouched if uname() fails)
+    inline void MachineCharacteristics(FieldMetaData &header)
+    {
+      // Who
+      struct passwd *pw = getpwuid (getuid());
+      if (pw) header.creator = std::string(pw->pw_name); 
+
+      // When
+      // std::strftime is used instead of std::put_time so the timestamp is
+      // actually produced; previously the formatting call was commented out
+      // and both date fields were always written empty.
+      std::time_t t = std::time(nullptr);
+      const std::tm *tm_ = std::localtime(&t);
+      char stamp[128];
+      stamp[0] = '\0';
+      if ( tm_ != nullptr ) {
+        if ( std::strftime(stamp,sizeof(stamp),"%c %Z",tm_) == 0 ) {
+          stamp[0] = '\0'; // buffer contents are unspecified when strftime returns 0
+        }
+      }
+      header.creation_date = std::string(stamp);
+      header.archive_date  = header.creation_date;
+
+      // What
+      struct utsname name;
+      if ( uname(&name) == 0 ) { // only read the struct when the call succeeded
+        header.creator_hardware = std::string(name.nodename)+"-";
+        header.creator_hardware+= std::string(name.machine)+"-";
+        header.creator_hardware+= std::string(name.sysname)+"-";
+        header.creator_hardware+= std::string(name.release);
+      }
+    }
+
+// Stream the members of `field` to the output stream `s` as "KEY = value"
+// lines framed by BEGIN_HEADER / END_HEADER.
+//  - Exactly four DIMENSION_n and BOUNDARY_n entries are written, so
+//    field.dimension and field.boundary must hold at least four elements.
+//  - The three checksum values are written in hex with a field width of 10.
+//  - The macro expands to several statements and is not wrapped in a single
+//    block, so do not use it as the body of an unbraced if/else/loop.
+#define dump_meta_data(field, s)					\
+      s << "BEGIN_HEADER"      << std::endl;				\
+      s << "HDR_VERSION = "    << field.hdr_version    << std::endl;	\
+      s << "DATATYPE = "       << field.data_type      << std::endl;	\
+      s << "STORAGE_FORMAT = " << field.storage_format << std::endl;	\
+      for(int i=0;i<4;i++){						\
+	s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
+      }									\
+      s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
+      s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl; \
+      for(int i=0;i<4;i++){						\
+	s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;	\
+      }									\
+									\
+      s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
+      s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
+      s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
+      s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;	\
+      s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;	\
+      s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;	\
+      s << "CREATOR = "         << field.creator          << std::endl;	\
+      s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;	\
+      s << "CREATION_DATE = "   << field.creation_date    << std::endl;	\
+      s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;	\
+      s << "FLOATING_POINT = "  << field.floating_point   << std::endl;	\
+      s << "END_HEADER"         << std::endl;
+
+// Generic metadata preparation for a lattice field: records the floating
+// point format of `vobj`, zeroes the NERSC checksum, then fills the geometry
+// and machine/provenance fields of `header`.
+template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
+{
+  header.floating_point = getFormatString<vobj>();
+  header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
+  GridMetaData(field._grid,header);
+  MachineCharacteristics(header);
+}
+ // Single-precision gauge field: store its link trace and average plaquette
+ // in `header`, computed with the periodic single-precision WilsonLoops.
+ inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
+ {
+   typedef Grid::QCD::WilsonLoops<PeriodicGimplF> LoopsF;
+   header.link_trace = LoopsF::linkTrace(data);
+   header.plaquette  = LoopsF::avgPlaquette(data);
+ }
+ // Double-precision gauge field: store its link trace and average plaquette
+ // in `header`, computed with the periodic double-precision WilsonLoops.
+ inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
+ {
+   typedef Grid::QCD::WilsonLoops<PeriodicGimplD> LoopsD;
+   header.link_trace = LoopsD::linkTrace(data);
+   header.plaquette  = LoopsD::avgPlaquette(data);
+ }
+ // Single-precision gauge-field specialisation: as the generic version, but
+ // additionally records link trace and plaquette via GaugeStatistics.
+ template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
+ {
+   header.floating_point = getFormatString<vLorentzColourMatrixF>();
+   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
+   GridMetaData(field._grid,header);
+   GaugeStatistics(field,header);
+   MachineCharacteristics(header);
+ }
+ // Double-precision gauge-field specialisation: as the generic version, but
+ // additionally records link trace and plaquette via GaugeStatistics.
+ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
+ {
+   header.floating_point = getFormatString<vLorentzColourMatrixD>();
+   header.checksum       = 0x0; // Nersc checksum unused in ILDG, Scidac
+   GridMetaData(field._grid,header);
+   GaugeStatistics(field,header);
+   MachineCharacteristics(header);
+ }
+
+    //////////////////////////////////////////////////////////////////////
+    // Utilities ; these are QCD aware
+    //////////////////////////////////////////////////////////////////////
+    // For each of the Nd directions, overwrite row 2 of the colour matrix with
+    // adj() of the cross product of rows 0 and 1. Rows 0 and 1 are only read.
+    inline void reconstruct3(LorentzColourMatrix & cm)
+    {
+      // Column indices, named so the cross-product pattern below reads naturally.
+      const int x=0;
+      const int y=1;
+      const int z=2;
+      for(int mu=0;mu<Nd;mu++){
+	cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
+	cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
+	cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
+      }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Some data types for intermediate storage
+    ////////////////////////////////////////////////////////////////////////////////
+    template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
+
+    typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
+    typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
+    typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
+
+/////////////////////////////////////////////////////////////////////////////////
+// Simple classes for precision conversion
+/////////////////////////////////////////////////////////////////////////////////
+// Copies an `sobj` into an `fobj` one real scalar word at a time; the
+// per-word assignment performs any precision conversion. Both objects must
+// contain the same number of real words (checked with assert).
+template <class fobj, class sobj>
+struct BinarySimpleUnmunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+  
+  void operator()(sobj &in, fobj &out) {
+    const size_t dst_words = sizeof(fobj) / sizeof(fobj_stype);
+    const size_t src_words = sizeof(sobj) / sizeof(sobj_stype);
+    assert(dst_words == src_words);
+
+    // View both objects as flat arrays of their real scalar type.
+    sobj_stype *src = (sobj_stype *)&in;
+    fobj_stype *dst = (fobj_stype *)&out;
+
+    for (size_t w = 0; w < src_words; ++w) {
+      dst[w] = src[w];  // type conversion on the fly
+    }
+  }
+};
+
+// Copies an `fobj` into an `sobj` one real scalar word at a time; the
+// per-word assignment performs any precision conversion. Both objects must
+// contain the same number of real words (checked with assert).
+template <class fobj, class sobj>
+struct BinarySimpleMunger {
+  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
+  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
+
+  void operator()(fobj &in, sobj &out) {
+    const size_t src_words = sizeof(fobj) / sizeof(fobj_stype);
+    const size_t dst_words = sizeof(sobj) / sizeof(sobj_stype);
+    assert(src_words == dst_words);
+
+    // View both objects as flat arrays of their real scalar type.
+    fobj_stype *src = (fobj_stype *)&in;
+    sobj_stype *dst = (sobj_stype *)&out;
+
+    for (size_t w = 0; w < dst_words; ++w) {
+      dst[w] = src[w];  // type conversion on the fly
+    }
+  }
+};
+
+
+    // Element-wise copy of a full Nc x Nc colour matrix per direction from
+    // `in` (fobj) to `out` (sobj); assignment converts the element type.
+    template<class fobj,class sobj>
+    struct GaugeSimpleMunger{
+      void operator()(fobj &in, sobj &out) {
+        for (int dir = 0; dir < Nd; dir++) {
+          for (int row = 0; row < Nc; row++) {
+            for (int col = 0; col < Nc; col++) {
+              out(dir)()(row, col) = in(dir)()(row, col);
+            }
+          }
+        }
+      };
+    };
+
+    // Element-wise copy of a full Nc x Nc colour matrix per direction from
+    // `in` (sobj) to `out` (fobj); assignment converts the element type.
+    template <class fobj, class sobj>
+    struct GaugeSimpleUnmunger {
+
+      void operator()(sobj &in, fobj &out) {
+        for (int dir = 0; dir < Nd; dir++) {
+          for (int row = 0; row < Nc; row++) {
+            for (int col = 0; col < Nc; col++) {
+              out(dir)()(row, col) = in(dir)()(row, col);
+            }
+          }
+        }
+      };
+    };
+
+    // Expand a two-row (2x3) object `in` into a full matrix `out`: rows 0 and 1
+    // are copied for every direction, then reconstruct3() fills row 2.
+    template<class fobj,class sobj>
+    struct Gauge3x2munger{
+      void operator() (fobj &in,sobj &out){
+        for(int dir=0;dir<Nd;dir++){
+          for(int row=0;row<2;row++){
+            for(int col=0;col<3;col++){
+              out(dir)()(row,col) = in(dir)(row)(col);
+            }
+          }
+        }
+        reconstruct3(out);
+      }
+    };
+
+    // Reduce a full matrix `in` to a two-row (2x3) object `out` by copying
+    // rows 0 and 1 for every direction; row 2 is dropped.
+    template<class fobj,class sobj>
+    struct Gauge3x2unmunger{
+      void operator() (sobj &in,fobj &out){
+        for(int dir=0;dir<Nd;dir++){
+          for(int row=0;row<2;row++){
+            for(int col=0;col<3;col++){
+              out(dir)(row)(col) = in(dir)()(row,col);
+            }
+          }
+        }
+      }
+    };
+  }
+
+
+}
@@ -0,0 +1,363 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/parallelIO/NerscIO.h
+
+    Copyright (C) 2015
+
+    Author: Matt Spraggs <matthew.spraggs@gmail.com>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_NERSC_IO_H
+#define GRID_NERSC_IO_H
+
+namespace Grid {
+  namespace QCD {
+
+    using namespace Grid;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Write and read from fstream; compute header offset for payload
+    ////////////////////////////////////////////////////////////////////////////////
+    class NerscIO : public BinaryIO { 
+    public:
+
+      // Create `file`, or empty it if it already exists: the std::ofstream is
+      // opened with std::ios::out only and closed again when it goes out of scope.
+      static inline void truncate(std::string file){
+	std::ofstream fout(file,std::ios::out);
+      }
+  
+      // Write `field` as a text header at the beginning of `file`. The file is
+      // opened in read/write mode, so existing contents are not truncated.
+      // Stores the stream position after the header in field.data_start and
+      // returns it.
+      static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
+      {
+        std::ofstream header_stream(file,std::ios::in|std::ios::out);
+        header_stream.seekp(0,std::ios::beg);
+        dump_meta_data(field, header_stream);
+        field.data_start = header_stream.tellp();
+        return field.data_start;
+      }
+
+      // for the header-reader
+      // Parse the text header at the start of `file` into `field`.
+      //  - The first line must be BEGIN_HEADER (asserted).
+      //  - Every following "KEY = value" line up to END_HEADER is stored with
+      //    whitespace removed from both key and value.
+      //  - `grid` must be four dimensional and its full dimensions must match
+      //    DIMENSION_1..4 (asserted).
+      // Returns the stream offset just past the END_HEADER line, which is also
+      // stored in field.data_start.
+      // Terminates the program with a failure status if the file cannot be
+      // opened or ends before END_HEADER is seen (previously this looped
+      // forever). The std::stol/std::stod/std::stoul conversions throw if a
+      // numeric key is missing or malformed.
+      static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
+      {
+        std::map<std::string,std::string> header;
+        std::string line;
+
+        //////////////////////////////////////////////////
+        // read the header
+        //////////////////////////////////////////////////
+        std::ifstream fin(file);
+        if ( !fin.is_open() ) {
+          std::cerr << "NerscIO::readHeader: cannot open file " << file << std::endl;
+          exit(EXIT_FAILURE);
+        }
+
+        getline(fin,line); // read one line and insist it is BEGIN_HEADER
+
+        removeWhitespace(line);
+        std::cout << GridLogMessage << "* " << line << std::endl;
+
+        assert(line==std::string("BEGIN_HEADER"));
+
+        do {
+          // A failed read means the file ended (or errored) before END_HEADER.
+          if ( !getline(fin,line) ) {
+            std::cerr << "NerscIO::readHeader: END_HEADER not found in " << file << std::endl;
+            exit(EXIT_FAILURE);
+          }
+          std::cout << GridLogMessage << "* "<<line<< std::endl;
+          // Only lines with '=' preceded by a non-empty key are recorded.
+          const std::string::size_type eq = line.find("=");
+          if( eq != std::string::npos && eq > 0 ) {
+            std::string key=line.substr(0,eq);
+            std::string val=line.substr(eq+1);
+            removeWhitespace(key);
+            removeWhitespace(val);
+
+            header[key] = val;
+          }
+        } while( line.find("END_HEADER") == std::string::npos );
+
+        field.data_start = fin.tellg();
+
+        //////////////////////////////////////////////////
+        // chomp the values
+        //////////////////////////////////////////////////
+        field.hdr_version    = header["HDR_VERSION"];
+        field.data_type      = header["DATATYPE"];
+        field.storage_format = header["STORAGE_FORMAT"];
+
+        field.dimension[0] = std::stol(header["DIMENSION_1"]);
+        field.dimension[1] = std::stol(header["DIMENSION_2"]);
+        field.dimension[2] = std::stol(header["DIMENSION_3"]);
+        field.dimension[3] = std::stol(header["DIMENSION_4"]);
+
+        assert(grid->_ndimension == 4);
+        for(int d=0;d<4;d++){
+          assert(grid->_fdimensions[d]==field.dimension[d]);
+        }
+
+        field.link_trace = std::stod(header["LINK_TRACE"]);
+        field.plaquette  = std::stod(header["PLAQUETTE"]);
+
+        field.boundary[0] = header["BOUNDARY_1"];
+        field.boundary[1] = header["BOUNDARY_2"];
+        field.boundary[2] = header["BOUNDARY_3"];
+        field.boundary[3] = header["BOUNDARY_4"];
+
+        field.checksum = std::stoul(header["CHECKSUM"],0,16); // header stores the checksum in hex
+        field.ensemble_id      = header["ENSEMBLE_ID"];
+        field.ensemble_label   = header["ENSEMBLE_LABEL"];
+        field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
+        field.creator          = header["CREATOR"];
+        field.creator_hardware = header["CREATOR_HARDWARE"];
+        field.creation_date    = header["CREATION_DATE"];
+        field.archive_date     = header["ARCHIVE_DATE"];
+        field.floating_point   = header["FLOATING_POINT"];
+
+        return field.data_start;
+      }
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Now the meat: the object readers
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Read a NERSC gauge configuration from `file` into `Umu`.
+    //  - `header` receives the parsed file header (see readHeader).
+    //  - DATATYPE "4D_SU3_GAUGE" is read as two-row data and the third row is
+    //    reconstructed; "4D_SU3_GAUGE_3x3" is read as full matrices. Any other
+    //    data type hits assert(0).
+    //  - After reading, plaquette, link trace and NERSC checksum are recomputed
+    //    and compared with the header values.
+    // Terminates the program with a failure status on an unrecognised
+    // FLOATING_POINT string or on a checksum mismatch; plaquette and link
+    // trace mismatches trip the asserts.
+    template<class vsimd>
+    static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+					 FieldMetaData& header,
+					 std::string file)
+    {
+      uint64_t offset = readHeader(file,Umu._grid,header);
+
+      FieldMetaData clone(header);
+
+      std::string format(header.floating_point);
+
+      int ieee32big = (format == std::string("IEEE32BIG"));
+      int ieee32    = (format == std::string("IEEE32"));
+      int ieee64big = (format == std::string("IEEE64BIG"));
+      int ieee64    = (format == std::string("IEEE64"));
+
+      // Without a recognised precision nothing below would read the payload
+      // and the checksum comparison would use indeterminate values.
+      if ( !(ieee32 || ieee32big || ieee64 || ieee64big) ) {
+	std::cerr << " unsupported floating point format " << format << std::endl;
+	exit(EXIT_FAILURE);
+      }
+
+      uint32_t nersc_csum=0,scidac_csuma=0,scidac_csumb=0;
+      // depending on datatype, set up munger;
+      // munger is a function of <floating point, Real, data_type>
+      if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
+	if ( ieee32 || ieee32big ) {
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	    (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
+	}
+	if ( ieee64 || ieee64big ) {
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	    (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
+	}
+      } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
+	if ( ieee32 || ieee32big ) {
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
+	}
+	if ( ieee64 || ieee64big ) {
+	  BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+	    (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
+	     nersc_csum,scidac_csuma,scidac_csumb);
+	}
+      } else {
+	assert(0);
+      }
+
+      // Recompute the observables from the data just read.
+      GaugeStatistics(Umu,clone);
+
+      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
+	       <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
+      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
+	       <<" header    "<<header.plaquette<<std::endl;
+      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
+	       <<" header    "<<header.link_trace<<std::endl;
+
+      if ( fabs(clone.plaquette -header.plaquette ) >=  1.0e-5 ) { 
+	std::cout << " Plaquette mismatch "<<std::endl;
+	std::cout << Umu[0]<<std::endl;
+	std::cout << Umu[1]<<std::endl;
+      }
+      if ( nersc_csum != header.checksum ) { 
+	std::cerr << " checksum mismatch " << std::endl;
+	std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
+	std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
+	std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
+	// A corrupt file is an error: do not report success to the caller's shell.
+	exit(EXIT_FAILURE);
+      }
+      assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+      assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+      assert(nersc_csum == header.checksum );
+      
+      std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
+    }
+
+      // Write `Umu` to `file` as a NERSC configuration. The payload is always
+      // written as full 3x3 matrices in IEEE64BIG; `two_row` and `bits32` are
+      // accepted but not used. The boss rank writes the header once to find
+      // the payload offset, and again after the payload with the checksum
+      // filled in.
+      template<class vsimd>
+      static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+					    std::string file, 
+					    int two_row,
+					    int bits32)
+      {
+        typedef iLorentzColourMatrix<vsimd>  vobj;
+        typedef typename vobj::scalar_object sobj;
+        typedef LorentzColourMatrixD         fobj3D;
+
+        GridBase *grid = Umu._grid;
+
+        ///////////////////////////////////////////
+        // Following should become arguments
+        ///////////////////////////////////////////
+        FieldMetaData header;
+        header.sequence_number = 1;
+        header.ensemble_id     = "UKQCD";
+        header.ensemble_label  = "DWF";
+
+        GridMetaData(grid,header);
+        assert(header.nd==4);
+        GaugeStatistics(Umu,header);
+        MachineCharacteristics(header);
+
+        // Sod it -- always write 3x3 double
+        header.floating_point = std::string("IEEE64BIG");
+        header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+        GaugeSimpleUnmunger<fobj3D,sobj> munge;
+
+        // First header pass (boss only) fixes the payload offset, which is
+        // then broadcast from rank 0.
+        uint64_t offset;
+        if ( grid->IsBoss() ) { 
+          truncate(file);
+          offset = writeHeader(header,file);
+        }
+        grid->Broadcast(0,(void *)&offset,sizeof(offset));
+
+        uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+        BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
+                                                  nersc_csum,scidac_csuma,scidac_csumb);
+
+        // Second header pass now that the checksum is known.
+        header.checksum = nersc_csum;
+        if ( grid->IsBoss() ) { 
+          writeHeader(header,file);
+        }
+
+        std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
+                 <<std::hex<<header.checksum
+                 <<std::dec<<" plaq "<< header.plaquette <<std::endl;
+
+      }
+      ///////////////////////////////
+      // RNG state
+      ///////////////////////////////
+      // Write the serial and parallel RNG state to `file` behind a NERSC-style
+      // header. DATATYPE and FLOATING_POINT are set by whichever RNG_* macro is
+      // defined. The boss rank writes the header once to find the payload
+      // offset, and again after the payload with the checksum filled in.
+      static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
+      {
+        GridBase *grid = parallel._grid;
+
+        // Following should become arguments
+        FieldMetaData header;
+        header.sequence_number = 1;
+        header.ensemble_id     = "UKQCD";
+        header.ensemble_label  = "DWF";
+
+        GridMetaData(grid,header);
+        assert(header.nd==4);
+        header.link_trace = 0.0;
+        header.plaquette  = 0.0;
+        MachineCharacteristics(header);
+
+#ifdef RNG_RANLUX
+        header.floating_point = std::string("UINT64");
+        header.data_type      = std::string("RANLUX48");
+#endif
+#ifdef RNG_MT19937
+        header.floating_point = std::string("UINT32");
+        header.data_type      = std::string("MT19937");
+#endif
+#ifdef RNG_SITMO
+        header.floating_point = std::string("UINT64");
+        header.data_type      = std::string("SITMO");
+#endif
+
+        // First header pass (boss only) fixes the payload offset, which is
+        // then broadcast from rank 0.
+        uint64_t offset;
+        if ( grid->IsBoss() ) { 
+          truncate(file);
+          offset = writeHeader(header,file);
+        }
+        grid->Broadcast(0,(void *)&offset,sizeof(offset));
+
+        uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+        BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
+
+        // Second header pass now that the checksum is known.
+        header.checksum = nersc_csum;
+        if ( grid->IsBoss() ) { 
+          offset = writeHeader(header,file);
+        }
+
+        std::cout<<GridLogMessage 
+                 <<"Written NERSC RNG STATE "<<file<< " checksum "
+                 <<std::hex<<header.checksum
+                 <<std::dec<<std::endl;
+
+      }
+    
+      // Read the serial and parallel RNG state from `file`.
+      //  - `header` receives the parsed file header (see readHeader).
+      //  - FLOATING_POINT and DATATYPE must match the RNG selected at compile
+      //    time through the RNG_* macros (asserted).
+      // Terminates the program with a failure status if the NERSC checksum of
+      // the data read differs from the header value.
+      static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
+      {
+	GridBase *grid = parallel._grid;
+
+	uint64_t offset = readHeader(file,grid,header);
+
+	std::string format(header.floating_point);
+	std::string data_type(header.data_type);
+
+#ifdef RNG_RANLUX
+	assert(format == std::string("UINT64"));
+	assert(data_type == std::string("RANLUX48"));
+#endif
+#ifdef RNG_MT19937
+	assert(format == std::string("UINT32"));
+	assert(data_type == std::string("MT19937"));
+#endif
+#ifdef RNG_SITMO
+	assert(format == std::string("UINT64"));
+	assert(data_type == std::string("SITMO"));
+#endif
+
+	// Checksums start from a defined value so the comparison below never
+	// uses indeterminate data.
+	uint32_t nersc_csum=0,scidac_csuma=0,scidac_csumb=0;
+	BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
+
+	if ( nersc_csum != header.checksum ) { 
+	  std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
+	  // A corrupt file is an error: do not report success to the caller's shell.
+	  exit(EXIT_FAILURE);
+	}
+	assert(nersc_csum == header.checksum );
+
+	std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
+      }
+
+    };
+
+  }}
+#endif
@@ -0,0 +1,75 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/PerfCount.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/GridCore.h>
+#include <Grid/perfmon/PerfCount.h>
+
+namespace Grid {
+
+// CacheControl(L,O,R): builds a PERF_TYPE_HW_CACHE config word by OR-ing the
+// cache id, the operation shifted left by 8 and the result shifted left by 16.
+#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
+// RawConfig(A,B): builds a PERF_TYPE_RAW config word as (A<<8)|B.
+#define RawConfig(A,B) (A<<8|B)
+// Table of selectable counters, indexed by the integer passed to the
+// PerformanceCounter constructor.  Each entry is
+//   { perf type, perf config, printable name, normalisation }
+// where "normalisation" is itself an index into this table naming the second
+// counter that is opened alongside and used as the denominator in Report().
+// The table is empty on non-Linux builds.
+const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
+#ifdef __linux__
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." , INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." , CACHE_REFERENCES},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
+    // 4
+#ifdef KNL
+    // KNL build: raw event codes.  Index 4 (the L1D_READ_ACCESS enumerator) is
+    // ALL_LOADS in this branch.
+    // NOTE(review): ALL_LOADS appears twice with different normalisations --
+    // confirm this is intentional.
+    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
+    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
+    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
+    { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS  },
+    { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS  },
+    { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
+    { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
+    // 11
+#else
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS....",INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
+  // NOTE(review): the next entry repeats the previous one; presumably it keeps
+  // the entry count aligned with the KNL branch -- confirm before removing.
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
+    // 11
+#endif
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS.......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS.....",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
+    //15
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......",INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS....",INSTRUCTIONS}
+    //19
+  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
+#endif
+};
+}
@@ -0,0 +1,245 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/PerfCount.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <peterboyle@MacBook-Pro.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_PERFCOUNT_H
+#define GRID_PERFCOUNT_H
+
+#include <sys/time.h>
+#include <ctime>
+#include <chrono>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#ifdef __linux__
+#include <syscall.h>
+#include <linux/perf_event.h>
+#else
+#include <sys/syscall.h>
+#endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif
+
+namespace Grid {
+
+#ifdef __linux__
+// Direct wrapper around the perf_event_open system call.  All arguments are
+// forwarded unchanged; the result (a descriptor, or -1 on failure) passes
+// through an int before being widened to the long return type.
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+			    int cpu, int group_fd, unsigned long flags)
+{
+  const int descriptor = syscall(__NR_perf_event_open, hw_event, pid, cpu,
+				 group_fd, flags);
+  return descriptor;
+}
+#endif
+
+// Cycle counter and __SSC_* marker selection.
+//  - TIMERS_OFF defined: cyclecount() always returns 0 and the __SSC_* macros
+//    emit an inline-asm marker sequence.
+//  - otherwise: the __SSC_* macros expand to nothing and cyclecount() reads an
+//    architecture specific counter (0 where none is available).
+#ifdef TIMERS_OFF
+
+
+inline uint64_t cyclecount(void){ 
+  return 0;
+}
+// Loads `mark` into ebx and emits the byte sequence 0x64 0x67 0x90.
+// NOTE(review): presumably the marker recognised by Intel's simulation/tracing
+// tools (SSC marks) -- confirm.
+#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
+#define __SSC_STOP  __SSC_MARK(0x110)
+#define __SSC_START __SSC_MARK(0x111)
+
+
+#else
+
+// Markers compile away entirely when timers are enabled.
+#define __SSC_MARK(mark) 
+#define __SSC_STOP  
+#define __SSC_START 
+
+/*
+ * cycle counters arch dependent
+ */
+
+#ifdef __bgq__
+// Blue Gene/Q: read special purpose register 0x10C.
+inline uint64_t cyclecount(void){ 
+   uint64_t tmp;
+   asm volatile ("mfspr %0,0x10C" : "=&r" (tmp)  );
+   return tmp;
+}
+#elif defined __x86_64__
+// x86-64: time stamp counter via the __rdtsc intrinsic.
+inline uint64_t cyclecount(void){ 
+  return __rdtsc();
+  //  unsigned int dummy;
+  // return __rdtscp(&dummy);
+}
+#else
+
+// No known cycle counter on this architecture: report 0.
+inline uint64_t cyclecount(void){ 
+   return 0;
+}
+
+#endif
+
+#endif
+
+// Measures one hardware event together with a second "normalisation" event
+// using the Linux perf_event interface, plus the elapsed cycle count from
+// cyclecount().  On non-Linux builds only `elapsed` is maintained (and is 0).
+//
+// Usage: construct with an index into PerformanceCounterConfigs, then
+// Start() / Stop() around the region of interest and Report() to print.
+//
+// NOTE(review): the class holds raw file descriptors and is implicitly
+// copyable; a copy would close the same descriptors a second time.
+class PerformanceCounter {
+private:
+
+  // One row of the counter table: perf type/config, printable name and the
+  // table index of the counter used as the denominator in Report().
+  typedef struct { 
+  public:
+    uint32_t type;
+    uint64_t config;
+    const char *name;
+    int normalisation;
+  } PerformanceCounterConfig; 
+  
+  static const PerformanceCounterConfig PerformanceCounterConfigs [];
+
+public:
+
+  // Named indices into PerformanceCounterConfigs; only the first few rows have
+  // enumerators, the remainder are addressed by plain integers.
+  enum PerformanceCounterType {
+    CACHE_REFERENCES=0,
+    CACHE_MISSES=1,
+    CPUCYCLES=2,
+    INSTRUCTIONS=3,
+    L1D_READ_ACCESS=4,
+    PERFORMANCE_COUNTER_NUM_TYPES=19
+  };
+
+public:
+    
+  int PCT;                     // index of the selected counter
+
+  long long count;             // value of the selected counter at Stop()
+  long long cycles;            // value of the normalisation counter at Stop()
+  int fd;                      // descriptor of the selected counter, -1 if unavailable
+  int cyclefd;                 // descriptor of the normalisation counter, -1 if unavailable
+  unsigned long long elapsed;  // cyclecount() delta between Start() and Stop()
+  uint64_t begin;              // cyclecount() sampled in Start()
+
+  static int NumTypes(void){ 
+    return PERFORMANCE_COUNTER_NUM_TYPES;
+  }
+
+  // _pct selects the row of PerformanceCounterConfigs to measure.
+  PerformanceCounter(int _pct) {
+    // Give every member a defined value on all platforms; previously the
+    // non-Linux build left them uninitialised.
+    fd=-1;
+    cyclefd=-1;
+    count=0;
+    cycles=0;
+    elapsed=0;
+    begin=0;
+    PCT =_pct;
+#ifdef __linux__
+    assert(_pct>=0);
+    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
+    Open();
+#endif
+  }
+
+  // Open the selected counter and its normalisation counter for the calling
+  // process on any CPU.  Failures are reported on stderr and leave the
+  // corresponding descriptor at -1.
+  void Open(void) 
+  {
+#ifdef __linux__
+    // Release descriptors from an earlier Open() so a repeated call does not leak.
+    if ( fd      != -1 ) { ::close(fd);      fd      = -1; }
+    if ( cyclefd != -1 ) { ::close(cyclefd); cyclefd = -1; }
+
+    struct perf_event_attr pe;
+    memset(&pe, 0, sizeof(struct perf_event_attr));
+    pe.size = sizeof(struct perf_event_attr);
+
+    pe.disabled = 1;
+    pe.exclude_kernel = 1;
+    pe.exclude_hv = 1;
+    pe.inherit    = 1;
+
+    pe.type  = PerformanceCounterConfigs[PCT].type;
+    pe.config= PerformanceCounterConfigs[PCT].config;
+    const char * name = PerformanceCounterConfigs[PCT].name;
+    fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
+    if (fd == -1) {
+      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+      perror("Error is");
+    }
+    int norm = PerformanceCounterConfigs[PCT].normalisation;
+    pe.type  = PerformanceCounterConfigs[norm].type;
+    pe.config= PerformanceCounterConfigs[norm].config;
+    name = PerformanceCounterConfigs[norm].name;
+    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
+    if (cyclefd == -1) {
+      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+      perror("Error is");
+    }
+#endif
+  }
+
+  // Reset and enable whichever counters were opened, then sample the cycle counter.
+  void Start(void)
+  {
+#ifdef __linux__
+    if ( fd!= -1) {
+      ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+    }
+    // Checked separately: either descriptor may have failed to open on its own.
+    if ( cyclefd!= -1) {
+      ::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
+    }
+    begin  =cyclecount();
+#else
+    begin = 0;
+#endif
+  }
+
+  // Disable the counters, read their values into count/cycles and record the
+  // elapsed cycle count.  A counter that could not be opened reads as 0.
+  void Stop(void) {
+    count=0;
+    cycles=0;
+#ifdef __linux__
+    if ( fd!= -1)      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+    if ( cyclefd!= -1) ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
+    if ( fd!= -1) {
+      ssize_t got=::read(fd, &count, sizeof(long long));
+      // The original wrote assert(ign=...), an assignment that could never fail.
+      assert(got==(ssize_t)sizeof(long long));
+      (void)got; // silence unused warning when asserts are compiled out
+    }
+    if ( cyclefd!= -1) {
+      ssize_t got=::read(cyclefd, &cycles, sizeof(long long));
+      assert(got==(ssize_t)sizeof(long long));
+      (void)got;
+    }
+    elapsed = cyclecount() - begin;
+#else
+    elapsed = 0;
+#endif
+
+  }
+
+  // Print the last measurement; on Linux this includes the ratio of the
+  // selected counter to its normalisation counter.
+  void Report(void) {
+#ifdef __linux__
+    int N = PerformanceCounterConfigs[PCT].normalisation;
+    const char * sn = PerformanceCounterConfigs[N].name ;
+    const char * sc = PerformanceCounterConfigs[PCT].name;
+      std::printf("tsc = %llu %s = %llu  %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,(unsigned long long)cycles, 
+		  sc, (unsigned long long)count, sc,sn, (double)count/(double)cycles);
+#else
+    std::printf("%llu cycles \n", elapsed );
+#endif
+  }
+
+  // Close only the descriptors that were actually opened.
+  ~PerformanceCounter()
+  {
+#ifdef __linux__
+    if ( fd!= -1)      ::close(fd);
+    if ( cyclefd!= -1) ::close(cyclefd);
+#endif
+  }
+
+};
+
+}
+#endif
@@ -0,0 +1,245 @@
+#include <Grid/GridCore.h>
+#include <Grid/perfmon/PerfCount.h>
+#include <Grid/perfmon/Stat.h>
+
+namespace Grid { 
+
+// Shared flag: set once the first init() call has run pmu_init().
+bool PmuStat::pmu_initialized=false;
+
+
+// Attach a region name to this object, perform the one-time PMU setup if no
+// other PmuStat has done so yet, and zero the accumulated statistics.
+// Does nothing on non x86-64 builds.
+void PmuStat::init(const char *regname)
+{
+#ifdef __x86_64__
+  name = regname;
+  const bool first_use = !pmu_initialized;
+  if (first_use) {
+    std::cout<<"initialising pmu"<<std::endl;
+    pmu_initialized = true;
+    pmu_init();
+  }
+  clear();
+#endif
+}
+// Zero every cumulative statistic (x86-64 builds only).
+void PmuStat::clear(void)
+{
+#ifdef __x86_64__
+  // invocation count and timing totals
+  count = tregion = tcycles = 0;
+  // programmable and fixed counter totals
+  pmc0 = pmc1 = 0;
+  inst = cyc = ref = 0;
+  // memory traffic totals
+  reads = writes = 0;
+#endif
+}
+// Write the region name followed by each accumulated statistic, one per line,
+// to std::cout (x86-64 builds only).
+void PmuStat::print(void)
+{
+#ifdef __x86_64__
+  std::cout <<"Reg "<<std::string(name)<<":\n"
+            <<"  region "<<tregion<<std::endl
+            <<"  cycles "<<tcycles<<std::endl
+            <<"  inst   "<<inst   <<std::endl
+            <<"  cyc    "<<cyc    <<std::endl
+            <<"  ref    "<<ref    <<std::endl
+            <<"  pmc0   "<<pmc0   <<std::endl
+            <<"  pmc1   "<<pmc1   <<std::endl
+            <<"  count  "<<count  <<std::endl
+            <<"  reads  "<<reads  <<std::endl
+            <<"  writes "<<writes <<std::endl;
+#endif
+}
+// Begin a measured region: call pmu_start(), count the invocation, snapshot
+// the memory read/write counters and record the starting time stamp.
+// The time stamp is taken last so it sits closest to the measured region.
+// Does nothing on non x86-64 builds.
+void PmuStat::start(void)
+{
+#ifdef __x86_64__
+  pmu_start();
+  ++count;
+  xmemctrs(&mrstart, &mwstart);
+  tstart = __rdtsc();
+#endif
+}
+// Record the starting value of each counter for slot t (the second index of
+// `counters`).  Rows 0-1 hold rdpmc(0)/rdpmc(1), rows 2-4 the three counters
+// selected with bit 30 set, row 5 the time stamp counter.
+void PmuStat::enter(int t)
+{
+#ifdef __x86_64__
+  const int select_bit30 = (1<<30);
+  counters[0][t] = __rdpmc(0);
+  counters[1][t] = __rdpmc(1);
+  counters[2][t] = __rdpmc(select_bit30|0);
+  counters[3][t] = __rdpmc(select_bit30|1);
+  counters[4][t] = __rdpmc(select_bit30|2);
+  counters[5][t] = __rdtsc();
+#endif
+}
+// Replace each starting value stored by enter(t) with the amount the counter
+// advanced since then.  Counters are sampled in the same order as in enter().
+void PmuStat::exit(int t)
+{
+#ifdef __x86_64__
+  const int select_bit30 = (1<<30);
+  const uint64_t pmc0_now = __rdpmc(0);
+  counters[0][t] = pmc0_now - counters[0][t];
+  const uint64_t pmc1_now = __rdpmc(1);
+  counters[1][t] = pmc1_now - counters[1][t];
+  const uint64_t fix0_now = __rdpmc(select_bit30|0);
+  counters[2][t] = fix0_now - counters[2][t];
+  const uint64_t fix1_now = __rdpmc(select_bit30|1);
+  counters[3][t] = fix1_now - counters[3][t];
+  const uint64_t fix2_now = __rdpmc(select_bit30|2);
+  counters[4][t] = fix2_now - counters[4][t];
+  const uint64_t tsc_now  = __rdtsc();
+  counters[5][t] = tsc_now - counters[5][t];
+#endif
+}
+// Close a measured region: take the end time stamp and memory counters, call
+// pmu_stop(), then add the per-slot deltas of the first `nthreads` slots and
+// the region-wide time / memory-traffic deltas to the running totals.
+void PmuStat::accum(int nthreads)
+{
+#ifdef __x86_64__
+  tend = __rdtsc();
+  xmemctrs(&mrend, &mwend);
+  pmu_stop();
+
+  int slot = 0;
+  while (slot < nthreads) {
+    pmc0    += counters[0][slot];
+    pmc1    += counters[1][slot];
+    inst    += counters[2][slot];
+    cyc     += counters[3][slot];
+    ref     += counters[4][slot];
+    tcycles += counters[5][slot];
+    ++slot;
+  }
+
+  tregion += tend  - tstart;
+  reads   += mrend - mrstart;
+  writes  += mwend - mwstart;
+#endif
+}
+
+
+// PMU life-cycle hooks.  fini/start/stop are intentionally empty; init only
+// does work when _KNIGHTS_LANDING_ is defined, where it opens the uncore
+// counters via KNLsetup().
+void PmuStat::pmu_fini(void)
+{
+}
+void PmuStat::pmu_start(void)
+{
+}
+void PmuStat::pmu_stop(void)
+{
+}
+void PmuStat::pmu_init(void)
+{
+#ifdef _KNIGHTS_LANDING_
+  KNLsetup();
+#endif
+}
+// Report the current memory read (*mr) and write (*mw) counts.  With
+// _KNIGHTS_LANDING_ defined these are the sums of the EDC read and write
+// counters; otherwise both are reported as 0.
+void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
+{
+#ifdef _KNIGHTS_LANDING_
+  ctrs snapshot;
+  KNLreadctrs(snapshot);
+  uint64_t rd_total = 0;
+  uint64_t wr_total = 0;
+  for (int edc = 0; edc < NEDC; ++edc) {
+    rd_total += snapshot.edcrd[edc];
+    wr_total += snapshot.edcwr[edc];
+  }
+  *mr = rd_total;
+  *mw = wr_total;
+#else
+  *mw = 0;
+  *mr = 0;
+#endif
+}
+
+#ifdef _KNIGHTS_LANDING_
+
+struct knl_gbl_ PmuStat::gbl;
+
+#define PMU_MEM
+
+// Open one uncore perf event.
+//   ename : sysfs directory of the PMU box (its "type" file is read to obtain
+//           the perf type id)
+//   fd    : receives the opened perf descriptor
+//   event : event code, placed in the low bits of the config word
+//   umask : unit mask, placed at bit 8 of the config word
+// Any failure is fatal: a message is written to stderr and the process exits
+// with a non-zero status.
+void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
+{
+  char fname[1024];
+  snprintf(fname, sizeof(fname), "%s/type", ename);
+  FILE *fp = fopen(fname, "r");
+  if (fp == 0) {
+    ::fprintf(stderr, "open %s\n", fname);
+    ::exit(1);
+  }
+  int type;
+  int ret = fscanf(fp, "%d", &type);
+  fclose(fp);
+  // Checked explicitly so an unparsable file is caught even when asserts are
+  // compiled out (type would otherwise be used uninitialised).
+  if (ret != 1) {
+    ::fprintf(stderr, "parse %s\n", fname);
+    ::exit(1);
+  }
+  //  std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
+
+  struct perf_event_attr hw = {};
+  hw.size = sizeof(hw);
+  hw.type = type;
+  // see /sys/devices/uncore_*/format/*
+  // All of the events we are interested in are configured the same way, but
+  // that isn't always true. Proper code would parse the format files
+  hw.config = event | (umask << 8);
+  //hw.read_format = PERF_FORMAT_GROUP;
+  // unfortunately the above only works within a single PMU; might
+  // as well just read them one at a time
+  int cpu = 0;
+  fd = perf_event_open(&hw, -1, cpu, -1, 0);
+  if (fd == -1) {
+    // config is 64 bits wide: print it with a matching conversion.
+    ::fprintf(stderr, "CPU %d, box %s, event 0x%llx\n", cpu, ename, (unsigned long long)hw.config);
+    ::exit(1);
+  }
+}
+
+
+ // Open every uncore event used by the KNL memory statistics and store the
+ // descriptors in `gbl`: read/write queue inserts for each memory controller
+ // and each EDC (eclk box), then the four hit/miss events for each EDC (uclk box).
+ void PmuStat::KNLsetup(void){
+
+   char box[1024];
+
+   // MC RPQ inserts and WPQ inserts (reads & writes)
+   for (int mc = 0; mc < NMC; ++mc) {
+     ::snprintf(box, sizeof(box), "/sys/devices/uncore_imc_%d",mc);
+     KNLevsetup(box, gbl.mc_rd[mc], 0x1, 0x1);   // RPQ Inserts
+     KNLevsetup(box, gbl.mc_wr[mc], 0x2, 0x1);   // WPQ Inserts
+   }
+
+   // EDC RPQ inserts and WPQ inserts
+   for (int edc = 0; edc < NEDC; ++edc) {
+     ::snprintf(box, sizeof(box), "/sys/devices/uncore_edc_eclk_%d",edc);
+     KNLevsetup(box, gbl.edc_rd[edc], 0x1, 0x1); // RPQ inserts
+     KNLevsetup(box, gbl.edc_wr[edc], 0x2, 0x1); // WPQ inserts
+   }
+
+   // EDC HitE, HitM, MissE, MissM
+   for (int edc = 0; edc < NEDC; ++edc) {
+     ::snprintf(box, sizeof(box), "/sys/devices/uncore_edc_uclk_%d", edc);
+     KNLevsetup(box, gbl.edc_hite[edc],  0x2, 0x1);
+     KNLevsetup(box, gbl.edc_hitm[edc],  0x2, 0x2);
+     KNLevsetup(box, gbl.edc_misse[edc], 0x2, 0x4);
+     KNLevsetup(box, gbl.edc_missm[edc], 0x2, 0x8);
+   }
+ }
+
+// Read one 64-bit counter value from an open perf descriptor.
+// A failed or short read is fatal: the byte count returned by read() is
+// written to stderr and the process exits with a non-zero status.
+uint64_t PmuStat::KNLreadctr(int fd)
+{
+  uint64_t data;
+  // read() returns ssize_t; keep it signed so that -1 is reported as -1.
+  ssize_t s = ::read(fd, &data, sizeof(data));
+  if (s != (ssize_t)sizeof(uint64_t)){
+    ::fprintf(stderr, "read counter %zd\n", s);
+    ::exit(1);
+  }
+  return data;
+}
+
+// Fill `c` with the current value of every counter opened by KNLsetup():
+// memory controller reads/writes first, then EDC reads/writes, then the EDC
+// hit/miss counters.
+void PmuStat::KNLreadctrs(ctrs &c)
+{
+  for (int mc = 0; mc < NMC; ++mc) {
+    c.mcrd[mc] = KNLreadctr(gbl.mc_rd[mc]);
+    c.mcwr[mc] = KNLreadctr(gbl.mc_wr[mc]);
+  }
+
+  for (int edc = 0; edc < NEDC; ++edc) {
+    c.edcrd[edc] = KNLreadctr(gbl.edc_rd[edc]);
+    c.edcwr[edc] = KNLreadctr(gbl.edc_wr[edc]);
+  }
+
+  for (int edc = 0; edc < NEDC; ++edc) {
+    c.edchite[edc]  = KNLreadctr(gbl.edc_hite[edc]);
+    c.edchitm[edc]  = KNLreadctr(gbl.edc_hitm[edc]);
+    c.edcmisse[edc] = KNLreadctr(gbl.edc_misse[edc]);
+    c.edcmissm[edc] = KNLreadctr(gbl.edc_missm[edc]);
+  }
+}
+
+#endif
+}
@@ -0,0 +1,104 @@
+#ifndef _GRID_STAT_H
+#define _GRID_STAT_H
+
+#ifdef AVX512
+#define _KNIGHTS_LANDING_ROOTONLY
+#endif
+
+namespace Grid { 
+
+///////////////////////////////////////////////////////////////////////////////
+// Extra KNL counters from MCDRAM
+///////////////////////////////////////////////////////////////////////////////
+#ifdef _KNIGHTS_LANDING_
+#define NMC 6
+#define NEDC 8
+// Snapshot of the KNL uncore counter values, one slot per memory controller
+// (NMC) or per EDC (NEDC).  Filled by PmuStat::KNLreadctrs().
+struct ctrs
+{
+    uint64_t mcrd[NMC];      // memory controller read-queue inserts
+    uint64_t mcwr[NMC];      // memory controller write-queue inserts
+    uint64_t edcrd[NEDC];    // EDC read-queue inserts
+    uint64_t edcwr[NEDC];    // EDC write-queue inserts
+    uint64_t edchite[NEDC];  // EDC HitE events
+    uint64_t edchitm[NEDC];  // EDC HitM events
+    uint64_t edcmisse[NEDC]; // EDC MissE events
+    uint64_t edcmissm[NEDC]; // EDC MissM events
+};
+// Peter/Azusa:
+// Our modification of a code provided by Larry Meadows from Intel
+// Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
+// so is already public and in the linux kernel for KNL.
+// Perf event file descriptors for the KNL uncore counters; each array here
+// corresponds to the like-named value array in `ctrs`.  Opened by
+// PmuStat::KNLsetup() and read by PmuStat::KNLreadctrs().
+struct knl_gbl_
+{
+  int mc_rd[NMC];      // memory controller read-queue inserts
+  int mc_wr[NMC];      // memory controller write-queue inserts
+  int edc_rd[NEDC];    // EDC read-queue inserts
+  int edc_wr[NEDC];    // EDC write-queue inserts
+  int edc_hite[NEDC];  // EDC HitE
+  int edc_hitm[NEDC];  // EDC HitM
+  int edc_misse[NEDC]; // EDC MissE
+  int edc_missm[NEDC]; // EDC MissM
+};
+#endif
+///////////////////////////////////////////////////////////////////////////////
+
+// Accumulates performance counter statistics for a named code region.
+// Typical sequence: init(name) once; then per invocation start(), enter(t) /
+// exit(t) around the work of each slot t, and accum(nthreads) to fold the
+// per-slot deltas into the running totals; print() dumps the totals.
+// Most methods compile to no-ops unless __x86_64__ is defined.
+class PmuStat
+{
+    // Per-slot scratch values: counters[row][t].  enter(t) stores the starting
+    // value, exit(t) replaces it with the delta, accum() sums the deltas.
+    uint64_t counters[8][256];
+#ifdef _KNIGHTS_LANDING_
+    static struct knl_gbl_ gbl;   // descriptors of the KNL uncore events
+#endif
+    const char *name;             // region name given to init(); not copied
+
+    uint64_t reads;     // memory reads
+    uint64_t writes;    // memory writes
+    uint64_t mrstart;   // memory read counter at start of parallel region
+    uint64_t mrend;     // memory read counter at end of parallel region
+    uint64_t mwstart;   // memory write counter at start of parallel region
+    uint64_t mwend;     // memory write counter at end of parallel region
+
+    // cumulative counters
+    uint64_t count;     // number of invocations
+    uint64_t tregion;   // total time in parallel region (from thread 0)
+    uint64_t tcycles;   // total cycles inside parallel region
+    uint64_t inst, ref, cyc;   // fixed counters
+    uint64_t pmc0, pmc1;// pmu
+    // add memory counters here
+    // temp variables
+    uint64_t tstart;    // tsc at start of parallel region
+    uint64_t tend;      // tsc at end of parallel region
+    // row map for `counters`, as used by enter()/exit()/accum():
+    // 0 pmc0   (rdpmc 0)
+    // 1 pmc1   (rdpmc 1)
+    // 2 inst   (rdpmc with bit 30 set, index 0)
+    // 3 cyc    (rdpmc with bit 30 set, index 1)
+    // 4 ref    (rdpmc with bit 30 set, index 2)
+    // 5 tsc    (accumulated into tcycles)
+    // rows 6 and 7 are not referenced by those methods
+    static bool pmu_initialized;
+public:
+    static bool is_init(void){ return pmu_initialized;}
+    static void pmu_init(void);
+    static void pmu_fini(void);
+    static void pmu_start(void);
+    static void pmu_stop(void);
+    void accum(int nthreads);
+    static void xmemctrs(uint64_t *mr, uint64_t *mw);
+    void start(void);
+    void enter(int t);
+    void exit(int t);
+    void print(void);
+    void init(const char *regname);
+    void clear(void);
+#ifdef _KNIGHTS_LANDING_
+    static void     KNLsetup(void);
+    static uint64_t KNLreadctr(int fd);
+    static void     KNLreadctrs(ctrs &c);
+    static void     KNLevsetup(const char *ename, int &fd, int event, int umask);
+#endif
+    
+  };
+
+}
+#endif
+
+
@@ -0,0 +1,125 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Timer.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_TIME_H
+#define GRID_TIME_H
+
+#include <sys/time.h>
+#include <ctime>
+#include <chrono>
+
+namespace Grid {
+
+
+  // Dress the output; use std::chrono
+
+// C++11 time facilities better?
+// Wall-clock time in microseconds since the epoch, as a double.
+// When TIMERS_ON is not defined no clock is read and 0.0 is returned.
+inline double usecond(void) {
+  struct timeval tv;
+  // Start from a defined value: without TIMERS_ON nothing else writes tv, and
+  // the original then returned an indeterminate result.
+  tv.tv_sec  = 0;
+  tv.tv_usec = 0;
+#ifdef TIMERS_ON
+  gettimeofday(&tv,NULL);
+#endif
+  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+}
+
+// Clock and duration aliases used by the Grid timing utilities.
+typedef  std::chrono::system_clock          GridClock;
+typedef  std::chrono::time_point<GridClock> GridTimePoint;
+
+typedef  std::chrono::seconds               GridSecs;
+typedef  std::chrono::milliseconds          GridMillisecs;
+typedef  std::chrono::microseconds          GridUsecs;
+typedef  std::chrono::microseconds          GridTime;
+
+// Print a whole number of seconds as "<n> s".
+inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
+{
+  stream << time.count()<<" s";
+  return stream;
+}
+// Print milliseconds as "<seconds>.<3 digit fraction> s".
+// The fraction is zero padded; the caller's fill character is restored
+// afterwards so later output on the same stream is not zero padded by accident.
+inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
+{
+  GridSecs second(1);
+  auto     secs       = now/second ; 
+  auto     subseconds = now%second ; 
+  stream << secs<<".";
+  const char saved_fill = stream.fill('0');
+  stream << std::setw(3)<<subseconds.count();
+  stream.fill(saved_fill);
+  stream << " s";
+  return stream;
+}
+// Print microseconds as "<seconds>.<6 digit fraction> s"; the fill character
+// is handled as in the milliseconds overload.
+inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
+{
+  GridSecs second(1);
+  auto     seconds    = now/second ; 
+  auto     subseconds = now%second ; 
+  stream << seconds<<".";
+  const char saved_fill = stream.fill('0');
+  stream << std::setw(6)<<subseconds.count();
+  stream.fill(saved_fill);
+  stream << " s";
+  return stream;
+}
+
+
+// Accumulating stopwatch.  Start()/Stop() pairs add the time between them to
+// an internal microsecond total; Elapsed()/useconds() return that total and
+// may only be called while stopped.  When TIMERS_ON is not defined the clock
+// is never read and the total stays at zero.
+class GridStopWatch {
+private:
+  bool running;            // true between Start() and Stop()
+  GridTimePoint start;     // time of the most recent Start()/Reset()
+  GridUsecs accumulator;   // total of all completed intervals
+public:
+  GridStopWatch () { 
+    Reset();
+  }
+  // Begin an interval; the watch must be stopped.
+  void     Start(void) { 
+    assert(!running);
+#ifdef TIMERS_ON
+    start = GridClock::now(); 
+#endif
+    running = true;
+  }
+  // End the current interval and add it to the total; the watch must be running.
+  void     Stop(void)  { 
+    assert(running);
+#ifdef TIMERS_ON
+    const GridTimePoint finish = GridClock::now();
+    accumulator += std::chrono::duration_cast<GridUsecs>(finish - start); 
+#endif
+    running = false; 
+  }
+  // Stop the watch and discard the accumulated total.
+  void     Reset(void){
+    running = false;
+#ifdef TIMERS_ON
+    start = GridClock::now();
+#endif
+    accumulator = GridUsecs::zero();
+  }
+  // Accumulated time as a duration; the watch must be stopped.
+  GridTime Elapsed(void) {
+    assert(!running);
+    return GridTime(accumulator);
+  }
+  // Accumulated time in microseconds; the watch must be stopped.
+  uint64_t useconds(void){
+    assert(!running);
+    return static_cast<uint64_t>(accumulator.count());
+  }
+  bool isRunning(void){
+    return running;
+  }
+};
+
+}
+#endif
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.6
+ * pugixml parser - version 1.9
 * --------------------------------------------------------
- * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at http://pugixml.org/
 *
 * This library is distributed under the MIT License. See notice at the end
@@ -17,6 +17,9 @@
 // Uncomment this to enable wchar_t mode
 // #define PUGIXML_WCHAR_MODE

+// Uncomment this to enable compact mode
+// #define PUGIXML_COMPACT
+
 // Uncomment this to disable XPath
 // #define PUGIXML_NO_XPATH

@@ -46,7 +49,7 @@
 #endif

 /**
- * Copyright (c) 2006-2015 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
@@ -59,7 +62,7 @@
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.6
+ * pugixml parser - version 1.9
 * --------------------------------------------------------
- * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at http://pugixml.org/
 *
 * This library is distributed under the MIT License. See notice at the end
@@ -13,11 +13,11 @@

 #ifndef PUGIXML_VERSION
 // Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
-#	define PUGIXML_VERSION 160
+#	define PUGIXML_VERSION 190
 #endif

 // Include user configuration file (this can define various configuration macros)
-#include <pugixml/pugiconfig.hpp>
+#include "pugiconfig.hpp"

 #ifndef HEADER_PUGIXML_HPP
 #define HEADER_PUGIXML_HPP
@@ -72,6 +72,44 @@
 #	endif
 #endif

+// If the platform is known to have move semantics support, compile move ctor/operator implementation
+#ifndef PUGIXML_HAS_MOVE
+#	if __cplusplus >= 201103
+#		define PUGIXML_HAS_MOVE
+#	elif defined(_MSC_VER) && _MSC_VER >= 1600
+#		define PUGIXML_HAS_MOVE
+#	endif
+#endif
+
+// If C++ is 2011 or higher, add 'noexcept' specifiers
+#ifndef PUGIXML_NOEXCEPT
+#	if __cplusplus >= 201103
+#		define PUGIXML_NOEXCEPT noexcept
+#	elif defined(_MSC_VER) && _MSC_VER >= 1900
+#		define PUGIXML_NOEXCEPT noexcept
+#	else
+#		define PUGIXML_NOEXCEPT
+#	endif
+#endif
+
+// Some functions can not be noexcept in compact mode
+#ifdef PUGIXML_COMPACT
+#	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT
+#else
+#	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT
+#endif
+
+// If C++ is 2011 or higher, add 'override' qualifiers
+#ifndef PUGIXML_OVERRIDE
+#	if __cplusplus >= 201103
+#		define PUGIXML_OVERRIDE override
+#	elif defined(_MSC_VER) && _MSC_VER >= 1700
+#		define PUGIXML_OVERRIDE override
+#	else
+#		define PUGIXML_OVERRIDE
+#	endif
+#endif
+
 // Character interface macros
 #ifdef PUGIXML_WCHAR_MODE
 #	define PUGIXML_TEXT(t) L ## t
@@ -133,13 +171,13 @@ namespace pugi

 	// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
 	const unsigned int parse_eol = 0x0020;
-	
+
 	// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
 	const unsigned int parse_wconv_attribute = 0x0040;

 	// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
 	const unsigned int parse_wnorm_attribute = 0x0080;
-	
+
 	// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
 	const unsigned int parse_declaration = 0x0100;

@@ -158,6 +196,11 @@ namespace pugi
 	// is a valid document. This flag is off by default.
 	const unsigned int parse_fragment = 0x1000;

+	// This flag determines if plain character data is to be stored in the parent element's value. This significantly changes the structure of
+	// the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments.
+	// This flag is off by default.
+	const unsigned int parse_embed_pcdata = 0x2000;
+
 	// The default parsing mode.
 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
@@ -184,16 +227,16 @@ namespace pugi
 	};

 	// Formatting flags
-	
+
 	// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
 	const unsigned int format_indent = 0x01;
-	
+
 	// Write encoding-specific BOM to the output stream. This flag is off by default.
 	const unsigned int format_write_bom = 0x02;

 	// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
 	const unsigned int format_raw = 0x04;
-	
+
 	// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
 	const unsigned int format_no_declaration = 0x08;

@@ -206,6 +249,9 @@ namespace pugi
 	// Write every attribute on a new line with appropriate indentation. This flag is off by default.
 	const unsigned int format_indent_attributes = 0x40;

+	// Don't output empty element tags, instead writing an explicit start and end tag even if there are no children. This flag is off by default.
+	const unsigned int format_no_empty_element_tags = 0x80;
+
 	// The default set of formatting flags.
 	// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
 	const unsigned int format_default = format_indent;
@@ -225,7 +271,7 @@ namespace pugi
 	class xml_node;

 	class xml_text;
-	
+
 	#ifndef PUGIXML_NO_XPATH
 	class xpath_node;
 	class xpath_node_set;
@@ -268,7 +314,7 @@ namespace pugi
 		// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
 		xml_writer_file(void* file);

-		virtual void write(const void* data, size_t size);
+		virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;

 	private:
 		void* file;
@@ -283,7 +329,7 @@ namespace pugi
 		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
 		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);

-		virtual void write(const void* data, size_t size);
+		virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;

 	private:
 		std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
@@ -299,13 +345,13 @@ namespace pugi

 	private:
 		xml_attribute_struct* _attr;
-	
+
 		typedef void (*unspecified_bool_type)(xml_attribute***);

 	public:
 		// Default constructor. Constructs an empty attribute.
 		xml_attribute();
-		
+
 		// Constructs attribute from internal pointer
 		explicit xml_attribute(xml_attribute_struct* attr);

@@ -354,6 +400,8 @@ namespace pugi
 		// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
 		bool set_value(int rhs);
 		bool set_value(unsigned int rhs);
+		bool set_value(long rhs);
+		bool set_value(unsigned long rhs);
 		bool set_value(double rhs);
 		bool set_value(float rhs);
 		bool set_value(bool rhs);
@@ -367,6 +415,8 @@ namespace pugi
 		xml_attribute& operator=(const char_t* rhs);
 		xml_attribute& operator=(int rhs);
 		xml_attribute& operator=(unsigned int rhs);
+		xml_attribute& operator=(long rhs);
+		xml_attribute& operator=(unsigned long rhs);
 		xml_attribute& operator=(double rhs);
 		xml_attribute& operator=(float rhs);
 		xml_attribute& operator=(bool rhs);
@@ -417,7 +467,7 @@ namespace pugi

 		// Borland C++ workaround
 		bool operator!() const;
-	
+
 		// Comparison operators (compares wrapped node pointers)
 		bool operator==(const xml_node& r) const;
 		bool operator!=(const xml_node& r) const;
@@ -438,7 +488,7 @@ namespace pugi
 		// Get node value, or "" if node is empty or it has no value
 		// Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
 		const char_t* value() const;
-	
+
 		// Get attribute list
 		xml_attribute first_attribute() const;
 		xml_attribute last_attribute() const;
@@ -450,7 +500,7 @@ namespace pugi
 		// Get next/previous sibling in the children list of the parent node
 		xml_node next_sibling() const;
 		xml_node previous_sibling() const;
-		
+
 		// Get parent node
 		xml_node parent() const;

@@ -478,7 +528,7 @@ namespace pugi
 		// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
 		bool set_name(const char_t* rhs);
 		bool set_value(const char_t* rhs);
-		
+
 		// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
 		xml_attribute append_attribute(const char_t* name);
 		xml_attribute prepend_attribute(const char_t* name);
@@ -532,11 +582,11 @@ namespace pugi
 		template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
 		{
 			if (!_root) return xml_attribute();
-			
+
 			for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
 				if (pred(attrib))
 					return attrib;
-		
+
 			return xml_attribute();
 		}

@@ -544,11 +594,11 @@ namespace pugi
 		template <typename Predicate> xml_node find_child(Predicate pred) const
 		{
 			if (!_root) return xml_node();
-	
+
 			for (xml_node node = first_child(); node; node = node.next_sibling())
 				if (pred(node))
 					return node;
-		
+
 			return xml_node();
 		}

@@ -558,7 +608,7 @@ namespace pugi
 			if (!_root) return xml_node();

 			xml_node cur = first_child();
-			
+
 			while (cur._root && cur._root != _root)
 			{
 				if (pred(cur)) return cur;
@@ -590,7 +640,7 @@ namespace pugi

 		// Recursively traverse subtree with xml_tree_walker
 		bool traverse(xml_tree_walker& walker);
-	
+
 	#ifndef PUGIXML_NO_XPATH
 		// Select single node by evaluating XPath query. Returns first node from the resulting node set.
 		xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
@@ -601,11 +651,11 @@ namespace pugi
 		xpath_node_set select_nodes(const xpath_query& query) const;

 		// (deprecated: use select_node instead) Select single node by evaluating XPath query.
-		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
-		xpath_node select_single_node(const xpath_query& query) const;
+		PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+		PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const;

 	#endif
-		
+
 		// Print subtree using a writer object
 		void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;

@@ -701,6 +751,8 @@ namespace pugi
 		// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
 		bool set(int rhs);
 		bool set(unsigned int rhs);
+		bool set(long rhs);
+		bool set(unsigned long rhs);
 		bool set(double rhs);
 		bool set(float rhs);
 		bool set(bool rhs);
@@ -714,6 +766,8 @@ namespace pugi
 		xml_text& operator=(const char_t* rhs);
 		xml_text& operator=(int rhs);
 		xml_text& operator=(unsigned int rhs);
+		xml_text& operator=(long rhs);
+		xml_text& operator=(unsigned long rhs);
 		xml_text& operator=(double rhs);
 		xml_text& operator=(float rhs);
 		xml_text& operator=(bool rhs);
@@ -867,11 +921,11 @@ namespace pugi

 	private:
 		int _depth;
-	
+
 	protected:
 		// Get current traversal depth
 		int depth() const;
-	
+
 	public:
 		xml_tree_walker();
 		virtual ~xml_tree_walker();
@@ -942,13 +996,14 @@ namespace pugi
 		char_t* _buffer;

 		char _memory[192];
-		
+
 		// Non-copyable semantics
 		xml_document(const xml_document&);
-		const xml_document& operator=(const xml_document&);
+		xml_document& operator=(const xml_document&);

-		void create();
-		void destroy();
+		void _create();
+		void _destroy();
+		void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;

 	public:
 		// Default constructor, makes empty document
@@ -957,6 +1012,12 @@ namespace pugi
 		// Destructor, invalidates all node/attribute handles to this document
 		~xml_document();

+	#ifdef PUGIXML_HAS_MOVE
+		// Move semantics support
+		xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
+		xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
+	#endif
+
 		// Removes all nodes, leaving the empty document
 		void reset();

@@ -970,7 +1031,7 @@ namespace pugi
 	#endif

 		// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
-		xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+		PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default);

 		// Load document from zero-terminated string. No encoding conversions are applied.
 		xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
@@ -1051,7 +1112,7 @@ namespace pugi
 		// Non-copyable semantics
 		xpath_variable(const xpath_variable&);
 		xpath_variable& operator=(const xpath_variable&);
-		
+
 	public:
 		// Get variable name
 		const char_t* name() const;
@@ -1095,10 +1156,10 @@ namespace pugi
 		xpath_variable_set(const xpath_variable_set& rhs);
 		xpath_variable_set& operator=(const xpath_variable_set& rhs);

-	#if __cplusplus >= 201103
+	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_variable_set(xpath_variable_set&& rhs);
-		xpath_variable_set& operator=(xpath_variable_set&& rhs);
+		xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
+		xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
 	#endif

 		// Add a new variable or get the existing one, if the types match
@@ -1139,29 +1200,29 @@ namespace pugi
 		// Destructor
 		~xpath_query();

-	#if __cplusplus >= 201103
+	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_query(xpath_query&& rhs);
-		xpath_query& operator=(xpath_query&& rhs);
+		xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT;
+		xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT;
 	#endif

 		// Get query expression return type
 		xpath_value_type return_type() const;
-		
+
 		// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
 		bool evaluate_boolean(const xpath_node& n) const;
-		
+
 		// Evaluate expression as double value in the specified context; performs type conversion if necessary.
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
 		double evaluate_number(const xpath_node& n) const;
-		
+
 	#ifndef PUGIXML_NO_STL
 		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
 		string_t evaluate_string(const xpath_node& n) const;
 	#endif
-		
+
 		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
 		// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
 		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
@@ -1188,7 +1249,7 @@ namespace pugi
 		// Borland C++ workaround
 		bool operator!() const;
 	};
-	
+
 	#ifndef PUGIXML_NO_EXCEPTIONS
 	// XPath exception class
 	class PUGIXML_CLASS xpath_exception: public std::exception
@@ -1201,26 +1262,26 @@ namespace pugi
 		explicit xpath_exception(const xpath_parse_result& result);

 		// Get error message
-		virtual const char* what() const throw();
+		virtual const char* what() const throw() PUGIXML_OVERRIDE;

 		// Get parse result
 		const xpath_parse_result& result() const;
 	};
 	#endif
-	
+
 	// XPath node class (either xml_node or xml_attribute)
 	class PUGIXML_CLASS xpath_node
 	{
 	private:
 		xml_node _node;
 		xml_attribute _attribute;
-	
+
 		typedef void (*unspecified_bool_type)(xpath_node***);

 	public:
 		// Default constructor; constructs empty XPath node
 		xpath_node();
-		
+
 		// Construct XPath node from XML node/attribute
 		xpath_node(const xml_node& node);
 		xpath_node(const xml_attribute& attribute, const xml_node& parent);
@@ -1228,13 +1289,13 @@ namespace pugi
 		// Get node/attribute, if any
 		xml_node node() const;
 		xml_attribute attribute() const;
-		
+
 		// Get parent of contained node/attribute
 		xml_node parent() const;

 		// Safe bool conversion operator
 		operator unspecified_bool_type() const;
-		
+
 		// Borland C++ workaround
 		bool operator!() const;

@@ -1260,13 +1321,13 @@ namespace pugi
 			type_sorted,			// Sorted by document order (ascending)
 			type_sorted_reverse		// Sorted by document order (descending)
 		};
-		
+
 		// Constant iterator type
 		typedef const xpath_node* const_iterator;

 		// We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
 		typedef const xpath_node* iterator;
-	
+
 		// Default constructor. Constructs empty set.
 		xpath_node_set();

@@ -1275,49 +1336,49 @@ namespace pugi

 		// Destructor
 		~xpath_node_set();
-		
+
 		// Copy constructor/assignment operator
 		xpath_node_set(const xpath_node_set& ns);
 		xpath_node_set& operator=(const xpath_node_set& ns);

-	#if __cplusplus >= 201103
+	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_node_set(xpath_node_set&& rhs);
-		xpath_node_set& operator=(xpath_node_set&& rhs);
+		xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
+		xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
 	#endif

 		// Get collection type
 		type_t type() const;
-		
+
 		// Get collection size
 		size_t size() const;

 		// Indexing operator
 		const xpath_node& operator[](size_t index) const;
-		
+
 		// Collection iterators
 		const_iterator begin() const;
 		const_iterator end() const;

 		// Sort the collection in ascending/descending order by document order
 		void sort(bool reverse = false);
-		
+
 		// Get first node in the collection by document order
 		xpath_node first() const;
-		
+
 		// Check if collection is empty
 		bool empty() const;
-	
+
 	private:
 		type_t _type;
-		
+
 		xpath_node _storage;
-		
+
 		xpath_node* _begin;
 		xpath_node* _end;

 		void _assign(const_iterator begin, const_iterator end, type_t type);
-		void _move(xpath_node_set& rhs);
+		void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT;
 	};
 #endif

@@ -1325,7 +1386,7 @@ namespace pugi
 	// Convert wide string to UTF8
 	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
 	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
-	
+
 	// Convert UTF8 to wide string
 	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
 	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
@@ -1333,13 +1394,13 @@ namespace pugi

 	// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
 	typedef void* (*allocation_function)(size_t size);
-	
+
 	// Memory deallocation function interface
 	typedef void (*deallocation_function)(void* ptr);

 	// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
 	void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
-	
+
 	// Get current memory management functions
 	allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
 	deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
@@ -1375,7 +1436,7 @@ namespace std
 #endif

 /**
- * Copyright (c) 2006-2015 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
@@ -1388,7 +1449,7 @@ namespace std
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -1,6 +1,6 @@
-pugixml 1.6 - an XML processing library
+pugixml 1.9 - an XML processing library

-Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 Report bugs and download new versions at http://pugixml.org/

 This is the distribution of pugixml, which is a C++ XML processing library,
@@ -28,7 +28,7 @@ The distribution contains the following folders:

 This library is distributed under the MIT License:

-Copyright (c) 2006-2015 Arseny Kapoulkine
+Copyright (c) 2006-2018 Arseny Kapoulkine

 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
@@ -0,0 +1,124 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/QCD.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LT_H
+#define GRID_LT_H
+namespace Grid{
+
+// First steps in the complete generalization of the Physics part
+// Design not final
+namespace LatticeTheories {
+
+template <int Dimensions>  // Dimensions: compile-time dimension count, re-exported as Nd
+struct LatticeTheory {  // Base traits: dimension constants plus the singlet tensor alias
+  static const int Nd = Dimensions;  // number of dimensions
+  static const int Nds = Dimensions * 2;  // twice Nd: extent used for a double stored field
+  template <typename vtype>
+  using iSinglet = iScalar<iScalar<iScalar<vtype> > >;  // vtype wrapped in three iScalar levels
+};
+
+template <int Dimensions, int Colours>  // Dimensions: dimension count; Colours: re-exported as Nc
+struct LatticeGaugeTheory : public LatticeTheory<Dimensions> {  // adds the colour count and colour-indexed tensor aliases
+  static const int Nds = Dimensions * 2;  // same value as the base's Nds; presumably re-declared because dependent-base members are not found unqualified
+  static const int Nd = Dimensions;  // same value as the base's Nd
+  static const int Nc = Colours;  // size of the colour index
+
+  template <typename vtype> 
+  using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > >;  // Nc x Nc matrix at the innermost level
+  template <typename vtype>
+  using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd>;  // Nd-long outer vector of Nc x Nc matrices
+  template <typename vtype>
+  using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;  // as above, but with Nds = 2*Nd outer entries
+  template <typename vtype>
+  using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;  // Nc-long vector at the innermost level
+};
+
+template <int Dimensions, int Colours, int Spin>  // dimension count, colour count (Nc), spin count (Ns)
+struct FermionicLatticeGaugeTheory  // gauge-theory traits extended with spin-indexed tensor aliases
+    : public LatticeGaugeTheory<Dimensions, Colours> {
+  static const int Nd = Dimensions;  // re-declared locally: members of the dependent base are not found by unqualified lookup
+  static const int Nds = Dimensions * 2;  // twice Nd, as in the base classes
+  static const int Nc = Colours;  // size of the colour index
+  static const int Ns = Spin;  // size of the spin index
+
+  template <typename vtype>
+  using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;  // Ns x Ns matrix at the middle level, scalar inside
+  template <typename vtype>
+  using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;  // Ns x Ns matrix of Nc x Nc matrices
+  template <typename vtype>
+  using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;  // Ns-long vector at the middle level, scalar inside
+  template <typename vtype>
+  using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;  // Ns-long vector of Nc-long vectors
+  // Nhs and the two half-spin aliases below are only meaningful when Spin is a multiple of 2
+  static const int Nhs = Spin / 2;  // integer division: truncates when Spin is odd
+  template <typename vtype>
+  using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;  // as iSpinVector, with Nhs entries
+  template <typename vtype>
+  using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;  // as iSpinColourVector, with Nhs spin entries
+
+  // Test typedefs. iColourMatrix is a member alias template of the dependent base LatticeGaugeTheory<Dimensions, Colours>; unqualified lookup does not search dependent bases, so it is named with explicit typename/template qualification.
+  typedef typename LatticeGaugeTheory<Dimensions, Colours>::template iColourMatrix<Complex> ColourMatrix;
+  typedef typename LatticeGaugeTheory<Dimensions, Colours>::template iColourMatrix<ComplexF> ColourMatrixF;
+  typedef typename LatticeGaugeTheory<Dimensions, Colours>::template iColourMatrix<ComplexD> ColourMatrixD;
+
+
+};
+
+// Example theories built from the traits above; not yet complete.
+struct QCD : public FermionicLatticeGaugeTheory<4, 3, 4> {  // Dimensions=4, Colours=3, Spin=4
+    static const int Xp = 0;  // Xp..Tm: direction labels 0..7; presumably 'p'/'m' mean the positive/negative sense along X,Y,Z,T -- TODO confirm
+    static const int Yp = 1;
+    static const int Zp = 2;
+    static const int Tp = 3;
+    static const int Xm = 4;  // the 'm' labels are the matching 'p' label plus 4
+    static const int Ym = 5;
+    static const int Zm = 6;
+    static const int Tm = 7;
+
+    typedef FermionicLatticeGaugeTheory FLGT;  // injected-class-name of the base, i.e. FermionicLatticeGaugeTheory<4, 3, 4>
+
+    typedef FLGT::iSpinMatrix<Complex  >          SpinMatrix;   // spin matrix over Complex
+    typedef FLGT::iSpinMatrix<ComplexF >          SpinMatrixF;  // spin matrix over ComplexF
+    typedef FLGT::iSpinMatrix<ComplexD >          SpinMatrixD;  // spin matrix over ComplexD
+
+};
+struct QED : public FermionicLatticeGaugeTheory<4, 1, 4> {  // Dimensions=4, Colours=1, Spin=4; placeholder, body still to be filled in
+};
+
+template <int Dimensions>  // Dimensions: forwarded unchanged to LatticeTheory
+struct Scalar : public LatticeTheory<Dimensions> {};  // adds no members beyond the LatticeTheory<Dimensions> base
+
+};  // LatticeTheories
+
+} // Grid
+
+#endif
@@ -1,10 +1,45 @@
-#ifndef GRID_QCD_H
-#define GRID_QCD_H
-namespace Grid{
+    /*************************************************************************************

+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/QCD.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_QCD_BASE_H
+#define GRID_QCD_BASE_H
+namespace Grid{
 namespace QCD {

+    static const int Xdir = 0;
+    static const int Ydir = 1;
+    static const int Zdir = 2;
+    static const int Tdir = 3;

+  
    static const int Xp = 0;
    static const int Yp = 1;
    static const int Zp = 2;
@@ -24,10 +59,18 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
+    // note: static const int or constexpr will work for type deductions
+    //       with the intel compiler (up to version 17)
    //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
-    static const int SpinIndex   = 1;
-    static const int LorentzIndex= 0;
+    #define ColourIndex  2
+    #define SpinIndex    1
+    #define LorentzIndex 0
+
+    // Also should make these a named enum type
+    static const int DaggerNo=0;
+    static const int DaggerYes=1;
+    static const int InverseNo=0;
+    static const int InverseYes=1;

    // Useful traits is this a spin index
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
@@ -47,19 +90,22 @@ namespace QCD {
    // That probably makes for GridRedBlack4dCartesian grid.

    // s,sp,c,spc,lc
-    template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
-    template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
-    template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-    template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-    template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
-    template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
-    template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
-    template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
-    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

-    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+    template<typename vtype> using iSinglet                     = iScalar<iScalar<iScalar<vtype> > >;
+    template<typename vtype> using iSpinMatrix                  = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+    template<typename vtype> using iColourMatrix                = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+    template<typename vtype> using iSpinColourMatrix            = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+    template<typename vtype> using iLorentzColourMatrix         = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+    template<typename vtype> using iDoubleStoredColourMatrix    = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+    template<typename vtype> using iSpinVector                  = iScalar<iVector<iScalar<vtype>, Ns> >;
+    template<typename vtype> using iColourVector                = iScalar<iScalar<iVector<vtype, Nc> > >;
+    template<typename vtype> using iSpinColourVector            = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+    template<typename vtype> using iHalfSpinVector              = iScalar<iVector<iScalar<vtype>, Nhs> >;
+    template<typename vtype> using iHalfSpinColourVector        = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
+
+
+    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

    // Spin matrix
@@ -84,10 +130,28 @@ namespace QCD {
    typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
    typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
    typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
-
+    
    typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
    typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
    typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
+    
+    // SpinColourSpinColour matrix
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+
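+    // SpinColourSpinColour matrix (NOTE: the six typedefs below repeat the group just above verbatim; legal redeclarations, but redundant and safe to drop)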
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

    // LorentzColour
    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
@@ -186,6 +250,9 @@ namespace QCD {
    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;

+    typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
+    typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
+    typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;

    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
@@ -315,36 +382,36 @@ namespace QCD {
    //////////////////////////////////////////////
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
-		      int i)
+              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
+              int i)
    {
      PokeIndex<ColourIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
-		      int i,int j)
+              const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
+              int i,int j)
    {
      PokeIndex<ColourIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
-		      int i)
+              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
+              int i)
    {
      PokeIndex<SpinIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
-		      int i,int j)
+              const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
+              int i,int j)
    {
      PokeIndex<SpinIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeLorentz(Lattice<vobj> &lhs,
-		      const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
-		      int i)
+              const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
+              int i)
    {
      PokeIndex<LorentzIndex>(lhs,rhs,i);
    }
@@ -352,7 +419,6 @@ namespace QCD {
    //////////////////////////////////////////////
    // Poke scalars
    //////////////////////////////////////////////
-
    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
    {
      pokeIndex<SpinIndex>(lhs,rhs,i);
@@ -376,7 +442,43 @@ namespace QCD {
      pokeIndex<LorentzIndex>(lhs,rhs,i);
    }

-
+    //////////////////////////////////////////////
+    // Fermion <-> propagator assignments
+    //////////////////////////////////////////////
+    // FermToProp: copy fermion field f into propagator p at fixed second spin index s and second colour index c.
+    template <class Fimpl>
+      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
+    {
+      for(int j = 0; j < Ns; ++j)  // j runs over the spin index of f
+        {
+            auto pjs = peekSpin(p, j, s);  // spin entry (j,s) of the propagator
+            auto fj  = peekSpin(f, j);     // spin component j of the fermion
+            
+            for(int i = 0; i < Fimpl::Dimension; ++i)  // presumably Fimpl::Dimension is the colour extent -- TODO confirm
+            {
+                pokeColour(pjs, peekColour(fj, i), i, c);  // colour component i of fj -> colour entry (i,c) of pjs
+            }
+            pokeSpin(p, pjs, j, s);  // store the modified pjs back into p at spin entry (j,s)
+        }
+    }
+    
+    // PropToFerm: extract from propagator p the entries with second spin index s and second colour index c into fermion field f.
+    template <class Fimpl>
+      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
+    {
+        for(int j = 0; j < Ns; ++j)  // j runs over the spin index of f
+        {
+            auto pjs = peekSpin(p, j, s);  // spin entry (j,s) of the propagator
+            auto fj  = peekSpin(f, j);     // current spin component j of f; its colour components are overwritten below
+            
+            for(int i = 0; i < Fimpl::Dimension; ++i)  // presumably Fimpl::Dimension is the colour extent -- TODO confirm
+            {
+                pokeColour(fj, peekColour(pjs, i, c), i);  // colour entry (i,c) of pjs -> colour component i of fj
+            }
+            pokeSpin(f, fj, j);  // store the filled fj back into f at spin component j
+        }
+    }
+    
    //////////////////////////////////////////////
    // transpose array and scalar
    //////////////////////////////////////////////
@@ -417,20 +519,17 @@ namespace QCD {
      return traceIndex<ColourIndex>(lhs);
    }

+    //////////////////////////////////////////
+    // Current types
+    //////////////////////////////////////////
+    GRID_SERIALIZABLE_ENUM(Current, undef,
+                           Vector,  0,
+                           Axial,   1,
+                           Tadpole, 2);
+
 }   //namespace QCD
 } // Grid

-#include <qcd/utils/SpaceTimeGrid.h>
-#include <qcd/spin/Dirac.h>
-#include <qcd/spin/TwoSpinor.h>
-#include <qcd/utils/LinalgUtils.h>
-#include <qcd/utils/CovariantCshift.h>
-#include <qcd/utils/WilsonLoops.h>
-#include <qcd/utils/SUn.h>
-#include <qcd/action/Actions.h>
-#include <qcd/hmc/integrators/Integrator.h>
-#include <qcd/hmc/integrators/Integrator_algorithm.h>
-#include <qcd/hmc/HMC.h>


 #endif
--- a/Show More
+++ b/Show More