Merge branch 'release/v0.6.0'

Merge branch 'develop' into release/v0.6.0
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
2025-06-14 22:07:05 +01:00 · 2016-11-09 12:43:14 +00:00 · 2016-11-09 04:13:01 -08:00 · 2016-11-09 04:12:15 -08:00 · 2016-11-09 04:11:03 -08:00 · 2016-11-08 19:07:47 +00:00
382 changed files with 36844 additions and 19297 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,7 +5,6 @@
 *.o
 *.obj
 # Editor files #
 ################
 *~
@ -48,6 +47,7 @@ Config.h.in
 config.log
 config.status
 .deps
 *.inc
 # http://www.gnu.org/software/autoconf #
 ########################################
@ -62,19 +62,8 @@ stamp-h1
 config.sub
 config.guess
 INSTALL
-
+.dirstamp
-# Packages #
+ltmain.sh
 ############
 # it's better to unpack these files and commit the raw source
 # git has its own built in compression methods
 *.7z
 *.dmg
 *.gz
 *.iso
 *.jar
 *.rar
 *.tar
 *.zip
 # Logs and databases #
 ######################
@ -94,9 +83,22 @@ Thumbs.db
 # build directory #
 ###################
-build/*
+build*/*
 # IDE related files #
 #####################
 *.xcodeproj/*
 build.sh
 # Eigen source #
 ################
 lib/Eigen/*
 # FFTW source #
 ################
 lib/fftw/*
 # libtool macros #
 ##################
 m4/lt*
 m4/libtool.m4
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,106 @@
 # Travis CI configuration: builds the project with several compilers and runs
 # ./benchmarks/Benchmark_dwf after each of three configure variants
 # (single precision, double precision, single precision with MPI comms).
 language: cpp
 # Cache the local "clang" directory between builds; before_install only
 # downloads clang when clang/bin is missing.
 cache:
  directories:
    - clang
 # Build matrix: one OS X/clang job, two gcc jobs (4.9 and 5) and two clang
 # jobs that differ only in the clang release referenced by CLANG_LINK.
 matrix:
  include:
    # OS X job using the xcode7.2 image and its clang.
    - os:        osx
      osx_image: xcode7.2
      compiler: clang
    # gcc 4.9 job; VERSION is appended to $CC/$CXX in the install phase.
    - compiler: gcc
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.9
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    # gcc 5 job; same package set as above apart from the compiler package.
    - compiler: gcc
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-5
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    # clang 3.8.0 job; the compiler tarball is fetched from CLANG_LINK in before_install.
    # NOTE(review): g++-4.8 is presumably installed to supply the C++ standard
    # library for the downloaded clang -- confirm.
    - compiler: clang
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    # clang 3.7.0 job; identical to the previous entry except for CLANG_LINK.
    - compiler: clang
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 # Toolchain setup that runs before the install phase.
 before_install:
    # Remember the checkout directory; used below to build PATH/LD_LIBRARY_PATH.
    - export GRIDDIR=`pwd`
    # Linux + clang only: if clang/bin does not exist yet, download the
    # CLANG_LINK tarball, unpack it and move its contents into ./clang.
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
    # Linux + clang only: put the local clang binaries and libraries first on the search paths.
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    # OS X only: install dependencies through Homebrew.
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
    # NOTE(review): no osx + gcc entry is visible in the matrix above, so this
    # branch appears not to be exercised -- confirm before relying on it.
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
 install:
    # Append the matrix-provided VERSION suffix (e.g. "-4.9") to the compiler
    # names; VERSION is unset for the clang jobs, leaving CC/CXX unchanged.
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    # Print the resolved toolchain into the build log.
    - echo $PATH
    - which $CC
    - $CC  --version
    - which $CXX
    - $CXX --version
    # OS X only: add /usr/local/lib to the linker search path.
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
 script:
    # Out-of-tree build: all three configurations below reuse the same build directory.
    - ./bootstrap.sh
    - mkdir build
    - cd build
    # 1) single precision, SSE4, no communications layer.
    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1
    # NOTE(review): this only prints the text "make clean"; no clean is run
    # between configurations. Presumably intentional -- confirm.
    - echo make clean
    # 2) double precision, SSE4, no communications layer.
    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
    # Same as above: prints the text only, does not clean.
    - echo make clean
    # Linux only: map MPI_UINT32_T/MPI_UINT64_T onto MPI_UNSIGNED/MPI_UNSIGNED_LONG.
    # NOTE(review): presumably the packaged OpenMPI lacks these datatypes -- confirm.
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
    # 3) single precision, SSE4, MPI communications (mpi-auto).
    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
    - make -j4
    # Run the benchmark on 2 MPI ranks with --mpi 2.1.1.1; the launcher is
    # mpirun.openmpi on Linux and mpirun on OS X.
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
--- a/9
+++ b/9
@ -1,5 +1,4 @@
-Peter Boyle
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Azusa Yamaguchi
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Intel Parallel Computing Centre @ Higgs Centre for Theoretical Physics
+Author: Peter Boyle <peterboyle@MacBook-Pro.local>
-University of Edinburgh
+Author: paboyle <paboyle@ph.ed.ac.uk>
 Scotland, UK
--- a/876
+++ b/876
@ -1,622 +1,281 @@
-                    GNU GENERAL PUBLIC LICENSE
+                   GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
+                       Version 2, June 1991
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
                            Preamble
-  The GNU General Public License is a free, copyleft license for
+  The licenses for most software are designed to take away your
-software and other kinds of works.
+freedom to share and change it.  By contrast, the GNU General Public
-
+License is intended to guarantee your freedom to share and change free
-  The licenses for most software and other practical works are designed
+software--to make sure the software is free for all its users.  This
-to take away your freedom to share and change the works.  By contrast,
+General Public License applies to most of the Free Software
-the GNU General Public License is intended to guarantee your freedom to
+Foundation's software and to any other program whose authors commit to
-share and change all versions of a program--to make sure it remains free
+using it.  (Some other Free Software Foundation software is covered by
-software for all its users.  We, the Free Software Foundation, use the
+the GNU Lesser General Public License instead.)  You can apply it to
 GNU General Public License for most of our software; it applies also to
 any other work released this way by its authors.  You can apply it to
 your programs, too.
  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
+this service if you wish), that you receive source code or can get it
-want it, that you can change the software or use pieces of it in new
+if you want it, that you can change the software or use pieces of it
-free programs, and that you know you can do these things.
+in new free programs; and that you know you can do these things.
-  To protect your rights, we need to prevent others from denying you
+  To protect your rights, we need to make restrictions that forbid
-these rights or asking you to surrender the rights.  Therefore, you have
+anyone to deny you these rights or to ask you to surrender the rights.
-certain responsibilities if you distribute copies of the software, or if
+These restrictions translate to certain responsibilities for you if you
-you modify it: responsibilities to respect the freedom of others.
+distribute copies of the software, or if you modify it.
  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
+gratis or for a fee, you must give the recipients all the rights that
-freedoms that you received.  You must make sure that they, too, receive
+you have.  You must make sure that they, too, receive or can get the
-or can get the source code.  And you must show them these terms so they
+source code.  And you must show them these terms so they know their
-know their rights.
+rights.
-  Developers that use the GNU GPL protect your rights with two steps:
+  We protect your rights with two steps: (1) copyright the software, and
-(1) assert copyright on the software, and (2) offer you this License
+(2) offer you this license which gives you legal permission to copy,
-giving you legal permission to copy, distribute and/or modify it.
+distribute and/or modify the software.
-  For the developers' and authors' protection, the GPL clearly explains
+  Also, for each author's protection and ours, we want to make certain
-that there is no warranty for this free software.  For both users' and
+that everyone understands that there is no warranty for this free
-authors' sake, the GPL requires that modified versions be marked as
+software.  If the software is modified by someone else and passed on, we
-changed, so that their problems will not be attributed erroneously to
+want its recipients to know that what they have is not the original, so
-authors of previous versions.
+that any problems introduced by others will not reflect on the original
 authors' reputations.
-  Some devices are designed to deny users access to install or run
+  Finally, any free program is threatened constantly by software
-modified versions of the software inside them, although the manufacturer
+patents.  We wish to avoid the danger that redistributors of a free
-can do so.  This is fundamentally incompatible with the aim of
+program will individually obtain patent licenses, in effect making the
-protecting users' freedom to change the software.  The systematic
+program proprietary.  To prevent this, we have made it clear that any
-pattern of such abuse occurs in the area of products for individuals to
+patent must be licensed for everyone's free use or not licensed at all.
 use, which is precisely where it is most unacceptable.  Therefore, we
 have designed this version of the GPL to prohibit the practice for those
 products.  If such problems arise substantially in other domains, we
 stand ready to extend this provision to those domains in future versions
 of the GPL, as needed to protect the freedom of users.
  Finally, every program is threatened constantly by software patents.
 States should not allow patents to restrict development and use of
 software on general-purpose computers, but in those that do, we wish to
 avoid the special danger that patents applied to a free program could
 make it effectively proprietary.  To prevent this, the GPL assures that
 patents cannot be used to render the program non-free.
  The precise terms and conditions for copying, distribution and
 modification follow.
-                       TERMS AND CONDITIONS
+                    GNU GENERAL PUBLIC LICENSE
-
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-  0. Definitions.
+
-
+  0. This License applies to any program or other work which contains
-  "This License" refers to version 3 of the GNU General Public License.
+a notice placed by the copyright holder saying it may be distributed
-
+under the terms of this General Public License.  The "Program", below,
-  "Copyright" also means copyright-like laws that apply to other kinds of
+refers to any such program or work, and a "work based on the Program"
-works, such as semiconductor masks.
+means either the Program or any derivative work under copyright law:
-
+that is to say, a work containing the Program or a portion of it,
-  "The Program" refers to any copyrightable work licensed under this
+either verbatim or with modifications and/or translated into another
-License.  Each licensee is addressed as "you".  "Licensees" and
+language.  (Hereinafter, translation is included without limitation in
-"recipients" may be individuals or organizations.
+the term "modification".)  Each licensee is addressed as "you".
-
+
-  To "modify" a work means to copy from or adapt all or part of the work
+Activities other than copying, distribution and modification are not
-in a fashion requiring copyright permission, other than the making of an
+covered by this License; they are outside its scope.  The act of
-exact copy.  The resulting work is called a "modified version" of the
+running the Program is not restricted, and the output from the Program
-earlier work or a work "based on" the earlier work.
+is covered only if its contents constitute a work based on the
-
+Program (independent of having been made by running the Program).
-  A "covered work" means either the unmodified Program or a work based
+Whether that is true depends on what the Program does.
-on the Program.
+
-
+  1. You may copy and distribute verbatim copies of the Program's
-  To "propagate" a work means to do anything with it that, without
+source code as you receive it, in any medium, provided that you
-permission, would make you directly or secondarily liable for
+conspicuously and appropriately publish on each copy an appropriate
-infringement under applicable copyright law, except executing it on a
+copyright notice and disclaimer of warranty; keep intact all the
-computer or modifying a private copy.  Propagation includes copying,
+notices that refer to this License and to the absence of any warranty;
-distribution (with or without modification), making available to the
+and give any other recipients of the Program a copy of this License
-public, and in some countries other activities as well.
+along with the Program.
-
+
-  To "convey" a work means any kind of propagation that enables other
+You may charge a fee for the physical act of transferring a copy, and
-parties to make or receive copies.  Mere interaction with a user through
+you may at your option offer warranty protection in exchange for a fee.
-a computer network, with no transfer of a copy, is not conveying.
+
-
+  2. You may modify your copy or copies of the Program or any portion
-  An interactive user interface displays "Appropriate Legal Notices"
+of it, thus forming a work based on the Program, and copy and
-to the extent that it includes a convenient and prominently visible
+distribute such modifications or work under the terms of Section 1
-feature that (1) displays an appropriate copyright notice, and (2)
+above, provided that you also meet all of these conditions:
-tells the user that there is no warranty for the work (except to the
+
-extent that warranties are provided), that licensees may convey the
+    a) You must cause the modified files to carry prominent notices
-work under this License, and how to view a copy of this License.  If
+    stating that you changed the files and the date of any change.
-the interface presents a list of user commands or options, such as a
+
-menu, a prominent item in the list meets this criterion.
+    b) You must cause any work that you distribute or publish, that in
-
+    whole or in part contains or is derived from the Program or any
-  1. Source Code.
+    part thereof, to be licensed as a whole at no charge to all third
-
+    parties under the terms of this License.
-  The "source code" for a work means the preferred form of the work
+
-for making modifications to it.  "Object code" means any non-source
+    c) If the modified program normally reads commands interactively
-form of a work.
+    when run, you must cause it, when started running for such
-
+    interactive use in the most ordinary way, to print or display an
-  A "Standard Interface" means an interface that either is an official
+    announcement including an appropriate copyright notice and a
-standard defined by a recognized standards body, or, in the case of
+    notice that there is no warranty (or else, saying that you provide
-interfaces specified for a particular programming language, one that
+    a warranty) and that users may redistribute the program under
-is widely used among developers working in that language.
+    these conditions, and telling the user how to view a copy of this
-
+    License.  (Exception: if the Program itself is interactive but
-  The "System Libraries" of an executable work include anything, other
+    does not normally print such an announcement, your work based on
-than the work as a whole, that (a) is included in the normal form of
+    the Program is not required to print an announcement.)
-packaging a Major Component, but which is not part of that Major
+
-Component, and (b) serves only to enable use of the work with that
+These requirements apply to the modified work as a whole.  If
-Major Component, or to implement a Standard Interface for which an
+identifiable sections of that work are not derived from the Program,
-implementation is available to the public in source code form.  A
+and can be reasonably considered independent and separate works in
-"Major Component", in this context, means a major essential component
+themselves, then this License, and its terms, do not apply to those
-(kernel, window system, and so on) of the specific operating system
+sections when you distribute them as separate works.  But when you
-(if any) on which the executable work runs, or a compiler used to
+distribute the same sections as part of a whole which is a work based
-produce the work, or an object code interpreter used to run it.
+on the Program, the distribution of the whole must be on the terms of
-
+this License, whose permissions for other licensees extend to the
-  The "Corresponding Source" for a work in object code form means all
+entire whole, and thus to each and every part regardless of who wrote it.
-the source code needed to generate, install, and (for an executable
+
-work) run the object code and to modify the work, including scripts to
+Thus, it is not the intent of this section to claim rights or contest
-control those activities.  However, it does not include the work's
+your rights to work written entirely by you; rather, the intent is to
-System Libraries, or general-purpose tools or generally available free
+exercise the right to control the distribution of derivative or
-programs which are used unmodified in performing those activities but
+collective works based on the Program.
-which are not part of the work.  For example, Corresponding Source
+
-includes interface definition files associated with source files for
+In addition, mere aggregation of another work not based on the Program
-the work, and the source code for shared libraries and dynamically
+with the Program (or with a work based on the Program) on a volume of
-linked subprograms that the work is specifically designed to require,
+a storage or distribution medium does not bring the other work under
-such as by intimate data communication or control flow between those
+the scope of this License.
-subprograms and other parts of the work.
+
-
+  3. You may copy and distribute the Program (or a work based on it,
-  The Corresponding Source need not include anything that users
+under Section 2) in object code or executable form under the terms of
-can regenerate automatically from other parts of the Corresponding
+Sections 1 and 2 above provided that you also do one of the following:
-Source.
+
-
+    a) Accompany it with the complete corresponding machine-readable
-  The Corresponding Source for a work in source code form is that
+    source code, which must be distributed under the terms of Sections
-same work.
+    1 and 2 above on a medium customarily used for software interchange; or,
-
+
-  2. Basic Permissions.
+    b) Accompany it with a written offer, valid for at least three
-
+    years, to give any third party, for a charge no more than your
-  All rights granted under this License are granted for the term of
+    cost of physically performing source distribution, a complete
-copyright on the Program, and are irrevocable provided the stated
+    machine-readable copy of the corresponding source code, to be
-conditions are met.  This License explicitly affirms your unlimited
+    distributed under the terms of Sections 1 and 2 above on a medium
-permission to run the unmodified Program.  The output from running a
+    customarily used for software interchange; or,
-covered work is covered by this License only if the output, given its
+
-content, constitutes a covered work.  This License acknowledges your
+    c) Accompany it with the information you received as to the offer
-rights of fair use or other equivalent, as provided by copyright law.
+    to distribute corresponding source code.  (This alternative is
-
+    allowed only for noncommercial distribution and only if you
-  You may make, run and propagate covered works that you do not
+    received the program in object code or executable form with such
-convey, without conditions so long as your license otherwise remains
+    an offer, in accord with Subsection b above.)
-in force.  You may convey covered works to others for the sole purpose
+
-of having them make modifications exclusively for you, or provide you
+The source code for a work means the preferred form of the work for
-with facilities for running those works, provided that you comply with
+making modifications to it.  For an executable work, complete source
-the terms of this License in conveying all material for which you do
+code means all the source code for all modules it contains, plus any
-not control copyright.  Those thus making or running the covered works
+associated interface definition files, plus the scripts used to
-for you must do so exclusively on your behalf, under your direction
+control compilation and installation of the executable.  However, as a
-and control, on terms that prohibit them from making any copies of
+special exception, the source code distributed need not include
-your copyrighted material outside their relationship with you.
+anything that is normally distributed (in either source or binary
-
+form) with the major components (compiler, kernel, and so on) of the
-  Conveying under any other circumstances is permitted solely under
+operating system on which the executable runs, unless that component
-the conditions stated below.  Sublicensing is not allowed; section 10
+itself accompanies the executable.
-makes it unnecessary.
+
-
+If distribution of executable or object code is made by offering
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+access to copy from a designated place, then offering equivalent
-
+access to copy the source code from the same place counts as
-  No covered work shall be deemed part of an effective technological
+distribution of the source code, even though third parties are not
-measure under any applicable law fulfilling obligations under article
+compelled to copy the source along with the object code.
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
+
-similar laws prohibiting or restricting circumvention of such
+  4. You may not copy, modify, sublicense, or distribute the Program
-measures.
+except as expressly provided under this License.  Any attempt
-
+otherwise to copy, modify, sublicense or distribute the Program is
-  When you convey a covered work, you waive any legal power to forbid
+void, and will automatically terminate your rights under this License.
-circumvention of technological measures to the extent such circumvention
+However, parties who have received copies, or rights, from you under
-is effected by exercising rights under this License with respect to
+this License will not have their licenses terminated so long as such
-the covered work, and you disclaim any intention to limit operation or
+parties remain in full compliance.
-modification of the work as a means of enforcing, against the work's
+
-users, your or third parties' legal rights to forbid circumvention of
+  5. You are not required to accept this License, since you have not
-technological measures.
+signed it.  However, nothing else grants you permission to modify or
-
+distribute the Program or its derivative works.  These actions are
-  4. Conveying Verbatim Copies.
+prohibited by law if you do not accept this License.  Therefore, by
-
+modifying or distributing the Program (or any work based on the
-  You may convey verbatim copies of the Program's source code as you
+Program), you indicate your acceptance of this License to do so, and
-receive it, in any medium, provided that you conspicuously and
+all its terms and conditions for copying, distributing or modifying
-appropriately publish on each copy an appropriate copyright notice;
+the Program or works based on it.
-keep intact all notices stating that this License and any
+
-non-permissive terms added in accord with section 7 apply to the code;
+  6. Each time you redistribute the Program (or any work based on the
-keep intact all notices of the absence of any warranty; and give all
+Program), the recipient automatically receives a license from the
-recipients a copy of this License along with the Program.
+original licensor to copy, distribute or modify the Program subject to
-
+these terms and conditions.  You may not impose any further
-  You may charge any price or no price for each copy that you convey,
+restrictions on the recipients' exercise of the rights granted herein.
-and you may offer support or warranty protection for a fee.
+You are not responsible for enforcing compliance by third parties to
  5. Conveying Modified Source Versions.
  You may convey a work based on the Program, or the modifications to
 produce it from the Program, in the form of source code under the
 terms of section 4, provided that you also meet all of these conditions:
    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.
    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".
    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.
    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.
  A compilation of a covered work with other separate and independent
 works, which are not by their nature extensions of the covered work,
 and which are not combined with it such as to form a larger program,
 in or on a volume of a storage or distribution medium, is called an
 "aggregate" if the compilation and its resulting copyright are not
 used to limit the access or legal rights of the compilation's users
 beyond what the individual works permit.  Inclusion of a covered work
 in an aggregate does not cause this License to apply to the other
 parts of the aggregate.
  6. Conveying Non-Source Forms.
  You may convey a covered work in object code form under the terms
 of sections 4 and 5, provided that you also convey the
 machine-readable Corresponding Source under the terms of this License,
 in one of these ways:
    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.
    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.
    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.
    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.
    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.
  A separable portion of the object code, whose source code is excluded
 from the Corresponding Source as a System Library, need not be
 included in conveying the object code work.
  A "User Product" is either (1) a "consumer product", which means any
 tangible personal property which is normally used for personal, family,
 or household purposes, or (2) anything designed or sold for incorporation
 into a dwelling.  In determining whether a product is a consumer product,
 doubtful cases shall be resolved in favor of coverage.  For a particular
 product received by a particular user, "normally used" refers to a
 typical or common use of that class of product, regardless of the status
 of the particular user or of the way in which the particular user
 actually uses, or expects or is expected to use, the product.  A product
 is a consumer product regardless of whether the product has substantial
 commercial, industrial or non-consumer uses, unless such uses represent
 the only significant mode of use of the product.
  "Installation Information" for a User Product means any methods,
 procedures, authorization keys, or other information required to install
 and execute modified versions of a covered work in that User Product from
 a modified version of its Corresponding Source.  The information must
 suffice to ensure that the continued functioning of the modified object
 code is in no case prevented or interfered with solely because
 modification has been made.
  If you convey an object code work under this section in, or with, or
 specifically for use in, a User Product, and the conveying occurs as
 part of a transaction in which the right of possession and use of the
 User Product is transferred to the recipient in perpetuity or for a
 fixed term (regardless of how the transaction is characterized), the
 Corresponding Source conveyed under this section must be accompanied
 by the Installation Information.  But this requirement does not apply
 if neither you nor any third party retains the ability to install
 modified object code on the User Product (for example, the work has
 been installed in ROM).
  The requirement to provide Installation Information does not include a
 requirement to continue to provide support service, warranty, or updates
 for a work that has been modified or installed by the recipient, or for
 the User Product in which it has been modified or installed.  Access to a
 network may be denied when the modification itself materially and
 adversely affects the operation of the network or violates the rules and
 protocols for communication across the network.
  Corresponding Source conveyed, and Installation Information provided,
 in accord with this section must be in a format that is publicly
 documented (and with an implementation available to the public in
 source code form), and must require no special password or key for
 unpacking, reading or copying.
  7. Additional Terms.
  "Additional permissions" are terms that supplement the terms of this
 License by making exceptions from one or more of its conditions.
 Additional permissions that are applicable to the entire Program shall
 be treated as though they were included in this License, to the extent
 that they are valid under applicable law.  If additional permissions
 apply only to part of the Program, that part may be used separately
 under those permissions, but the entire Program remains governed by
 this License without regard to the additional permissions.
  When you convey a copy of a covered work, you may at your option
 remove any additional permissions from that copy, or from any part of
 it.  (Additional permissions may be written to require their own
 removal in certain cases when you modify the work.)  You may place
 additional permissions on material, added by you to a covered work,
 for which you have or can give appropriate copyright permission.
  Notwithstanding any other provision of this License, for material you
 add to a covered work, you may (if authorized by the copyright holders of
 that material) supplement the terms of this License with terms:
    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or
    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or
    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or
    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or
    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or
    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.
  All other non-permissive additional terms are considered "further
 restrictions" within the meaning of section 10.  If the Program as you
 received it, or any part of it, contains a notice stating that it is
 governed by this License along with a term that is a further
 restriction, you may remove that term.  If a license document contains
 a further restriction but permits relicensing or conveying under this
 License, you may add to a covered work material governed by the terms
 of that license document, provided that the further restriction does
 not survive such relicensing or conveying.
  If you add terms to a covered work in accord with this section, you
 must place, in the relevant source files, a statement of the
 additional terms that apply to those files, or a notice indicating
 where to find the applicable terms.
  Additional terms, permissive or non-permissive, may be stated in the
 form of a separately written license, or stated as exceptions;
 the above requirements apply either way.
  8. Termination.
  You may not propagate or modify a covered work except as expressly
 provided under this License.  Any attempt otherwise to propagate or
 modify it is void, and will automatically terminate your rights under
 this License (including any patent licenses granted under the third
 paragraph of section 11).
  However, if you cease all violation of this License, then your
 license from a particular copyright holder is reinstated (a)
 provisionally, unless and until the copyright holder explicitly and
 finally terminates your license, and (b) permanently, if the copyright
 holder fails to notify you of the violation by some reasonable means
 prior to 60 days after the cessation.
  Moreover, your license from a particular copyright holder is
 reinstated permanently if the copyright holder notifies you of the
 violation by some reasonable means, this is the first time you have
 received notice of violation of this License (for any work) from that
 copyright holder, and you cure the violation prior to 30 days after
 your receipt of the notice.
  Termination of your rights under this section does not terminate the
 licenses of parties who have received copies or rights from you under
 this License.  If your rights have been terminated and not permanently
 reinstated, you do not qualify to receive new licenses for the same
 material under section 10.
  9. Acceptance Not Required for Having Copies.
  You are not required to accept this License in order to receive or
 run a copy of the Program.  Ancillary propagation of a covered work
 occurring solely as a consequence of using peer-to-peer transmission
 to receive a copy likewise does not require acceptance.  However,
 nothing other than this License grants you permission to propagate or
 modify any covered work.  These actions infringe copyright if you do
 not accept this License.  Therefore, by modifying or propagating a
 covered work, you indicate your acceptance of this License to do so.
  10. Automatic Licensing of Downstream Recipients.
  Each time you convey a covered work, the recipient automatically
 receives a license from the original licensors, to run, modify and
 propagate that work, subject to this License.  You are not responsible
 for enforcing compliance by third parties with this License.
  An "entity transaction" is a transaction transferring control of an
 organization, or substantially all assets of one, or subdividing an
 organization, or merging organizations.  If propagation of a covered
 work results from an entity transaction, each party to that
 transaction who receives a copy of the work also receives whatever
 licenses to the work the party's predecessor in interest had or could
 give under the previous paragraph, plus a right to possession of the
 Corresponding Source of the work from the predecessor in interest, if
 the predecessor has it or can get it with reasonable efforts.
  You may not impose any further restrictions on the exercise of the
 rights granted or affirmed under this License.  For example, you may
 not impose a license fee, royalty, or other charge for exercise of
 rights granted under this License, and you may not initiate litigation
 (including a cross-claim or counterclaim in a lawsuit) alleging that
 any patent claim is infringed by making, using, selling, offering for
 sale, or importing the Program or any portion of it.
  11. Patents.
  A "contributor" is a copyright holder who authorizes use under this
 License of the Program or a work on which the Program is based.  The
 work thus licensed is called the contributor's "contributor version".
  A contributor's "essential patent claims" are all patent claims
 owned or controlled by the contributor, whether already acquired or
 hereafter acquired, that would be infringed by some manner, permitted
 by this License, of making, using, or selling its contributor version,
 but do not include claims that would be infringed only as a
 consequence of further modification of the contributor version.  For
 purposes of this definition, "control" includes the right to grant
 patent sublicenses in a manner consistent with the requirements of
 this License.
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
+  7. If, as a consequence of a court judgment or allegation of patent
-patent license under the contributor's essential patent claims, to
+infringement or for any other reason (not limited to patent issues),
-make, use, sell, offer for sale, import and otherwise run, modify and
+conditions are imposed on you (whether by court order, agreement or
 propagate the contents of its contributor version.
  In the following three paragraphs, a "patent license" is any express
 agreement or commitment, however denominated, not to enforce a patent
 (such as an express permission to practice a patent or covenant not to
 sue for patent infringement).  To "grant" such a patent license to a
 party means to make such an agreement or commitment not to enforce a
 patent against the party.
  If you convey a covered work, knowingly relying on a patent license,
 and the Corresponding Source of the work is not available for anyone
 to copy, free of charge and under the terms of this License, through a
 publicly available network server or other readily accessible means,
 then you must either (1) cause the Corresponding Source to be so
 available, or (2) arrange to deprive yourself of the benefit of the
 patent license for this particular work, or (3) arrange, in a manner
 consistent with the requirements of this License, to extend the patent
 license to downstream recipients.  "Knowingly relying" means you have
 actual knowledge that, but for the patent license, your conveying the
 covered work in a country, or your recipient's use of the covered work
 in a country, would infringe one or more identifiable patents in that
 country that you have reason to believe are valid.
  If, pursuant to or in connection with a single transaction or
 arrangement, you convey, or propagate by procuring conveyance of, a
 covered work, and grant a patent license to some of the parties
 receiving the covered work authorizing them to use, propagate, modify
 or convey a specific copy of the covered work, then the patent license
 you grant is automatically extended to all recipients of the covered
 work and works based on it.
  A patent license is "discriminatory" if it does not include within
 the scope of its coverage, prohibits the exercise of, or is
 conditioned on the non-exercise of one or more of the rights that are
 specifically granted under this License.  You may not convey a covered
 work if you are a party to an arrangement with a third party that is
 in the business of distributing software, under which you make payment
 to the third party based on the extent of your activity of conveying
 the work, and under which the third party grants, to any of the
 parties who would receive the covered work from you, a discriminatory
 patent license (a) in connection with copies of the covered work
 conveyed by you (or copies made from those copies), or (b) primarily
 for and in connection with specific products or compilations that
 contain the covered work, unless you entered into that arrangement,
 or that patent license was granted, prior to 28 March 2007.
  Nothing in this License shall be construed as excluding or limiting
 any implied license or other defenses to infringement that may
 otherwise be available to you under applicable patent law.
  12. No Surrender of Others' Freedom.
  If conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
+excuse you from the conditions of this License.  If you cannot
-covered work so as to satisfy simultaneously your obligations under this
+distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
+License and any other pertinent obligations, then as a consequence you
-not convey it at all.  For example, if you agree to terms that obligate you
+may not distribute the Program at all.  For example, if a patent
-to collect a royalty for further conveying from those to whom you convey
+license would not permit royalty-free redistribution of the Program by
-the Program, the only way you could satisfy both those terms and this
+all those who receive copies directly or indirectly through you, then
-License would be to refrain entirely from conveying the Program.
+the only way you could satisfy both it and this License would be to
 refrain entirely from distribution of the Program.
-  13. Use with the GNU Affero General Public License.
+If any portion of this section is held invalid or unenforceable under
 any particular circumstance, the balance of the section is intended to
 apply and the section as a whole is intended to apply in other
 circumstances.
-  Notwithstanding any other provision of this License, you have
+It is not the purpose of this section to induce you to infringe any
-permission to link or combine any covered work with a work licensed
+patents or other property right claims or to contest validity of any
-under version 3 of the GNU Affero General Public License into a single
+such claims; this section has the sole purpose of protecting the
-combined work, and to convey the resulting work.  The terms of this
+integrity of the free software distribution system, which is
-License will continue to apply to the part which is the covered work,
+implemented by public license practices.  Many people have made
-but the special requirements of the GNU Affero General Public License,
+generous contributions to the wide range of software distributed
-section 13, concerning interaction through a network will apply to the
+through that system in reliance on consistent application of that
-combination as such.
+system; it is up to the author/donor to decide if he or she is willing
 to distribute software through any other system and a licensee cannot
 impose that choice.
-  14. Revised Versions of this License.
+This section is intended to make thoroughly clear what is believed to
 be a consequence of the rest of this License.
-  The Free Software Foundation may publish revised and/or new versions of
+  8. If the distribution and/or use of the Program is restricted in
-the GNU General Public License from time to time.  Such new versions will
+certain countries either by patents or by copyrighted interfaces, the
 original copyright holder who places the Program under this License
 may add an explicit geographical distribution limitation excluding
 those countries, so that distribution is permitted only in or among
 countries not thus excluded.  In such case, this License incorporates
 the limitation as if written in the body of this License.
  9. The Free Software Foundation may publish revised and/or new versions
 of the General Public License from time to time.  Such new versions will
 be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.
-  Each version is given a distinguishing version number.  If the
+Each version is given a distinguishing version number.  If the Program
-Program specifies that a certain numbered version of the GNU General
+specifies a version number of this License which applies to it and "any
-Public License "or any later version" applies to it, you have the
+later version", you have the option of following the terms and conditions
-option of following the terms and conditions either of that numbered
+either of that version or of any later version published by the Free
-version or of any later version published by the Free Software
+Software Foundation.  If the Program does not specify a version number of
-Foundation.  If the Program does not specify a version number of the
+this License, you may choose any version ever published by the Free Software
-GNU General Public License, you may choose any version ever published
+Foundation.
 by the Free Software Foundation.
-  If the Program specifies that a proxy can decide which future
+  10. If you wish to incorporate parts of the Program into other free
-versions of the GNU General Public License can be used, that proxy's
+programs whose distribution conditions are different, write to the author
-public statement of acceptance of a version permanently authorizes you
+to ask for permission.  For software which is copyrighted by the Free
-to choose that version for the Program.
+Software Foundation, write to the Free Software Foundation; we sometimes
 make exceptions for this.  Our decision will be guided by the two goals
 of preserving the free status of all derivatives of our free software and
 of promoting the sharing and reuse of software generally.
-  Later license versions may give you additional or different
+                            NO WARRANTY
 permissions.  However, no additional obligations are imposed on any
 author or copyright holder as a result of your choosing to follow a
 later version.
-  15. Disclaimer of Warranty.
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
 OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
 PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
 OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
 TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
 PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
 REPAIR OR CORRECTION.
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
-
+POSSIBILITY OF SUCH DAMAGES.
  16. Limitation of Liability.
  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
 THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
 GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
 USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
 DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
 PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
 EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGES.
  17. Interpretation of Sections 15 and 16.
  If the disclaimer of warranty and limitation of liability provided
 above cannot be given local legal effect according to their terms,
 reviewing courts shall apply local law that most closely approximates
 an absolute waiver of all civil liability in connection with the
 Program, unless a warranty or assumption of liability accompanies a
 copy of the Program in return for a fee.
                     END OF TERMS AND CONDITIONS
@ -628,15 +287,15 @@ free software which everyone can redistribute and change under these terms.
  To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
+convey the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>
-    This program is free software: you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
@ -644,31 +303,38 @@ the "copyright" line and a pointer to where the full notice is found.
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+    You should have received a copy of the GNU General Public License along
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 Also add information on how to contact you by electronic and paper mail.
-  If the program does terminal interaction, make it output a short
+If the program is interactive, make it output a short notice like this
-notice like this when it starts in an interactive mode:
+when it starts in an interactive mode:
-    <program>  Copyright (C) <year>  <name of author>
+    Gnomovision version 69, Copyright (C) year name of author
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.
 The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
+parts of the General Public License.  Of course, the commands you use may
-might be different; for a GUI interface, you would use an "about box".
+be called something other than `show w' and `show c'; they could even be
 mouse-clicks or menu items--whatever suits your program.
-  You should also get your employer (if you work as a programmer) or school,
+You should also get your employer (if you work as a programmer) or your
-if any, to sign a "copyright disclaimer" for the program, if necessary.
+school, if any, to sign a "copyright disclaimer" for the program, if
-For more information on this, and how to apply and follow the GNU GPL, see
+necessary.  Here is a sample; alter the names:
-<http://www.gnu.org/licenses/>.
+
  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
  `Gnomovision' (which makes passes at compilers) written by James Hacker.
  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice
 This General Public License does not permit incorporating your program into
 proprietary programs.  If your program is a subroutine library, you may
 consider it more useful to permit linking proprietary applications with the
 library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.
  The GNU General Public License does not permit incorporating your program
 into proprietary programs.  If your program is a subroutine library, you
 may consider it more useful to permit linking proprietary applications with
 the library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.  But first, please read
 <http://www.gnu.org/philosophy/why-not-lgpl.html>.
--- a/10
+++ b/10
@ -1,7 +1,7 @@
-GNU GENERAL PUBLIC LICENSE
+                   GNU GENERAL PUBLIC LICENSE
                       Version 2, June 1991
- Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
@ -290,8 +290,8 @@ to attach them to the start of each source file to most effectively
 convey the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
-    {description}
+    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) {year}  {fullname}
+    Copyright (C) <year>  <name of author>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -329,7 +329,7 @@ necessary.  Here is a sample; alter the names:
  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
  `Gnomovision' (which makes passes at compilers) written by James Hacker.
-  {signature of Ty Coon}, 1 April 1989
+  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice
 This General Public License does not permit incorporating your program into
--- a/Makefile.am
+++ b/Makefile.am
@ -1,5 +1,10 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
+SUBDIRS = lib benchmarks tests
 SUBDIRS = lib tests benchmarks
-filelist: $(SUBDIRS)
+.PHONY: tests
 tests: all
 	$(MAKE) -C tests tests
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
--- a/44
+++ b/44
@ -1,44 +0,0 @@
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 array indices to both MPI tasks and SIMD processing elements.
 * Identically shaped arrays then be processed with perfect data parallelisation.
 * Such identically shapped arrays are called conformable arrays.
 The transformation is based on the observation that Cartesian array processing involves
 identical processing to be performed on different regions of the Cartesian array.
 The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes.
 Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
 optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 for most programmers.
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
 Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
 These are presented as 
  vRealF, vRealD, vComplexF, vComplexD 
 internal vector data types. These may be useful in themselves for other programmers.
 The corresponding scalar types are named
  RealF, RealD, ComplexF, ComplexD
 MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library.
   You can give `configure' initial values for configuration parameters
 by setting variables in the command line or in the environment.  Here
 is are examples:
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
--- a/1
+++ b/1
@ -0,0 +1 @@
 README.md
--- a/README.md
+++ b/README.md
@ -1,13 +1,51 @@
 # Grid
-Data parallel C++ mathematical object library
+<table>
 <tr>
    <td>Last stable release</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
    </td>
 </tr>
 <tr>
    <td>Development branch</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
    </td>
 </tr>
 </table>
 **Data parallel C++ mathematical object library.**
 License: GPL v2.
 Last update Nov 2016.
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 ### Bug report
 _To help us track and resolve issues with Grid more efficiently, please report problems using the issue system of GitHub rather than sending emails to Grid developers._
 When you file an issue, please go through the following checklist:
 1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number. 
 2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output of the `--version` option of your compiler.
 3. Give the exact `configure` command used.
 4. Attach `config.log`.
 5. Attach `config.summary`.
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 ### Description
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 array indices to both MPI tasks and SIMD processing elements.
 * Identically shaped arrays can then be processed with perfect data parallelisation.
-* Such identically shapped arrays are called conformable arrays.
+* Such identically shaped arrays are called conformable arrays.
 The transformation is based on the observation that Cartesian array processing involves
 identical processing to be performed on different regions of the Cartesian array.
@ -20,31 +58,136 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
+Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
-These are presented as 
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
-
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
  vRealF, vRealD, vComplexF, vComplexD 
 internal vector data types. These may be useful in themselves for other programmers.
 The corresponding scalar types are named
  RealF, RealD, ComplexF, ComplexD
 MPI, OpenMP, and SIMD parallelism are present in the library.
 Please see https://arxiv.org/abs/1512.03487 for more detail.
-   You can give `configure' initial values for configuration parameters
+### Quick start
-by setting variables in the command line or in the environment.  Here
+First, start by cloning the repository:
 are examples:
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
+``` bash
 git clone https://github.com/paboyle/Grid.git
 ```
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
+Then enter the cloned directory and set up the build system:
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
+``` bash
 cd Grid
 ./bootstrap.sh
 ```
-     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
+Now you can execute the `configure` script to generate makefiles (here from a build directory):
 ``` bash
 mkdir build; cd build
 ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```
-For developers:
+where `--enable-precision=` sets the default precision,
-Use reconfigure_script in the scripts/ directory to create the autotools environment 
+`--enable-simd=` sets the SIMD type, `--enable-comms=` sets the communication interface,
 and `<path>` should be replaced by the prefix path where you want to
 install Grid. Other options are detailed in the next section, you can also use `configure
 --help` to display them. As with any other program using GNU autotools, the
 `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
 customise the build.
 Finally, you can build and install Grid:
 ``` bash
 make; make install
 ```
 To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
 ``` bash
 make -C tests/<subdir> tests
 ```
 If you want to build all the tests at once just use `make tests`.
 ### Build configuration options
 - `--prefix=<path>`: installation prefix for Grid.
 - `--with-gmp=<path>`: look for GMP in the UNIX prefix `<path>`
 - `--with-mpfr=<path>`: look for MPFR in the UNIX prefix `<path>`
 - `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
 - `--enable-lapack[=<path>]`: enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
 - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
 - `--enable-numa`: ???
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-precision={single|double}`: set the default precision (default: `double`).
 - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
 - `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48`).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
 ### Possible communication interfaces
 The following options can be used with the `--enable-comms=` option to target different communication interfaces:
 | `<comm>`       | Description                                                   |
 | -------------- | ------------------------------------------------------------- |
 | `none`         | no communications                                             |
 | `mpi[-auto]`   | MPI communications                                            |
 | `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
 | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem`        | Cray SHMEM communications                                     |
 For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names).
 ### Possible SIMD types
 The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
 | `GEN`       | generic portable vector code           |
 | `SSE4`      | SSE 4.2 (128 bit)                      |
 | `AVX`       | AVX (256 bit)                          |
 | `AVXFMA`    | AVX (256 bit) + FMA                    |
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
 | `QPX`       | QPX (256 bit)                          |
 Alternatively, some CPU codenames can be directly used:
 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
 | `KNC`       | [Intel Xeon Phi codename Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
 | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
 | `BGQ`       | Blue Gene/Q                            |
 #### Notes:
 - We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang is more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performance is currently rather poor. This is being investigated for future versions.
 ### Build setup for Intel Knights Landing platform
 The following configuration is recommended for the Intel Knights Landing platform:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=KNL        \
             --enable-comms=mpi-auto \
             --with-gmp=<path>        \
             --with-mpfr=<path>       \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=KNL        \
             --enable-comms=mpi       \
             --with-gmp=<path>        \
             --with-mpfr=<path>       \
             --enable-mkl             \
             CXX=CC CC=cc
 ```
--- a/22
+++ b/22
@ -1,5 +1,27 @@
 TODO:
 ---------------
 * Forces; the UdSdU  term in gauge force term is half of what I think it should
  be. This is a consequence of taking ONLY the first term in:
  dSg/dt = dU/dt dSdU + dUdag/dt dSdUdag
  in the fermion force.
  Now, S_mom = - tr Pmu Pmu      ; Pmu anti-herm
                                  .
       d Smom/dt = - 2.0 tr Pmu Pmu   = - dSg/dt = - tr Pmu [Umu dSdUmu + UmuDag dSdUmuDag]
           .
       => Pmu =  Umu dSdUmu
       Where the norm is half expected.
  This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two.
  This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.
 Policies:
 * Link smearing/boundary conds; Policy class based implementation ; framework more in place
--- a/6
+++ b/6
@ -0,0 +1,6 @@
 Version : 0.6.0
 - AVX512, AVX2, AVX, SSE good
 - Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
 - MPI and MPI3
 - HiRep, Smearing, Generic gauge group
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@ -1,4 +1,31 @@
-#include <Grid.h>
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_comms.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
@ -15,15 +42,14 @@ int main (int argc, char ** argv)
  int Nloop=10;
  int nmu=0;
-  for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
+  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-
+  int maxlat=16;
-
+  for(int lat=4;lat<=maxlat;lat+=2){
  for(int lat=4;lat<=32;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
@ -98,7 +124,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-  for(int lat=4;lat<=32;lat+=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){
      std::vector<int> latt_size  ({lat,lat,lat,lat});
@ -168,6 +194,168 @@ int main (int argc, char ** argv)
  }  
  Nloop=100;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	  if (mpi_layout[mu]>1 ) {
 	    ncomm++;
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.StencilSendToRecvFromBegin(requests,
 					    (void *)&xbuf[mu][0],
 					    xmit_to_rank,
 					    (void *)&rbuf[mu][0],
 					    recv_from_rank,
 					    bytes);
 	    comm_proc = mpi_layout[mu]-1;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.StencilSendToRecvFromBegin(requests,
 					    (void *)&xbuf[mu+4][0],
 					    xmit_to_rank,
 					    (void *)&rbuf[mu+4][0],
 					    recv_from_rank,
 					    bytes);
 	  }
 	}
 	Grid.StencilSendToRecvFromComplete(requests);
 	Grid.Barrier();
      }
      double stop=usecond();
      double dbytes    = bytes;
      double xbytes    = Nloop*dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;
      double time = stop-start; // microseconds
      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    
  Nloop=100;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
 	  if (mpi_layout[mu]>1 ) {
 	    ncomm++;
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.StencilSendToRecvFromBegin(requests,
 					    (void *)&xbuf[mu][0],
 					    xmit_to_rank,
 					    (void *)&rbuf[mu][0],
 					    recv_from_rank,
 					    bytes);
 	    //	    Grid.StencilSendToRecvFromComplete(requests);
 	    //	    requests.resize(0);
 	    comm_proc = mpi_layout[mu]-1;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.StencilSendToRecvFromBegin(requests,
 					    (void *)&xbuf[mu+4][0],
 					    xmit_to_rank,
 					    (void *)&rbuf[mu+4][0],
 					    recv_from_rank,
 					    bytes);
 	    Grid.StencilSendToRecvFromComplete(requests);
 	    requests.resize(0);
 	  }
 	}
 	Grid.Barrier();
      }
      double stop=usecond();
      double dbytes    = bytes;
      double xbytes    = Nloop*dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;
      double time = stop-start; // microseconds
      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -1,4 +1,32 @@
-#include <Grid.h>
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_dwf.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
@ -16,6 +44,11 @@ struct scal {
    Gamma::GammaT
  };
 typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@ -24,12 +57,18 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
@ -42,9 +81,9 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
-  ColourMatrix cm = Complex(1.0,0.0);
+  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);
  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
@ -79,16 +118,36 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  int ncall=10000;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
-  {
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  int ncall =100;
  if (1) {
    FGrid->Barrier();
    Dw.ZeroCounters();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      Dw.Dhop(src,result,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
@ -97,11 +156,165 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    assert (norm2(err)< 1.0e-4 );
    Dw.Report();
  }
  if (1)
  {
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
    std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
    if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
    if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
    LatticeFermion sresult(sFGrid);
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
    for(int z=0;z<latt4[2];z++){
    for(int t=0;t<latt4[3];t++){
    for(int s=0;s<Ls;s++){
      std::vector<int> site({s,x,y,z,t});
      SpinColourVector tmp;
      peekSite(tmp,src,site);
      pokeSite(tmp,ssrc,site);
    }}}}}
    std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
    FGrid->Barrier();
    double t0=usecond();
    sDw.ZeroCounters();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      sDw.Dhop(ssrc,sresult,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    sDw.Report();
    if(0){
      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 	sDw.Dhop(ssrc,sresult,0);
 	PerformanceCounter Counter(i);
 	Counter.Start();
 	sDw.Dhop(ssrc,sresult,0);
 	Counter.Stop();
 	Counter.Report();
      }
    }
    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
    RealD sum=0;
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
    for(int z=0;z<latt4[2];z++){
    for(int t=0;t<latt4[3];t++){
    for(int s=0;s<Ls;s++){
      std::vector<int> site({s,x,y,z,t});
      SpinColourVector normal, simd;
      peekSite(normal,result,site);
      peekSite(simd,sresult,site);
      sum=sum+norm2(normal-simd);
      if (norm2(normal-simd) > 1.0e-6 ) {
 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
      }
    }}}}}
    std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
    assert (sum< 1.0e-4 );
    if (1) {
      LatticeFermion sr_eo(sFGrid);
      LatticeFermion ssrc_e (sFrbGrid);
      LatticeFermion ssrc_o (sFrbGrid);
      LatticeFermion sr_e   (sFrbGrid);
      LatticeFermion sr_o   (sFrbGrid);
      pickCheckerboard(Even,ssrc_e,ssrc);
      pickCheckerboard(Odd,ssrc_o,ssrc);
      setCheckerboard(sr_eo,ssrc_o);
      setCheckerboard(sr_eo,ssrc_e);
      sr_e = zero;
      sr_o = zero;
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
      FGrid->Barrier();
      sDw.ZeroCounters();
      sDw.stat.init("DhopEO");
      double t0=usecond();
      for (int i = 0; i < ncall; i++) {
        sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
      }
      double t1=usecond();
      FGrid->Barrier();
      sDw.stat.print();
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
      sDw.Report();
      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
      pickCheckerboard(Even,ssrc_e,sresult);
      pickCheckerboard(Odd ,ssrc_o,sresult);
      ssrc_e = ssrc_e - sr_e;
      RealD error = norm2(ssrc_e);
      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
      ssrc_o = ssrc_o - sr_o;
      error+= norm2(ssrc_o);
      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
      if(error>1.0e-4) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
 	std::cout<< ssrc << std::endl;
      }
    }
  }
  if (1)
  { // Naive wilson dag implementation
@ -111,24 +324,25 @@ int main (int argc, char ** argv)
      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
      tmp = U[mu]*Cshift(src,mu+1,1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+  ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+  ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-
+  assert(norm2(err)<1.0e-4);
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
@ -136,26 +350,39 @@ int main (int argc, char ** argv)
  LatticeFermion r_eo  (FGrid);
-  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO                "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
    Dw.ZeroCounters();
    FGrid->Barrier();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();
    FGrid->Barrier();
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=(1344.0*volume*ncall)/2;
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    Dw.Report();
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
  Dw.Dhop  (src  ,result,DaggerNo);
@ -169,11 +396,14 @@ int main (int argc, char ** argv)
  err = r_eo-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  assert(norm2(err)<1.0e-4);
  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
  assert(norm2(src_e)<1.0e-4);
  assert(norm2(src_o)<1.0e-4);
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@ -0,0 +1,366 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_dwf_sweep.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Minimal wrapper holding a single value of the template type.
 template<typename Scalar> struct scal
 {
   Scalar internal;
 };
  // Gamma matrices in X, Y, Z, T order; indexed by the direction mu in benchDw().
  Gamma::GammaMatrix Gmu [] = { Gamma::GammaX, Gamma::GammaY, Gamma::GammaZ, Gamma::GammaT };
 // Forward declarations of the two benchmark drivers defined after main().
 //   L       - four-dimensional lattice extents
 //   Ls      - extent of the fifth dimension
 //   threads - thread count passed by the caller
 //   report  - non-zero: print PerformanceCounter reports instead of the
 //             Mflop/s table columns (defaults to 0)
 void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
 void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
 // Sweep driver for the domain wall benchmarks.
 //
 // For each starting extent L = 8, 16, ... up to LMAX a 4d lattice of size L^4
 // is benchmarked, then re-benchmarked after doubling directions 3, 2, ...
 // one at a time (the inner loop runs d = 4 down to DMIN+1 and doubles
 // latt4[d] whenever d <= 3).  Each pass prints one table row via benchDw()
 // and benchsDw().  Finally both drivers are run once on a 16^4 lattice with
 // report=1 so that they print their performance counter reports.
 //
 // Environment variables:
 //   LMAX - largest starting extent of the sweep (default 16)
 //   DMIN - lower bound (exclusive) of the direction loop (default 2);
 //          values below -1 are clamped to -1
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  const int Ls=8;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  int Lmax=16;
  int dmin=2;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  // d is used as an index into latt4 below; without this clamp a DMIN
  // smaller than -1 would let d reach -1 and write outside the vector.
  if ( dmin < -1 ) dmin = -1;
  for (int L=8;L<=Lmax;L*=2){
    std::vector<int> latt4(4,L);
    for(int d=4;d>dmin;d--){
      // d==4 benchmarks the undoubled L^4 lattice; each later pass doubles one more direction
      if ( d<=3 ) latt4[d]*=2;
      std::cout << GridLogMessage <<"\t";
      // print the 4d extents; the index is named mu so it no longer shadows the outer d
      for(int mu=0;mu<Nd;mu++){
        std::cout<<latt4[mu]<<"x";
      }
      std::cout <<Ls<<"\t" ;
      benchDw (latt4,Ls,threads,0);
      benchsDw(latt4,Ls,threads,0);
      std::cout<<std::endl;
    }
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  {
    // fixed-size run with report=1: the drivers print counter reports instead of table columns
    std::vector<int> latt4(4,16);
    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
    benchDw (latt4,Ls,threads,1);
    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
    benchsDw(latt4,Ls,threads,1);
  }
  Grid_finalize();
 }
 #undef CHECK
 // Benchmarks DomainWallFermionR::Dhop and DhopEO on a lattice of 4d size
 // latt4 with fifth-dimension extent Ls.
 //
 //   latt4   - four-dimensional lattice extents
 //   Ls      - extent of the fifth dimension
 //   threads - not referenced in this function
 //   report  - non-zero: print the PerformanceCounter report for the Dhop
 //             loop and no rates; zero: print the processor count, the Dhop
 //             rate and the DhopEO rate as tab-separated columns
 //
 // The reference-implementation comparison is only compiled when CHECK is
 // defined; CHECK is #undef'd immediately before this function.
 void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 {
  // NOTE(review): the four grid pointers below are never released in this
  // function; if the factories allocate, every call leaks them - confirm.
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
 #ifdef CHECK 
  // checking build: random source and gauge field from fixed seeds
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);
 #else 
  // timing-only build: source and gauge field are set to zero
  LatticeFermion src   (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
 #endif
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion    ref(FGrid);    ref=zero;
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  // cm is not referenced again in this function
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }
  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  // per-direction link fields extracted from the replicated gauge field
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }
 #ifdef CHECK
  // reference result built from Cshift and Gamma; compared with Dw.Dhop below
  if (1) {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }
 #endif
  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  // time a single call; the elapsed time sets the repeat count below
  double t0=usecond();
  Dw.Dhop(src,result,0);
  double t1=usecond();
 #ifdef TIMERS_OFF
    int ncall =10;
 #else
  // presumably usecond() returns microseconds, making this a ~5 second budget - TODO confirm
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 #endif
  // terminates the whole program (status 0) when fewer than 5 calls were derived
  if (ncall < 5 ) exit(0);
  // one more untimed call before the measured loop
  Dw.Dhop(src,result,0);
  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
  }
  t1=usecond();
  Counter.Stop();
  if ( report ) {
    Counter.Report();
  }
  if ( ! report ) {
    // volume = Ls * product of the 4d extents; 1344 is the per-site flop
    // count used for one Dhop application (presumably per 5d site)
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    // columns: processor count, then flops per unit of usecond() time
    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
  }
 #ifdef CHECK
  // abort with status -1 if Dhop disagrees with the reference implementation
  err = ref-result; 
  RealD errd = norm2(err);
  if ( errd> 1.0e-4 ) {
    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
    exit(-1);
  }
 #endif
  // checkerboarded fields for the DhopEO benchmark
  // (r_o and r_eo are declared but not referenced again in this function)
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
  LatticeFermion r_o   (FrbGrid);
  LatticeFermion r_eo  (FGrid);
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  {
    // one untimed call, then ncall timed calls of DhopEO
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();
    if(!report){
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      // half the Dhop flop count is charged per DhopEO call
      double flops=(1344.0*volume*ncall)/2;
      std::cout<< flops/(t1-t0);
    }
  }
 }
 #define CHECK_SDW
 // Benchmarks Dhop and DhopEO of WilsonFermion5D<DomainWallVec5dImplR> on
 // grids built with the "DWF" grid factories (sUGrid, sFGrid, ...).
 //
 //   latt4   - four-dimensional lattice extents
 //   Ls      - extent of the fifth dimension
 //   threads - not referenced in this function
 //   report  - non-zero: print the PerformanceCounter reports; zero: print
 //             the Dhop rate and the DhopEO rate, each preceded by a tab
 //
 // CHECK_SDW is #defined immediately before this function, so the random
 // source / gauge field branch is the one compiled.
 // NOTE(review): no comparison against a reference result is visible in this
 // function; ref, tmp, err, cm, Umu5d, sref and serr are not used beyond
 // their initialisation - confirm whether a check was intended.
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
  // NOTE(review): none of the grid pointers below is released in this
  // function; if the factories allocate, every call leaks them - confirm.
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // second set of grids, produced by the DWF-specific factories
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
 #ifdef CHECK_SDW
  // random source and gauge field from fixed seeds
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);
 #else 
  // source and gauge field set to zero
  LatticeFermion src   (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
 #endif
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion    ref(FGrid);    ref=zero;
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }
  RealD mass=0.1;
  RealD M5  =1.8;
  typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
  LatticeFermion ssrc(sFGrid);
  LatticeFermion sref(sFGrid);
  LatticeFermion sresult(sFGrid);
  // operator under test; constructed from the 4d gauge field Umu and the DWF grids
  WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
  // transfer src (on FGrid) into ssrc (on sFGrid) one site at a time with
  // peekSite/pokeSite; site coordinates are ordered (s,x,y,z,t)
  for(int x=0;x<latt4[0];x++){
  for(int y=0;y<latt4[1];y++){
  for(int z=0;z<latt4[2];z++){
  for(int t=0;t<latt4[3];t++){
  for(int s=0;s<Ls;s++){
    std::vector<int> site({s,x,y,z,t});
    // this local tmp shadows the LatticeFermion tmp declared above
    SpinColourVector tmp;
    peekSite(tmp,src,site);
    pokeSite(tmp,ssrc,site);
  }}}}}
  // time a single call; the elapsed time sets the repeat count below
  double t0=usecond();
  sDw.Dhop(ssrc,sresult,0);
  double t1=usecond();
 #ifdef TIMERS_OFF
  int ncall =10;
 #else 
  // presumably usecond() returns microseconds, making this a ~5 second budget - TODO confirm
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 #endif
  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    sDw.Dhop(ssrc,sresult,0);
  }
  t1=usecond();
  Counter.Stop();
  if ( report ) {
    Counter.Report();
  } else { 
    // volume = Ls * product of the 4d extents; 1344 is the per-site flop
    // count used for one Dhop application (presumably per 5d site)
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }
  // checkerboarded fields for the DhopEO benchmark
  LatticeFermion sr_eo(sFGrid);
  LatticeFermion serr(sFGrid);
  LatticeFermion ssrc_e (sFrbGrid);
  LatticeFermion ssrc_o (sFrbGrid);
  LatticeFermion sr_e   (sFrbGrid);
  LatticeFermion sr_o   (sFrbGrid);
  pickCheckerboard(Even,ssrc_e,ssrc);
  pickCheckerboard(Odd,ssrc_o,ssrc);
  // both checkerboards of the source are written into sr_eo; sr_eo is not read afterwards
  setCheckerboard(sr_eo,ssrc_o);
  setCheckerboard(sr_eo,ssrc_e);
  sr_e = zero;
  sr_o = zero;
  // one untimed call before the measured loop
  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
  PerformanceCounter CounterSdw(8);
  CounterSdw.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    // each timed call is bracketed by the __SSC_START / __SSC_STOP macros
    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    __SSC_STOP;
  }
  t1=usecond();
  CounterSdw.Stop();
  if ( report ) { 
    CounterSdw.Report();
  } else {
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    // half the Dhop flop count is charged per DhopEO call
    double flops=(1344.0*volume*ncall)/2;
    std::cout<<"\t"<< flops/(t1-t0);
  }
 }
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@ -1,4 +1,32 @@
-#include <Grid.h>
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_memory_asynch.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@ -1,4 +1,32 @@
-#include <Grid.h>
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@ -1,4 +1,32 @@
-#include <Grid.h>
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_su3.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@ -1,4 +1,32 @@
-#include <Grid.h>
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_wilson.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
@ -16,10 +44,15 @@ struct scal {
    Gamma::GammaT
  };
 bool overlapComms = false;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@ -57,11 +90,12 @@ int main (int argc, char ** argv)
  Complex cone(1.0,0.0);
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
-    if(0) {
+    if(1) {
-      if (nn==-1) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
+      if (nn!=2) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
-      else       { U[nn] = cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
+      //      else       { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
      else       { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
    }
-    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
+    PokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
 #endif
@ -87,7 +121,11 @@ int main (int argc, char ** argv)
  }
  ref = -0.5*ref;
  RealD mass=0.1;
-  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
+
  typename WilsonFermionR::ImplParams params; 
  params.overlapCommsCompute = overlapComms;
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=1000;
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@ -0,0 +1,130 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_wilson_sweep.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Richard Rollins <rprollins@users.noreply.github.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Minimal wrapper holding a single value of the template type.
 template<typename Scalar> struct scal
 {
   Scalar internal;
 };
 // Gamma matrices listed in X, Y, Z, T order.
 Gamma::GammaMatrix Gmu [] = { Gamma::GammaX, Gamma::GammaY, Gamma::GammaZ, Gamma::GammaT };
 // Set to true by the --asynch command line option in main(), which copies
 // it into params.overlapCommsCompute before the operator is constructed.
 bool overlapComms = false;
 // Forward declaration of the timing helper defined after main().
 //   src    - input fermion field passed to Dw.Dhop
 //   result - output fermion field written by Dw.Dhop
 //   Dw     - Wilson operator to benchmark
 //   volume - number of lattice sites, used to scale the flop count
 //   dag    - dagger flag forwarded to Dw.Dhop (main passes DaggerNo / DaggerYes)
 void bench_wilson (
 		   LatticeFermion &    src,
 		   LatticeFermion & result,
 		   WilsonFermionR &     Dw,
 		   double const     volume,
 		   int const           dag );
 // Sweep driver for the Wilson Dhop benchmark.
 //
 // For each starting extent L = 8, 16, ... up to LMAX a lattice of size L^4
 // is benchmarked, then re-benchmarked after doubling directions 3, 2, ...
 // one at a time (the inner loop runs d = 4 down to DMIN+1 and doubles
 // latt_size[d] whenever d <= 3).  Each pass builds fresh grids, fields and
 // a WilsonFermionR, and prints one table row with the rates reported by
 // bench_wilson() for DaggerNo and DaggerYes.
 //
 // Command line:  --asynch sets params.overlapCommsCompute.
 // Environment variables:
 //   LMAX - largest starting extent of the sweep (default 32)
 //   DMIN - lower bound (exclusive) of the direction loop (default 0);
 //          values below -1 are clamped to -1
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }
  typename WilsonFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> seeds({1,2,3,4});
  RealD mass = 0.1;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  int Lmax = 32;
  int dmin = 0;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  // d is used as an index into latt_size below; without this clamp a DMIN
  // smaller than -1 would let d reach -1 and write outside the vector.
  if ( dmin < -1 ) dmin = -1;
  for (int L=8; L<=Lmax; L*=2)
    {
      std::vector<int> latt_size = std::vector<int>(4,L);
      for(int d=4; d>dmin; d--)
        {
          // d==4 benchmarks the undoubled L^4 lattice; each later pass doubles one more direction
          if ( d<=3 ) { latt_size[d] *= 2; }
          std::cout << GridLogMessage;
          // print the extents as AxBxCxD: all but the last followed by "x"
          std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator<int>( std::cout, "x" ) );
          std::cout << latt_size.back() << "\t\t";
          GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
          GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
          GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
          LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
          LatticeFermion    src(&Grid); random(pRNG,src);
          LatticeFermion result(&Grid); result=zero;
          // accumulate in double: with an int initial value the product is
          // formed in int and overflows once the site count exceeds INT_MAX
          double volume = std::accumulate(latt_size.begin(),latt_size.end(),1.0,std::multiplies<double>());
          WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
          bench_wilson(src,result,Dw,volume,DaggerNo);
          bench_wilson(src,result,Dw,volume,DaggerYes);
          std::cout << std::endl;
        }
    }
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  Grid_finalize();
 }
 // Calls Dw.Dhop(src,result,dag) 1000 times between two usecond() readings
 // and prints 1344 * volume * 1000 divided by the elapsed time, followed by
 // two tab characters.
 void bench_wilson (LatticeFermion & src, LatticeFermion & result, WilsonFermionR & Dw,
                    double const volume, int const dag )
 {
   int repeats = 1000;
   double started = usecond();
   int done = 0;
   while ( done < repeats ) {
     Dw.Dhop(src,result,dag);
     ++done;
   }
   double stopped = usecond();
   double total_flops = 1344 * volume * repeats;
   double elapsed = stopped - started;
   std::cout << total_flops/elapsed << "\t\t";
 }
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@ -1,27 +0,0 @@
 # Automake fragment pulled in by Makefile.am via "include Make.inc".
 # bin_PROGRAMS lists the benchmark executables; each one below is built
 # from the single source file of the same name and linked with -lGrid.
 bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
 Benchmark_comms_SOURCES=Benchmark_comms.cc
 Benchmark_comms_LDADD=-lGrid
 Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
 Benchmark_memory_bandwidth_LDADD=-lGrid
 Benchmark_su3_SOURCES=Benchmark_su3.cc
 Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@ -1,8 +1 @@
 # Compiler include path: headers under lib/ in the source tree
 AM_CXXFLAGS = -I$(top_srcdir)/lib
 # Linker search path: lib/ in the build tree (the programs link with -lGrid)
 AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Benchmark program list and per-program sources/libraries
 #
 include Make.inc
--- a/benchmarks/simple_su3_expr.cc
+++ b/benchmarks/simple_su3_expr.cc
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/simple_su3_expr.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 using namespace std;
--- a/benchmarks/simple_su3_test.cc
+++ b/benchmarks/simple_su3_test.cc
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/simple_su3_test.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 using namespace std;
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -0,0 +1,13 @@
 #!/usr/bin/env bash
 # Prepares a checkout for building: downloads and deploys the Eigen
 # sources, regenerates the Make.inc file lists and creates the configure
 # script.  The helper scripts are referenced as ./scripts/..., so run this
 # from the top-level directory of the repository.
 #
 # Stop at the first failing step; otherwise a failed download would go on
 # to unpack a missing tarball and still run autoreconf.
 set -e
 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 # Local file name of the tarball (last path component of the URL)
 EIGEN_TARBALL=$(basename "${EIGEN_URL}")
 echo "-- deploying Eigen source..."
 # -O pins the output name: without it wget saves to "<name>.1" when a stale
 # tarball is already present and the stale file would be the one unpacked.
 wget --no-check-certificate -O "${EIGEN_TARBALL}" "${EIGEN_URL}"
 ./scripts/update_eigen.sh "${EIGEN_TARBALL}"
 rm "${EIGEN_TARBALL}"
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
 autoreconf -fvi
--- a/8052
+++ b/8052
--- a/configure.ac
+++ b/configure.ac
@ -1,226 +1,405 @@
 #                         -*- Autoconf -*-
 # Process this file with autoconf to produce a configure script.
 #
 # Project Grid package  
 # 
 # Time-stamp: <2015-07-10 17:46:21 neo>
 AC_PREREQ([2.63])
-AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
+AC_INIT([Grid], [0.6.0], [https://github.com/paboyle/Grid], [Grid])
-AC_CANONICAL_SYSTEM
+AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
 AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-AC_MSG_NOTICE([
+############### Checks for programs
-
+CXXFLAGS="-O3 $CXXFLAGS"
 :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 Configuring $PACKAGE v$VERSION  for $host
 :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 ])
 # Checks for programs.
 AC_LANG(C++)
 AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
 #AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 AX_EXT
-# Checks for libraries.
+############### Get compiler information
-#AX_GCC_VAR_ATTRIBUTE(aligned)
+AC_LANG([C++])
 AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
 AX_COMPILER_VENDOR
 AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
      [vendor of C++ compiler that will compile the code])
 AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
      [version of g++ that will compile the code])
-# Checks for header files.
+############### Checks for typedefs, structures, and compiler characteristics
 AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T
 ############### OpenMP 
 AC_OPENMP
 ac_openmp=no
 if test "${OPENMP_CXXFLAGS}X" != "X"; then
  ac_openmp=yes
  AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
  AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
 fi
 ############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
 AC_CHECK_HEADERS(mm_malloc.h)
 AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
 AC_CHECK_HEADERS(gmp.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
-# Checks for typedefs, structures, and compiler characteristics.
+############### GMP and MPFR
-AC_TYPE_SIZE_T
+AC_ARG_WITH([gmp],
-AC_TYPE_UINT32_T
+    [AS_HELP_STRING([--with-gmp=prefix],
-AC_TYPE_UINT64_T
+    [try this for a non-standard install prefix of the GMP library])],
    [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
 AC_ARG_WITH([mpfr],
    [AS_HELP_STRING([--with-mpfr=prefix],
    [try this for a non-standard install prefix of the MPFR library])],
    [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
-# Checks for library functions.
+############### FFTW3 
-echo
+AC_ARG_WITH([fftw],    
-echo Checking libraries 
+            [AS_HELP_STRING([--with-fftw=prefix],
-echo :::::::::::::::::::::::::::::::::::::::::::
+            [try this for a non-standard install prefix of the FFTW3 library])],
            [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
            [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
 ############### lapack 
 # LAPACK support: --enable-lapack takes yes, no (the default) or an install prefix.
 AC_ARG_ENABLE([lapack],
    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
    [ac_LAPACK=${enable_lapack}],
    [ac_LAPACK=no])
 if test "${ac_LAPACK}" != "no"; then
    # Any value other than yes is treated as a prefix holding include/ and lib/.
    if test "${ac_LAPACK}" != "yes"; then
        AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
        AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
    fi
    AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 fi
 ############### MKL
 # Intel MKL support: --enable-mkl takes yes, no (the default) or an install prefix.
 AC_ARG_ENABLE([mkl],
    [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
    [ac_MKL=${enable_mkl}],
    [ac_MKL=no])
 if test "${ac_MKL}" != "no"; then
    # Any value other than yes is treated as a prefix holding include/ and lib/.
    if test "${ac_MKL}" != "yes"; then
        AM_CXXFLAGS="-I$ac_MKL/include $AM_CXXFLAGS"
        AM_LDFLAGS="-L$ac_MKL/lib $AM_LDFLAGS"
    fi
    AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL])
 fi
 ############### first-touch
 # First-touch NUMA option. AC_ARG_ENABLE stores the user's value in the shell
 # variable enable_numa (lower case, matching the feature name); reading any
 # other spelling yields an empty string, which would fall into the catch-all
 # arm below and define GRID_NUMA even for --disable-numa.
 AC_ARG_ENABLE([numa],
    [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])], 
    [ac_NUMA=${enable_numa}],[ac_NUMA=no])
 case ${ac_NUMA} in
    no)
        ;;
    yes)
        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
    *)
        # Any other value also enables the option; the value itself is not used.
        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
 esac
 ############### Checks for library functions
 CXXFLAGS_CPY=$CXXFLAGS
 LDFLAGS_CPY=$LDFLAGS
 CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
 LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 AC_CHECK_FUNCS([gettimeofday])
-#AC_CHECK_LIB([gmp],[__gmpf_init],,
+if test "${ac_MKL}x" != "nox"; then
-#        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
+    AC_SEARCH_LIBS([mkl_set_interface_layer], [mkl_rt], [],
-#Please install or provide the correct path to your installation
+                   [AC_MSG_ERROR("MKL enabled but library not found")])
-#Info at: http://www.gmplib.org)])
+fi
-#AC_CHECK_LIB([mpfr],[mpfr_init],,
+AC_SEARCH_LIBS([__gmpf_init], [gmp],
-#        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
+               [AC_SEARCH_LIBS([mpfr_init], [mpfr], 
-#Please install or provide the correct path to your installation
+                               [AC_DEFINE([HAVE_LIBMPFR], [1], 
-#Info at: http://www.mpfr.org/)])
+                                          [Define to 1 if you have the `MPFR' library])]
                               [have_mpfr=true], [AC_MSG_ERROR([MPFR library not found])])]
               [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library])]
               [have_gmp=true])
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
+if test "${ac_LAPACK}x" != "nox"; then
-	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
+    AC_SEARCH_LIBS([LAPACKE_sbdsdc], [lapack], [],
-	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
+                   [AC_MSG_ERROR("LAPACK enabled but library not found")])
 fi   
-supported=no
+AC_SEARCH_LIBS([fftw_execute], [fftw3],
               [AC_SEARCH_LIBS([fftwf_execute], [fftw3f], [],
                               [AC_MSG_ERROR("single precision FFTW library not found")])]
               [AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])]
               [have_fftw=true])
 CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY
 ############### SIMD instruction selection
 AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=<code>],
 	            [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
 case ${ax_cv_cxx_compiler_vendor} in
  clang|gnu)
    case ${ac_SIMD} in
      SSE4)
        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
        SIMD_FLAGS='-msse4.2';;
      AVX)
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx';;
      AVXFMA4)
        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
        SIMD_FLAGS='-mavx -mfma4';;
      AVXFMA)
        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
        SIMD_FLAGS='-mavx -mfma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-mavx2 -mfma';;
      AVX512)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
      KNC)
        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-march=knl';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
    esac;;
  intel)
    # Intel compiler: map the requested SIMD target onto its config define and
    # the compiler flags stored in SIMD_FLAGS. Unlisted targets are rejected.
    case ${ac_SIMD} in
      SSE4)
        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
        SIMD_FLAGS='-msse4.2 -xsse4.2';;
      AVX)
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx -xavx';;
      AVXFMA)
        # -mfma selects FMA3; the description matches the GCC/Clang arm.
        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
        SIMD_FLAGS='-mavx -mfma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
      AVX512)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-xcore-avx512';;
      KNC)
        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        SIMD_FLAGS='-xmic-avx512';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
    esac;;
  *)
    AC_MSG_WARN([Compiler unknown, using generic vector code])
    AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
 esac
 AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
 AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
 case ${ac_SIMD} in
-     SSE4)
+  AVX512|KNL)
-       echo Configuring for SSE4
+    AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
-       AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
+  *)
-       if test x"$ax_cv_support_ssse3_ext" = x"yes"; then  dnl minimal support for SSE4
+	;;
         supported=yes
       else
  	AC_MSG_WARN([Your processor does not support SSE4 instructions])
       fi
     ;;
     AVX)
       echo Configuring for AVX
       AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
       supported=yes			  
       else
       	AC_MSG_WARN([Your processor does not support AVX instructions])
       fi
     ;;
     AVXFMA4)
       echo Configuring for AVX
       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
       supported=yes			  
       else
       	AC_MSG_WARN([Your processor does not support AVX instructions])
       fi
     ;;
     AVX2)
       echo Configuring for AVX2
       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
       if test x"$ax_cv_support_avx2_ext" = x"yes"; then  dnl minimal support for AVX2
       supported=yes
       else
       AC_MSG_WARN([Your processor does not support AVX2 instructions])
       fi
     ;;
     AVX512)
       echo Configuring for AVX512 
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
       supported="cross compilation"
     ;;
     IMCI)
       echo Configuring for IMCI
       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
       supported="cross compilation"
     ;;
     NEONv8)
       echo Configuring for experimental ARMv8a support 
       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
       supported="cross compilation"
     ;;
     DEBUG)
       echo Configuring without SIMD support - only for compiler DEBUGGING!
       AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
      ;;     
     *)
     AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]); 
     ;;
 esac
-AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
+############### Precision selection
 # Default floating-point precision: --enable-precision takes single or double
 # (the default). Any other value is rejected so that a typo cannot leave both
 # GRID_DEFAULT_PRECISION_* macros undefined.
 AC_ARG_ENABLE([precision],
              [AC_HELP_STRING([--enable-precision=single|double],
                              [Select default word size of Real])],
              [ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
 case ${ac_PRECISION} in
     single)
       echo default precision is single
       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
     ;;
     double)
       echo default precision is double
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
     *)
       AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); 
     ;;
 esac
-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
+############### communication type selection
 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
              [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
 case ${ac_COMMS} in
     none)
-       echo Configuring for NO communications
+        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
-       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
+        comms_type='none'
     ;;
-     mpi)
+     mpi3l*)
-       echo Configuring for MPI communications
+       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
-       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+       comms_type='mpi3l'
     ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'
     ;;
     shmem)
        AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
        comms_type='shmem'
     ;;
     *)
-     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
+        AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
     ;;
 esac
 # For comms choices ending in -auto, run LX_FIND_MPI and merge the MPI flags
 # into the build flags. NOTE(review): LX_FIND_MPI is defined outside this file;
 # it presumably sets have_CXX_mpi and the MPI_* variables read below - confirm.
 case ${ac_COMMS} in
    *-auto)
        LX_FIND_MPI
        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
        AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
        AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
        # Split MPI_CXXLDFLAGS in two: the first sed deletes the -l words, leaving
        # the rest for AM_LDFLAGS; the second deletes the -L words, leaving the
        # rest for LIBS. The @<:@ and @:>@ tokens are Autoconf quadrigraphs that
        # become literal square brackets in the generated configure script.
        AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
        LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS";;
    *)
        ;;
 esac
 # Exactly one BUILD_COMMS_* conditional is true, selected by comms_type.
 # The comparison uses "=" because "==" is not accepted by every shell's test.
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" = "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" = "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" = "mpi3X" ] )
 AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" = "mpi3lX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" = "noneX" ])
 ############### RNG selection
 # Random number generator: --enable-rng takes ranlux48 (the default) or mt19937.
 # No backslash continuations are used between the macro arguments: m4 does not
 # treat a backslash specially, so it would become part of the argument text.
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],
 	            [Select Random Number Generator to be used])],
 	            [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
 case ${ac_RNG} in
     ranlux48)
      AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
     ;;
     mt19937)
      AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
     ;;
     *)
      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
     ;;
 esac
-AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
+############### Timer option
-AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
 	            [Enable system dependent high res timers])],\
 	            [ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
-AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
+case ${ac_TIMERS} in
 case ${ac_CHROMA} in
     yes)
-       echo Enabling tests regressing to Chroma
+      AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
     ;;
     no)
-       echo Disabling tests regressing to Chroma
+      AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
     ;;
     *)
-     AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
+      AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
     ;;
 esac
 ############### Chroma regression test
 # Chroma regression tests: off by default. The action-if-given branch runs for
 # --disable-chroma and --enable-chroma=no as well, so it must copy the value
 # the user gave rather than hard-code yes.
 AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],
               [Expect chroma compiled under c++11 ])],
               [ac_CHROMA=${enable_chroma}],[ac_CHROMA=no])
 case ${ac_CHROMA} in
     yes|no)
     ;;
     *)
       AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
     ;;
 esac
 # "=" rather than "==": the latter is not accepted by every shell's test.
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" = "XyesX" ])
-###################################################################
+############### Doxygen
-# Checks for doxygen support
+AC_PROG_DOXYGEN
 # if present enables the "make doxyfile" command
 #echo
 #echo Checking doxygen support 
 #echo :::::::::::::::::::::::::::::::::::::::::::
 #AC_PROG_DOXYGEN
-#if test -n "$DOXYGEN"
+if test -n "$DOXYGEN"
-#then
+then
-#AC_CONFIG_FILES([docs/doxy.cfg])
+AC_CONFIG_FILES([docs/doxy.cfg])
-#fi
+fi
-echo
+############### Output
-echo Creating configuration files
+cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
-echo :::::::::::::::::::::::::::::::::::::::::::
+AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
 AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
 AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_CXXFLAGS])
 AC_SUBST([AM_LDFLAGS])
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
 AC_CONFIG_FILES(tests/IO/Makefile)
 AC_CONFIG_FILES(tests/core/Makefile)
 AC_CONFIG_FILES(tests/debug/Makefile)
 AC_CONFIG_FILES(tests/forces/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_OUTPUT
-
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 echo "
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Summary of configuration for $PACKAGE v$VERSION
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The following features are enabled:
+----- PLATFORM ----------------------------------------
-
+architecture (build)        : $build_cpu
- architecture (build)          : $build_cpu
+os (build)                  : $build_os
- os (build)                    : $build_os
+architecture (target)       : $target_cpu
- architecture (target)         : $target_cpu
+os (target)                 : $target_os
- os (target)                   : $target_os
+compiler vendor             : ${ax_cv_cxx_compiler_vendor}
- build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
+compiler version            : ${ax_cv_gxx_version}
- graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
+----- BUILD OPTIONS -----------------------------------
- Supported SIMD flags          : $SIMD_FLAGS
+SIMD                        : ${ac_SIMD}
----------------------------------------------------------
+Threading                   : ${ac_openmp} 
- enabled simd support          : ${ac_SIMD}   (supported: $supported )
+Communications type         : ${comms_type}
- communications type           : ${ac_COMMS}
+Default precision           : ${ac_PRECISION}
-
+RNG choice                  : ${ac_RNG} 
-
+GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
-"
+LAPACK                      : ${ac_LAPACK}
 FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
 build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 graphs and diagrams         : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
 CXXFLAGS:
 `echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
 LDFLAGS:
 `echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
 LIBS:
 `echo ${LIBS} | tr ' ' '\n' | sed 's/^-/    -/g'`
 -------------------------------------------------------" > config.summary
 echo ""
 cat config.summary
 echo ""
--- a/gcc-bug-report/broken.cc
+++ b/gcc-bug-report/broken.cc
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./gcc-bug-report/broken.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <vector>
 #include <complex>
 #include <type_traits>
--- a/include/Grid
+++ b/include/Grid
@ -0,0 +1 @@
 ../lib
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@ -1,27 +1,56 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Algorithms.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H
-#include <algorithms/SparseMatrix.h>
+#include <Grid/algorithms/SparseMatrix.h>
-#include <algorithms/LinearOperator.h>
+#include <Grid/algorithms/LinearOperator.h>
-#include <algorithms/Preconditioner.h>
+#include <Grid/algorithms/Preconditioner.h>
-#include <algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Zolotarev.h>
-#include <algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
-#include <algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/Remez.h>
-#include <algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>
-#include <algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
-#include <algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
-#include <algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
-#include <algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>
-#include <algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 // Lanczos support
-#include <algorithms/iterative/MatrixUtils.h>
+#include <Grid/algorithms/iterative/MatrixUtils.h>
-#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
-#include <algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>
 // Eigen/lanczos
 // EigCg
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/AlignedAllocator.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
@ -8,7 +36,6 @@
 #include <malloc.h>
 #endif
 #include <immintrin.h>
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
 #endif
@ -30,27 +57,28 @@ public:
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
  alignedAllocator() throw() { }
  alignedAllocator(const alignedAllocator&) throw() { }
  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
  ~alignedAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  //  const_pointer address(const_reference __x) const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-  pointer allocate(size_type __n, const void* = 0)
+  pointer allocate(size_type __n, const void* _p= 0)
  { 
    // Returns storage for __n objects of type _Tp aligned to 128 bytes.
    // The hint argument _p is not used. The result is not checked for null.
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
 #ifdef GRID_NUMA
    // With GRID_NUMA (described by configure as "First touch numa locality")
    // every element is written once inside a statically scheduled OpenMP loop.
    _Tp tmp;
    // The index has the same type as __n so that it cannot overflow or be
    // compared across signedness for very large allocations.
 #pragma omp parallel for schedule(static)
  for(size_type i=0;i<__n;i++){
    ptr[i]=tmp;
  }
 #endif 
    return ptr;
  }
@ -63,15 +91,101 @@ public:
  }
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
-template<typename _Tp>  inline bool
+//////////////////////////////////////////////////////////////////////////////////////////
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+// MPI3 : comms must use shm region
 // SHMEM: comms must use symmetric heap
 //////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_SHMEM
 extern "C" { 
 #include <mpp/shmem.h>
 extern void * shmem_align(size_t, size_t);
 extern void  shmem_free(void *);
 }
 #define PARANOID_SYMMETRIC_HEAP
 #endif
-template<typename _Tp>  inline bool
+template<typename _Tp>
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+class commAllocator {
 public: 
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
  commAllocator() throw() { }
  commAllocator(const commAllocator&) throw() { }
  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
  ~commAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 #ifdef GRID_COMMS_SHMEM
  pointer allocate(size_type __n, const void* _p= 0)
  {
    // SHMEM variant: takes storage for __n objects from shmem_align with a
    // 64 byte alignment. The hint argument _p is not used.
 #ifdef CRAY
    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
 #else
    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
 #endif
 #ifdef PARANOID_SYMMETRIC_HEAP
    // Consistency check: PE 0 broadcasts its pointer value and every PE
    // compares it with the address it obtained locally.
    // NOTE(review): psync is sized with the reduce constant and relies on static
    // zero-initialisation; confirm against the SHMEM implementation in use.
    static void * bcast;
    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
    bcast = (void *) ptr;
    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
    if ( bcast != ptr ) {
      // %p is the conversion for pointer arguments.
      std::printf("inconsistent alloc pe %d %p %p \n",shmem_my_pe(),bcast,(void *)ptr);std::fflush(stdout);
      //      BACKTRACEFILE();
      // Exit with a non-zero status so the failure is visible to the launcher.
      exit(1);
    }
    assert( bcast == (void *) ptr);
 #endif 
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
    shmem_free((void *)__p);
  }
 #else
  pointer allocate(size_type __n, const void* _p= 0) 
  {
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
    free((void *)__p);
 #endif
  }
 #endif
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
 template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,commAllocator<T> >;              
 template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 }; // namespace Grid
 #endif
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@ -1,8 +1,35 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Cartesian.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H
-#include <cartesian/Cartesian_base.h>
+#include <Grid/cartesian/Cartesian_base.h>
-#include <cartesian/Cartesian_full.h>
+#include <Grid/cartesian/Cartesian_full.h>
-#include <cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/Cartesian_red_black.h> 
 #endif
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@ -1,6 +1,33 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Communicator.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H
-#include <communicator/Communicator_base.h>
+#include <Grid/communicator/Communicator_base.h>
 #endif
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@ -1,174 +0,0 @@
 /* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
 /* AVX Intrinsics */
 #undef AVX1
 /* AVX2 Intrinsics */
 #undef AVX2
 /* AVX512 Intrinsics for Knights Landing */
 #undef AVX512
 /* AVX Intrinsics with FMA4 */
 #undef AVXFMA4
 /* EMPTY_SIMD only for DEBUGGING */
 #undef EMPTY_SIMD
 /* GRID_COMMS_MPI */
 #undef GRID_COMMS_MPI
 /* GRID_COMMS_NONE */
 #undef GRID_COMMS_NONE
 /* GRID_DEFAULT_PRECISION is DOUBLE */
 #undef GRID_DEFAULT_PRECISION_DOUBLE
 /* GRID_DEFAULT_PRECISION is SINGLE */
 #undef GRID_DEFAULT_PRECISION_SINGLE
 /* Support Altivec instructions */
 #undef HAVE_ALTIVEC
 /* Support AVX (Advanced Vector Extensions) instructions */
 #undef HAVE_AVX
 /* Support AVX2 (Advanced Vector Extensions 2) instructions */
 #undef HAVE_AVX2
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #undef HAVE_DECL_BE64TOH
 /* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
   */
 #undef HAVE_DECL_NTOHLL
 /* Define to 1 if you have the <endian.h> header file. */
 #undef HAVE_ENDIAN_H
 /* Define to 1 if you have the <execinfo.h> header file. */
 #undef HAVE_EXECINFO_H
 /* Support FMA3 (Fused Multiply-Add) instructions */
 #undef HAVE_FMA
 /* Define to 1 if you have the `gettimeofday' function. */
 #undef HAVE_GETTIMEOFDAY
 /* Define to 1 if you have the <gmp.h> header file. */
 #undef HAVE_GMP_H
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H
 /* Define to 1 if you have the <malloc.h> header file. */
 #undef HAVE_MALLOC_H
 /* Define to 1 if you have the <malloc/malloc.h> header file. */
 #undef HAVE_MALLOC_MALLOC_H
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H
 /* Support mmx instructions */
 #undef HAVE_MMX
 /* Define to 1 if you have the <mm_malloc.h> header file. */
 #undef HAVE_MM_MALLOC_H
 /* Support SSE (Streaming SIMD Extensions) instructions */
 #undef HAVE_SSE
 /* Support SSE2 (Streaming SIMD Extensions 2) instructions */
 #undef HAVE_SSE2
 /* Support SSE3 (Streaming SIMD Extensions 3) instructions */
 #undef HAVE_SSE3
 /* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
 #undef HAVE_SSE4_1
 /* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
 #undef HAVE_SSE4_2
 /* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
 #undef HAVE_SSSE3
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
 /* Define to 1 if you have the <stdlib.h> header file. */
 #undef HAVE_STDLIB_H
 /* Define to 1 if you have the <strings.h> header file. */
 #undef HAVE_STRINGS_H
 /* Define to 1 if you have the <string.h> header file. */
 #undef HAVE_STRING_H
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #undef HAVE_SYS_STAT_H
 /* Define to 1 if you have the <sys/types.h> header file. */
 #undef HAVE_SYS_TYPES_H
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
 /* IMCI Intrinsics for Knights Corner */
 #undef IMCI
 /* NEON ARMv8 Experimental support */
 #undef NEONv8
 /* Name of package */
 #undef PACKAGE
 /* Define to the address where bug reports for this package should be sent. */
 #undef PACKAGE_BUGREPORT
 /* Define to the full name of this package. */
 #undef PACKAGE_NAME
 /* Define to the full name and version of this package. */
 #undef PACKAGE_STRING
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
 /* Define to the home page for this package. */
 #undef PACKAGE_URL
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 /* SSE4 Intrinsics */
 #undef SSE4
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
 /* Version number of package */
 #undef VERSION
 /* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
   #define below would cause a syntax error. */
 #undef _UINT32_T
 /* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
   #define below would cause a syntax error. */
 #undef _UINT64_T
 /* Define to `unsigned int' if <sys/types.h> does not define. */
 #undef size_t
 /* Define to the type of an unsigned integer type of width exactly 32 bits if
   such a type exists and the standard includes do not define it. */
 #undef uint32_t
 /* Define to the type of an unsigned integer type of width exactly 64 bits if
   such a type exists and the standard includes do not define it. */
 #undef uint64_t
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@ -1,13 +1,52 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Cshift.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_
-#include <cshift/Cshift_common.h>
+#include <Grid/cshift/Cshift_common.h>
 #ifdef GRID_COMMS_NONE
-#include <cshift/Cshift_none.h>
+#include <Grid/cshift/Cshift_none.h>
 #endif
 #ifdef GRID_COMMS_MPI
-#include <cshift/Cshift_mpi.h>
+#include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_MPI3
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_MPI3L
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
--- a/lib/FFT.h
+++ b/lib/FFT.h
@ -0,0 +1,302 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/FFT.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 #ifdef HAVE_FFTW
 #ifdef USE_MKL
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>
 #endif
 #endif
 namespace Grid {
  template<class scalar> struct FFTW { };
 #ifdef HAVE_FFTW	
  // Specialisation of the FFTW wrapper for ComplexD.
  // Exposes the FFTW types under the precision-independent names FFTW_scalar /
  // FFTW_plan and forwards each static member to the unsuffixed ::fftw_*
  // function of the same purpose in the global namespace.
  template<> struct FFTW<ComplexD> {
  public:
    typedef fftw_complex FFTW_scalar;
    typedef fftw_plan    FFTW_plan;
    // Create a plan; all arguments are passed through unchanged to
    // ::fftw_plan_many_dft and its return value is returned as-is.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
 					FFTW_scalar *in, const int *inembed,		
 					int istride, int idist,		
 					FFTW_scalar *out, const int *onembed,		
 					int ostride, int odist,		
 					int sign, unsigned flags) {
      return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
    }	  
    // Query the operation counts of plan p; results are written through
    // the add, mul and fmas pointers by ::fftw_flops.
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
      ::fftw_flops(p,add,mul,fmas);
    }
    // Execute plan p on the given in/out arrays via ::fftw_execute_dft.
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
      ::fftw_execute_dft(p,in,out);
    }
    // Release plan p via ::fftw_destroy_plan.
    inline static void fftw_destroy_plan(const FFTW_plan p) {
      ::fftw_destroy_plan(p);
    }
  };
  // Specialisation of the FFTW wrapper for ComplexF.
  // Same member names as the ComplexD specialisation, but the typedefs and
  // forwarded calls use the fftwf_* types and functions.
  template<> struct FFTW<ComplexF> {
  public:
    typedef fftwf_complex FFTW_scalar;
    typedef fftwf_plan    FFTW_plan;
    // Create a plan; all arguments are passed through unchanged to
    // ::fftwf_plan_many_dft and its return value is returned as-is.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
 					FFTW_scalar *in, const int *inembed,		
 					int istride, int idist,		
 					FFTW_scalar *out, const int *onembed,		
 					int ostride, int odist,		
 					int sign, unsigned flags) {
      return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
    }	  
    // Query the operation counts of plan p; results are written through
    // the add, mul and fmas pointers by ::fftwf_flops.
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
      ::fftwf_flops(p,add,mul,fmas);
    }
    // Execute plan p on the given in/out arrays via ::fftwf_execute_dft.
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
      ::fftwf_execute_dft(p,in,out);
    }
    // Release plan p via ::fftwf_destroy_plan.
    inline static void fftw_destroy_plan(const FFTW_plan p) {
      ::fftwf_destroy_plan(p);
    }
  };
 #endif
 #ifndef FFTW_FORWARD
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #endif
  ////////////////////////////////////////////////////////////////////////////
  // FFT: Fourier transform of a Lattice along one or more dimensions of a
  // GridCartesian, using FFTW for the one-dimensional transforms.
  //
  // For each transformed dimension the data along that dimension is gathered
  // onto every node (a "pencil") by repeated Cshift, transformed in place,
  // and the node's own slice is written back to the result lattice.
  //
  // The object accumulates timing and flop counts over all FFT_dim calls.
  //
  // NOTE(review): sgrid is an owning raw pointer released in the destructor
  // and the class does not define copy operations, so copying an FFT object
  // would delete sgrid twice. Do not copy instances.
  ////////////////////////////////////////////////////////////////////////////
  class FFT {
  private:
    GridCartesian *vgrid;   // grid supplied by the caller; not owned
    GridCartesian *sgrid;   // owned: same dimensions/processors as vgrid, layout of all ones
    int Nd;                 // number of dimensions of vgrid
    double flops;           // accumulated flop estimate over all FFT_dim calls
    double flops_call;      // flop estimate reported by FFTW for the most recent plan
    uint64_t usec;          // accumulated time spent executing plans, in microseconds
    std::vector<int> dimensions;
    std::vector<int> processors;
    std::vector<int> processor_coor;
  public:
    static const int forward=FFTW_FORWARD;
    static const int backward=FFTW_BACKWARD;
    // Accumulated flop estimate.
    double Flops(void) {return flops;}
    // Accumulated flops per microsecond; 0 when nothing has been timed yet
    // (avoids dividing by a zero elapsed time).
    double MFlops(void) {return (usec==0) ? 0.0 : flops/usec;}
    // Accumulated execution time in microseconds.
    double USec(void)   {return (double)usec;}
    // Build an FFT helper for lattices living on `grid`.
    // `grid` must outlive this object; only the internally created sgrid is owned.
    FFT ( GridCartesian * grid ) :
    vgrid(grid),
    Nd(grid->_ndimension),
    dimensions(grid->_fdimensions),
    processors(grid->_processors),
    processor_coor(grid->_processor_coor)
    {
      flops=0;
      flops_call=0;   // previously left uninitialised until the first FFT_dim call
      usec =0;
      std::vector<int> layout(Nd,1);
      sgrid = new GridCartesian(dimensions,layout,processors);
    };
    ~FFT ( void)  {
      delete sgrid;
    }
    // Transform `source` along every dimension d for which mask[d] is non-zero
    // and store the outcome in `result`.
    //   mask : must hold at least Nd entries.
    //   sign : FFT::forward or FFT::backward.
    // If no mask entry is set, `result` receives an untransformed copy of `source`.
    template<class vobj>
    void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
      conformable(result._grid,vgrid);
      conformable(source._grid,vgrid);
      assert((int)mask.size() >= Nd);
      Lattice<vobj> tmp(vgrid);
      tmp = source;
      bool transformed = false;
      for(int d=0;d<Nd;d++){
        if( mask[d] ) {
          FFT_dim(result,tmp,d,sign);
          tmp=result;   // output of this dimension feeds the next one
          transformed = true;
        }
      }
      // With an all-zero mask FFT_dim is never called; make sure result is still defined.
      if( !transformed ) result = tmp;
    }
    // Transform `source` along all Nd dimensions.
    template<class vobj>
    void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
      std::vector<int> mask(Nd,1);
      FFT_dim_mask(result,source,mask,sign);
    }
    // Transform `source` along the single dimension `dim` into `result`.
    //   sign : FFT::forward (no scaling) or FFT::backward (scaled by 1/G,
    //          G being the full extent of `dim`); any other value asserts.
    // Without HAVE_FFTW this function only asserts.
    template<class vobj>
    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
      assert(0);
 #else
      conformable(result._grid,vgrid);
      conformable(source._grid,vgrid);
      assert(dim >= 0 && dim < Nd);
      int L = vgrid->_ldimensions[dim];
      int G = vgrid->_fdimensions[dim];
      std::vector<int> layout(Nd,1);
      std::vector<int> pencil_gd(vgrid->_fdimensions);
      pencil_gd[dim] = G*processors[dim];
      // Pencil global vol LxLxGxLxL per node
      GridCartesian pencil_g(pencil_gd,layout,processors);
      // Construct pencils
      typedef typename vobj::scalar_object sobj;
      typedef typename sobj::scalar_type   scalar;
      Lattice<sobj> pgbuf(&pencil_g);
      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
      int Ncomp = sizeof(sobj)/sizeof(scalar);
      int Nlow  = 1;
      for(int d=0;d<dim;d++){
        Nlow*=vgrid->_ldimensions[d];
      }
      int rank = 1;  /* 1d transforms */
      int n[] = {G}; /* 1d transforms of length G */
      int howmany = Ncomp;
      int odist,idist,istride,ostride;
      idist   = odist   = 1;          /* Distance between consecutive FT's */
      istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
      int *inembed = n, *onembed = n;
      // Normalisation applied when writing the result back.
      scalar div;
      if ( sign == backward ) div = 1.0/G;
      else if ( sign == forward ) div = 1.0;
      else assert(0);
      // One plan is created on the start of the pencil buffer and then re-used
      // on other offsets through the new-array execute call below.
      FFTW_plan plan;
      {
        FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
        FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
        plan = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
                                                in,inembed,
                                                istride,idist,
                                                out,onembed,
                                                ostride, odist,
                                                sign,FFTW_ESTIMATE);
      }
      // Barrel shift and collect global pencil
      result = source;
      for(int pr=0;pr<processors[dim];pr++) {
        PARALLEL_REGION
        {
          std::vector<int> cbuf(Nd);
          sobj s;
          PARALLEL_FOR_LOOP_INTERN
          for(int idx=0;idx<sgrid->lSites();idx++) {
            sgrid->LocalIndexToLocalCoor(idx,cbuf);
            peekLocalSite(s,result,cbuf);
            cbuf[dim]+=pr*L;   // slot of this shift step inside the pencil
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
        result = Cshift(result,dim,L);
      }
      // Loop over orthog coords
      int NN=pencil_g.lSites();
      GridStopWatch timer;
      timer.Start();
      PARALLEL_REGION
      {
        std::vector<int> cbuf(Nd);
        PARALLEL_FOR_LOOP_INTERN
        for(int idx=0;idx<NN;idx++) {
          pencil_g.LocalIndexToLocalCoor(idx, cbuf);
          if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
            FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
            FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
            FFTW<scalar>::fftw_execute_dft(plan,in,out);
          }
        }
      }
      timer.Stop();
      // performance counting
      double add,mul,fma;
      FFTW<scalar>::fftw_flops(plan,&add,&mul,&fma);
      flops_call = add+mul+2.0*fma;
      usec += timer.useconds();
      flops+= flops_call*NN;
      // writing out result
      int pc = processor_coor[dim];
      PARALLEL_REGION
      {
        std::vector<int> clbuf(Nd), cgbuf(Nd);
        sobj s;
        PARALLEL_FOR_LOOP_INTERN
        for(int idx=0;idx<sgrid->lSites();idx++) {
          sgrid->LocalIndexToLocalCoor(idx,clbuf);
          cgbuf = clbuf;
          cgbuf[dim] = clbuf[dim]+L*pc;   // this node's slice of the pencil
          peekLocalSite(s,pgbuf,cgbuf);
          s = s * div;
          pokeLocalSite(s,result,clbuf);
        }
      }
      // destroying plan
      FFTW<scalar>::fftw_destroy_plan(plan);
 #endif
    }
  };
 }
 #endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -1,3 +1,32 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Grid.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //
 //  Grid.h
 //  simd
@ -30,25 +59,31 @@
 ///////////////////
 // Grid headers
 ///////////////////
-#include <serialisation/Serialisation.h>
+#include <Grid/serialisation/Serialisation.h>
-#include <Config.h>
+#include "Config.h"
-#include <Timer.h>
+#include <Grid/Timer.h>
-#include <Log.h>
+#include <Grid/PerfCount.h>
-#include <AlignedAllocator.h>
+#include <Grid/Log.h>
-#include <Simd.h>
+#include <Grid/AlignedAllocator.h>
-#include <Threads.h>
+#include <Grid/Simd.h>
-#include <Communicator.h> 
+#include <Grid/Threads.h>
-#include <Cartesian.h>    
+#include <Grid/Lexicographic.h>
-#include <Tensors.h>      
+#include <Grid/Init.h>
-#include <Lattice.h>      
+#include <Grid/Communicator.h> 
-#include <Cshift.h>       
+#include <Grid/Cartesian.h>    
-#include <Stencil.h>      
+#include <Grid/Tensors.h>      
-#include <Algorithms.h>   
+#include <Grid/Lattice.h>      
-#include <qcd/QCD.h>
+#include <Grid/Cshift.h>       
-#include <parallelIO/BinaryIO.h>
+#include <Grid/Stencil.h>      
-#include <parallelIO/NerscIO.h>
+#include <Grid/Algorithms.h>   
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/FFT.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
 #include <Init.h>
 #endif
--- a/lib/Init.cc
+++ b/lib/Init.cc
@ -1,3 +1,33 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Init.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@MacBook-Pro.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 /****************************************************************************/
 /* pab: Signal magic. Processor state dump is x86-64 specific               */
 /****************************************************************************/
@ -14,15 +44,33 @@
 #include <Grid.h>
 #include <algorithm>
 #include <iterator>
 #include <cstdlib>
 #include <memory>
 #define __X86_64
-#ifdef HAVE_EXECINFO_H
+#include <fenv.h>
-#include <execinfo.h>
+#ifdef __APPLE__
 // Local replacement for feenableexcept(), compiled only under __APPLE__.
 // Clears the bits selected by `excepts` (restricted to FE_ALL_EXCEPT) in the
 // saved floating-point environment and installs the modified environment.
 // Returns the FE_ALL_EXCEPT bits of the control word as they were before
 // the change, or -1 if fegetenv/fesetenv reports failure.
 // NOTE(review): relies on the __control and __mxcsr members of fenv_t, which
 // are platform-specific — confirm on every targeted Apple architecture.
 static int
 feenableexcept (unsigned int excepts)
 {
  static fenv_t fenv;
  unsigned int new_excepts = excepts & FE_ALL_EXCEPT,
    old_excepts;  // previous masks
  // Read the current environment; a non-zero return is treated as failure.
  if ( fegetenv (&fenv) ) return -1;
  old_excepts = fenv.__control & FE_ALL_EXCEPT;
  // unmask
  fenv.__control &= ~new_excepts;
  // Same bits shifted left by 7 for the __mxcsr word — presumably the offset
  // between the exception flags and their mask bits there; TODO confirm.
  fenv.__mxcsr   &= ~(new_excepts << 7);
  return ( fesetenv (&fenv) ? -1 : old_excepts );
 }
 #endif
 namespace Grid {
 //////////////////////////////////////////////////////
 // Convenience functions to access standard command line arg
 // driven parallelism controls
@ -99,6 +147,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
  return;
 }
 // Parse the leading integer contained in `str` and store it in `val`,
 // using formatted stream extraction.
 void GridCmdOptionInt(std::string &str,int & val)
 {
  std::istringstream reader(str);
  reader >> val;
 }
 void GridParseLayout(char **argv,int argc,
 		     std::vector<int> &latt,
@ -120,18 +175,21 @@ void GridParseLayout(char **argv,int argc,
  }
  if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
    std::vector<int> ompthreads(0);
 #ifndef GRID_OMP
    std::cout << GridLogWarning << "'--threads' option used but Grid was"
              << " not compiled with thread support" << std::endl;
 #endif
    arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
    GridCmdOptionIntVector(arg,ompthreads);
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
-    std::vector<int> cores(0);
+    int cores;
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
-    GridCmdOptionIntVector(arg,cores);
+    GridCmdOptionInt(arg,cores);
-    GridThread::SetCores(cores[0]);
+    GridThread::SetCores(cores);
  }
 }
 std::string GridCmdVectorIntToString(const std::vector<int> & vec){
@ -140,33 +198,40 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
  return oss.str();
 }
 /////////////////////////////////////////////////////////
-//
+// Reinit guard
 /////////////////////////////////////////////////////////
 static int Grid_is_initialised = 0;
 void Grid_init(int *argc,char ***argv)
 {
 #ifdef GRID_COMMS_MPI
  MPI_Init(argc,argv);
 #endif
  // Parse command line args.
  GridLogger::StopWatch.Start();
  std::string arg;
  ////////////////////////////////////
  // Shared memory block size
  ////////////////////////////////////
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
    GridCmdOptionInt(arg,MB);
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
  }
  CartesianCommunicator::Init(argc,argv);
  ////////////////////////////////////
  // Logging
  ////////////////////////////////////
  std::vector<std::string> logstreams;
  std::string defaultLog("Error,Warning,Message,Performance");
  GridCmdOptionCSL(defaultLog,logstreams);
  GridLogConfigure(logstreams);
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
+  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
-    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
+    Grid_quiesce_nodes();
    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--omp n         : default number of OMP threads"<<std::endl;    
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Debug"<<std::endl;    
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
@ -175,28 +240,118 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }
  ////////////////////////////////////
  // Help message
  ////////////////////////////////////
  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
    std::cout<<GridLogMessage<<"  --help : this message"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
    std::cout<<GridLogMessage<<"  --mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;    
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<"  --log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node"<<std::endl;    
    std::cout<<GridLogMessage<<"  --notimestamp   : suppress millisecond resolution stamps"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;    
    std::cout<<GridLogMessage<<"  --lebesgue      : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;    
    std::cout<<GridLogMessage<<"  --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    exit(EXIT_SUCCESS);
  }
  ////////////////////////////////////
  // Banner
  ////////////////////////////////////
  std::string COL_RED    = GridLogColours.colour["RED"];
  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
  std::cout <<std::endl;
  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
  std::cout <<COL_RED  << "__|_ |  |  |  "<<             "|  |  | "<<COL_PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
  std::cout <<COL_RED  << "__|_          "<<             "        "<<COL_PURPLE<<"        "<<                "          _|__"<<std::endl; 
  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_PURPLE<<"    _|__"<<std::endl;
  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_PURPLE<<"    _|__"<<std::endl;
  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_PURPLE<<"    _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G  GG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_GREEN <<"    _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G   G   "<<COL_RED<<" R  R   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_GREEN <<"    _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" R   R  "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_GREEN <<"    _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_          "<<             "        "<<COL_GREEN <<"        "<<                "          _|__"<<std::endl; 
  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
  std::cout <<COL_BLUE << "  |  |  |  |  "<<             "|  |  | "<<COL_GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
  std::cout << std::endl;
  std::cout << std::endl;
  std::cout <<COL_YELLOW<< std::endl;
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
  std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
  std::cout << "(at your option) any later version."<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
  std::cout << "GNU General Public License for more details."<<std::endl;
  std::cout << COL_BACKGROUND <<std::endl;
  std::cout << std::endl;
  ////////////////////////////////////
  // Debug and performance options
  ////////////////////////////////////
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
-  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
-    Grid_quiesce_nodes();
+    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
-    QCD::WilsonFermionStatic::HandOptDslash=1;
+    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
-    QCD::WilsonFermion5DStatic::HandOptDslash=1;
+  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
    GridLogTimestamp(0);
  } else { 
    GridLogTimestamp(1);
  }
  GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -208,23 +363,18 @@ void Grid_init(int *argc,char ***argv)
  }
  Grid_is_initialised = 1;
 }
 void Grid_finalize(void)
 {
-#ifdef GRID_COMMS_MPI
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) 
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
 }
 // Current gettimeofday() reading expressed in microseconds, as a double.
 double usecond(void) {
  struct timeval now;
  gettimeofday(&now,NULL);
  // tv_sec is scaled to microseconds; tv_usec already is microseconds.
  const double seconds_part_us = 1.0e6*now.tv_sec;
  const double micros_part_us  = 1.0*now.tv_usec;
  return micros_part_us + seconds_part_us;
 }
 #define _NBACKTRACE (256)
 void * Grid_backtrace_buffer[_NBACKTRACE];
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
@ -236,11 +386,11 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  // Linux/Posix
 #ifdef __linux__
  // And x86 64bit
-    ucontext_t * uc= (ucontext_t *)ptr;
+#ifdef __x86_64__
  ucontext_t * uc= (ucontext_t *)ptr;
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 #define REG(A)  printf("  %s %lx\n",#A,sc-> A);
  REG(rdi);
  REG(rsi);
  REG(rbp);
@ -261,13 +411,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  REG(r14);
  REG(r15);
 #endif
 #ifdef HAVE_EXECINFO_H
  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);
  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
  for (int i = 0; i < symbols; i++){
    printf ("%s\n", strings[i]);
  }
 #endif
  BACKTRACE();
  exit(0);
  return;
 };
@ -280,5 +425,9 @@ void Grid_debug_handler_init(void)
  sa.sa_flags    = SA_SIGINFO;
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
  sigaction(SIGFPE,&sa,NULL);
 }
 }
--- a/lib/Init.h
+++ b/lib/Init.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Init.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_INIT_H
 #define GRID_INIT_H
@ -5,6 +33,7 @@ namespace Grid {
  void Grid_init(int *argc,char ***argv);
  void Grid_finalize(void);
  // internal, controlled with --debug-signals
  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
  void Grid_debug_handler_init(void);
@ -16,11 +45,15 @@ namespace Grid {
  const std::vector<int> &GridDefaultMpi(void);
  const int              &GridThreads(void)  ;
  void                    GridSetThreads(int t) ;
  void GridLogTimestamp(int);
  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
  bool        GridCmdOptionExists(char** begin, char** end, const std::string& option);
  std::string GridCmdVectorIntToString(const std::vector<int> & vec);
  void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
  void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec);
  void GridParseLayout(char **argv,int argc,
 		       std::vector<int> &latt,
--- a/lib/Lattice.h
+++ b/lib/Lattice.h
@ -1,6 +1,33 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Lattice.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H
-#include <lattice/Lattice_base.h>
+#include <Grid/lattice/Lattice_base.h>
 #endif
--- a/lib/Lexicographic.h
+++ b/lib/Lexicographic.h
@ -0,0 +1,32 @@
 #ifndef GRID_LEXICOGRAPHIC_H
 #define GRID_LEXICOGRAPHIC_H
 namespace Grid{
  // Conversions between a flat site index and per-dimension coordinates,
  // with dimension 0 varying fastest.
  class Lexicographic {
  public:
    // Split `index` into one coordinate per entry of `dims`:
    //   coor[0] = index % dims[0], then index /= dims[0], and so on.
    // `coor` is resized to dims.size(). `dims` is only read, so it is taken
    // by const reference (const vectors and temporaries are accepted).
    // Entries of `dims` are used as divisors and must be non-zero.
    static inline void CoorFromIndex (std::vector<int>& coor,int index,const std::vector<int> &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
        coor[d] = index % dims[d];
        index   = index / dims[d];
      }
    }
    // Inverse of CoorFromIndex: index = sum_d coor[d] * (dims[0]*...*dims[d-1]).
    // `index` is overwritten with the result. `coor` and `dims` are only read.
    // `coor` is indexed for every d < dims.size() without a bounds check, so
    // the caller must supply at least that many entries.
    static inline void IndexFromCoor (const std::vector<int>& coor,int &index,const std::vector<int> &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=0;d<nd;d++){
        index = index+stride*coor[d];
        stride=stride*dims[d];
      }
    }
  };
 }
 #endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@ -1,62 +1,112 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/Log.cc
 Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid.h>
 #include <cxxabi.h>
 namespace Grid {
  // Return the demangled form of `name` as produced by abi::__cxa_demangle,
  // or `name` itself when demangling does not report success (status != 0).
  std::string demangle(const char* name) {
    int rc = -4; // some arbitrary value to eliminate the compiler warning
    char *raw = abi::__cxa_demangle(name, NULL, NULL, &rc);
    // The buffer handed back by __cxa_demangle is released with std::free
    // when `owner` goes out of scope (after the return value is built).
    std::unique_ptr<char, void(*)(void*)> owner(raw, std::free);
    if (rc != 0) {
      return name;
    }
    return owner.get();
  }
 GridStopWatch Logger::StopWatch;
-std::ostream  Logger::devnull(0);
+int Logger::timestamp;
 std::ostream Logger::devnull(0);
-GridLogger GridLogError      (1,"Error");
+void GridLogTimestamp(int on){
-GridLogger GridLogWarning    (1,"Warning");
+  Logger::Timestamp(on);
-GridLogger GridLogMessage    (1,"Message");
+}
 GridLogger GridLogDebug      (1,"Debug");
 GridLogger GridLogPerformance(1,"Performance");
 GridLogger GridLogIterative  (1,"Iterative");
-void GridLogConfigure(std::vector<std::string> &logstreams)
+Colours GridLogColours(0);
-{
+GridLogger GridLogError(1, "Error", GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
  GridLogWarning.Active(0);
-  GridLogMessage.Active(0);
+  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(0);
  GridLogColours.Active(0);
-  for(int i=0;i<logstreams.size();i++){
+  for (int i = 0; i < logstreams.size(); i++) {
-    if ( logstreams[i]== std::string("Error")       ) GridLogError.Active(1);
+    if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
-    if ( logstreams[i]== std::string("Warning")     ) GridLogWarning.Active(1);
+    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
-    if ( logstreams[i]== std::string("Message")     ) GridLogMessage.Active(1);
+    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
-    if ( logstreams[i]== std::string("Iterative")   ) GridLogIterative.Active(1);
+    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
-    if ( logstreams[i]== std::string("Debug")       ) GridLogDebug.Active(1);
+    if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
-    if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
+    if (logstreams[i] == std::string("Performance"))
      GridLogPerformance.Active(1);
    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
    if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
  }
 }
 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
-void Grid_quiesce_nodes(void)
+void Grid_quiesce_nodes(void) {
-{
+  int me = 0;
-#ifdef GRID_COMMS_MPI
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
-  int me;
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
-  MPI_Comm_rank(MPI_COMM_WORLD,&me);
+#endif
-  if ( me ) { 
+#ifdef GRID_COMMS_SHMEM
  me = shmem_my_pe();
 #endif
  if (me) {
    std::cout.setstate(std::ios::badbit);
  }
 #endif
 }
-void Grid_unquiesce_nodes(void)
+void Grid_unquiesce_nodes(void) {
 {
 #ifdef GRID_COMMS_MPI
-    std::cout.clear();
+  std::cout.clear();
 #endif
 }
 // Stream a GridTime as its count() followed by the literal suffix " ms".
 std::ostream& operator<< (std::ostream& stream, const GridTime& time)
 {
  return stream << time.count() << " ms";
 }
 }
--- a/lib/Log.h
+++ b/lib/Log.h
@ -1,44 +1,136 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Log.h
    Copyright (C) 2015
    Author: Antonin Portelli <antonin.portelli@me.com>
    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <cstdio>
 #include <cstdlib>
 #include <map>
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
 #ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif
 namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Lookup table from a colour name ("RED", "NORMAL", ...) to the string that
 // is written to the stream for it.
 class Colours{
 protected:
  bool is_active;   // last value passed to Active()
 public:
  std::map<std::string, std::string> colour;

  Colours(bool activate=false){
    Active(activate);
  };

  // (Re)fill all nine entries of `colour`: terminal escape sequences when
  // `activate` is true, empty strings otherwise.
  void Active(bool activate){
    const char *palette[9][2] = {
      {"BLACK" ,"\033[30m"},
      {"RED"   ,"\033[31m"},
      {"GREEN" ,"\033[32m"},
      {"YELLOW","\033[33m"},
      {"BLUE"  ,"\033[34m"},
      {"PURPLE","\033[35m"},
      {"CYAN"  ,"\033[36m"},
      {"WHITE" ,"\033[37m"},
      {"NORMAL","\033[0;39m"}
    };
    is_active=activate;
    for(int entry=0;entry<9;entry++){
      const char *code = is_active ? palette[entry][1] : "";
      colour[palette[entry][0]] = code;
    }
  };
 };
 std::ostream& operator<< (std::ostream& stream, const GridTime& time);
 class Logger {
 protected:
-    int active;
+  Colours &Painter;
-    std::string name, topName;
+  int active;
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
 public:
-    static GridStopWatch StopWatch;
+  static GridStopWatch StopWatch;
-    static std::ostream devnull;
+  static std::ostream devnull;
-    Logger(std::string topNm, int on, std::string nm)
+  std::string background() {return Painter.colour["NORMAL"];}
-    : active(on), name(nm), topName(topNm) {};
+  std::string evidence() {return Painter.colour["YELLOW"];}
  std::string colour() {return Painter.colour[COLOUR];}
-    void Active(int on) {active = on;};
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)  : active(on),
-    int  isActive(void) {return active;};
+    name(nm),
    topName(topNm),
    Painter(col_class),
    COLOUR(col) {} ;
-    friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
+  void Active(int on) {active = on;};
-        if ( log.active ) {
+  int  isActive(void) {return active;};
-            StopWatch.Stop();
+  static void Timestamp(int on) {timestamp = on;};
-            GridTime now = StopWatch.Elapsed();
+  
-            StopWatch.Start();
+  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
-            stream << std::setw(8) << std::left << log.topName << " : ";
+
-            stream << std::setw(12) << std::left << log.name << " : ";
+    if ( log.active ) {
-            stream << now << " : ";
+      stream << log.background()<< log.topName << log.background()<< " : ";
-            return stream;
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
-        } else { 
+      if ( log.timestamp ) {
-            return devnull;
+	StopWatch.Stop();
-        }
+	GridTime now = StopWatch.Elapsed();
 	StopWatch.Start();
 	stream << log.evidence()<< now << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
    } else { 
      return devnull;
    }
  }
 };
 class GridLogger: public Logger {
 public:
-  GridLogger(int on, std::string nm): Logger("Grid", on, nm){};
+  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
  Logger("Grid", on, nm, col_class, col_key){};
 };
 void GridLogConfigure(std::vector<std::string> &logstreams);
@ -49,6 +141,42 @@ extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
 // BACKTRACEFILE(): write a backtrace to the file "backtrace.<rank>", where
 // <rank> is CartesianCommunicator::RankWorld(), using BACKTRACEFP.
 // The file name is formatted with a bounded snprintf into a buffer large
 // enough for any int, and nothing is written or closed if the file cannot
 // be opened.
 #define BACKTRACEFILE() {\
 char string[32];					\
 std::snprintf(string,sizeof(string),"backtrace.%d",CartesianCommunicator::RankWorld()); \
 std::FILE * fp = std::fopen(string,"w");				\
 if ( fp != NULL ) {\
 BACKTRACEFP(fp)\
 std::fclose(fp);	    \
 }\
 }
 // BACKTRACEFP(fp): write a backtrace of the expansion point to the FILE* fp,
 // flushing after every line.
 #ifdef HAVE_EXECINFO_H
 // With <execinfo.h>: capture up to _NBACKTRACE frames into
 // Grid_backtrace_buffer and print one line per frame, passing each symbol
 // string through demangle(). backtrace_symbols() returns a malloc'd array
 // (or NULL on failure), so the loop is guarded and the array is freed.
 #define BACKTRACEFP(fp) { \
 int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
 char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
 for (int i = 0; strings != NULL && i < symbols; i++){\
  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
 }\
 std::free(strings);\
 }
 #else 
 // Without <execinfo.h>: print the first four return addresses reported by
 // __builtin_return_address. The pointer is cast to unsigned long so that it
 // matches the %lx conversion.
 #define BACKTRACEFP(fp) { \
 std::fprintf (fp,"BT %d %lx\n",0, (unsigned long)__builtin_return_address(0)); std::fflush(fp); \
 std::fprintf (fp,"BT %d %lx\n",1, (unsigned long)__builtin_return_address(1)); std::fflush(fp); \
 std::fprintf (fp,"BT %d %lx\n",2, (unsigned long)__builtin_return_address(2)); std::fflush(fp); \
 std::fprintf (fp,"BT %d %lx\n",3, (unsigned long)__builtin_return_address(3)); std::fflush(fp); \
 }
 #endif
 #define BACKTRACE() BACKTRACEFP(stdout) 
 }
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
@ -1,4 +0,0 @@
 HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./PerfCount.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h 
./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Avx512Asm.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h 
./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h
 CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./PerfCount.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -1,28 +1,37 @@
 # additional include paths necessary to compile the C++ library
 AM_CXXFLAGS = -I$(top_srcdir)/
 # Communication backend sources.
 # extra_sources starts empty; each BUILD_COMMS_* automake conditional below
 # appends its backend-specific implementation plus the shared
 # communicator/Communicator_base.cc. The result is added to libGrid_a_SOURCES.
 # NOTE(review): presumably configure enables exactly one BUILD_COMMS_*
 # conditional -- confirm, since enabling two would list Communicator_base.cc twice.
 extra_sources=
 if BUILD_COMMS_MPI
   extra_sources+=communicator/Communicator_mpi.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_MPI3
   extra_sources+=communicator/Communicator_mpi3.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_MPI3L
   extra_sources+=communicator/Communicator_mpi3_leader.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_SHMEM
   extra_sources+=communicator/Communicator_shmem.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_NONE
   extra_sources+=communicator/Communicator_none.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
 #
 # Libraries
 #
 include Make.inc
 include Eigen.inc
 lib_LIBRARIES = libGrid.a
 libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
 #	qcd/action/fermion/PartialFractionFermion5D.cc\	\
 #
 # Include files
 #
 nobase_include_HEADERS=$(HFILES)
 libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
 libGrid_adir                   = $(pkgincludedir)
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/Old/Endeavour.tgz
+++ b/lib/Old/Endeavour.tgz
--- a/lib/Old/Tensor_peek.h
+++ b/lib/Old/Tensor_peek.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Old/Tensor_peek.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_MATH_PEEK_H
 #define GRID_MATH_PEEK_H
 namespace Grid {
--- a/lib/Old/Tensor_poke.h
+++ b/lib/Old/Tensor_poke.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Old/Tensor_poke.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_MATH_POKE_H
 #define GRID_MATH_POKE_H
 namespace Grid {
--- a/lib/PerfCount.cc
+++ b/lib/PerfCount.cc
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/PerfCount.cc
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>
@ -5,28 +32,44 @@
 namespace Grid {
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
-
+#define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 #ifdef __linux__
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." , INSTRUCTIONS},
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." , CACHE_REFERENCES},
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." },
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......"},
+    // 4
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS...."},
+#ifdef AVX512
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS....."},
+    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS..."},
+    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS.."},
+    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS"},
+    { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS  },
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS......."},
+    { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS  },
-  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS....."},
+    { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......"},
+    { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS...."},
+    // 11
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS..."},
+#else
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS."},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS....",INSTRUCTIONS},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......"},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......",L1D_READ_ACCESS},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS...."}
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
    // 11
 #endif
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS.......",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS.....",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
    //15
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......",INSTRUCTIONS},
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS....",INSTRUCTIONS}
    //19
  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 #endif
 };
 }
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@ -1,3 +1,32 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/PerfCount.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <peterboyle@MacBook-Pro.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PERFCOUNT_H
 #define GRID_PERFCOUNT_H
@ -5,7 +34,7 @@
 #include <ctime>
 #include <chrono>
 #include <string.h>
-
+#include <unistd.h>
 #include <sys/ioctl.h>
 #ifdef __linux__
@ -14,8 +43,11 @@
 #else
 #include <sys/syscall.h>
 #endif
-namespace Grid {
+#ifdef __x86_64__
 #include <x86intrin.h>
 #endif
 namespace Grid {
 #ifdef __linux__
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
@ -29,6 +61,48 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 }
 #endif
 // Cycle counter and SSC marker selection.
 // With TIMERS_OFF defined, cyclecount() is a stub returning 0 and the __SSC_*
 // macros expand to an inline-asm marker; otherwise the __SSC_* macros expand
 // to nothing and cyclecount() is chosen per architecture below.
 #ifdef TIMERS_OFF
 inline uint64_t cyclecount(void){ 
  return 0;
 }
 // Loads the immediate `mark` into %ebx, then emits the bytes 0x64 0x67 0x90.
 // NOTE(review): this looks like the marker sequence recognised by Intel
 // SDE-style tracing tools -- confirm before relying on it.
 #define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 #define __SSC_STOP  __SSC_MARK(0x110)
 #define __SSC_START __SSC_MARK(0x111)
 #else
 // Markers compile away in this branch.
 #define __SSC_MARK(mark) 
 #define __SSC_STOP  
 #define __SSC_START 
 /*
 * cycle counters arch dependent
 */
 #ifdef __bgq__
 // Reads special purpose register 0x10C with mfspr.
 // NOTE(review): presumably the time base register -- confirm.
 inline uint64_t cyclecount(void){ 
   uint64_t tmp;
   asm volatile ("mfspr %0,0x10C" : "=&r" (tmp)  );
   return tmp;
 }
 #elif defined __x86_64__
 // Returns the time stamp counter via __rdtsc(); the commented-out lines are
 // an alternative based on __rdtscp.
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
  //  unsigned int dummy;
  // return __rdtscp(&dummy);
 }
 #else
 // No cycle counter available for this architecture: always 0.
 inline uint64_t cyclecount(void){ 
   return 0;
 }
 #endif
 #endif
 class PerformanceCounter {
 private:
@ -38,6 +112,7 @@ private:
    uint32_t type;
    uint64_t config;
    const char *name;
    int normalisation;
  } PerformanceCounterConfig; 
  static const PerformanceCounterConfig PerformanceCounterConfigs [];
@ -45,26 +120,12 @@ private:
 public:
  enum PerformanceCounterType {
-    CPUCYCLES=0,
+    CACHE_REFERENCES=0,
-    INSTRUCTIONS,
+    CACHE_MISSES=1,
-    //    STALL_CYCLES,
+    CPUCYCLES=2,
-    CACHE_REFERENCES,
+    INSTRUCTIONS=3,
-    CACHE_MISSES,
+    L1D_READ_ACCESS=4,
-    L1D_READ_MISS,
+    PERFORMANCE_COUNTER_NUM_TYPES=19
    L1D_READ_ACCESS,
    L1D_WRITE_MISS,
    L1D_WRITE_ACCESS,
    L1D_PREFETCH_MISS,
    L1D_PREFETCH_ACCESS,
    LL_READ_MISS,
    //    LL_READ_ACCESS,
    LL_WRITE_MISS,
    LL_WRITE_ACCESS,
    LL_PREFETCH_MISS,
    LL_PREFETCH_ACCESS,
    L1I_READ_MISS,
    L1I_READ_ACCESS,
    PERFORMANCE_COUNTER_NUM_TYPES
  };
 public:
@ -72,8 +133,10 @@ public:
  int PCT;
  long long count;
  long long cycles;
  int fd;
-  uint64_t elapsed;
+  int cyclefd;
  unsigned long long elapsed;
  uint64_t begin;
  static int NumTypes(void){ 
@ -85,7 +148,9 @@ public:
    assert(_pct>=0);
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
    fd=-1;
    cyclefd=-1;
    count=0;
    cycles=0;
    PCT =_pct;
    Open();
 #endif
@ -110,6 +175,15 @@ public:
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
    int norm = PerformanceCounterConfigs[PCT].normalisation;
    pe.type  = PerformanceCounterConfigs[norm].type;
    pe.config= PerformanceCounterConfigs[norm].config;
    name = PerformanceCounterConfigs[norm].name;
    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
    if (cyclefd == -1) {
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
 #endif
  }
@ -117,10 +191,12 @@ public:
  {
 #ifdef __linux__
    if ( fd!= -1) {
-      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
-      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
      ::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
      ::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
    }
-    begin  =__rdtsc();
+    begin  =cyclecount();
 #else
    begin = 0;
 #endif
@ -128,12 +204,15 @@ public:
  void Stop(void) {
    count=0;
    cycles=0;
 #ifdef __linux__
    if ( fd!= -1) {
-      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
      ::read(fd, &count, sizeof(long long));
      ::read(cyclefd, &cycles, sizeof(long long));
    }
-    elapsed = __rdtsc() - begin;
+    elapsed = cyclecount() - begin;
 #else
    elapsed = 0;
 #endif
@ -141,16 +220,20 @@ public:
  }
  void Report(void) {
 #ifdef __linux__
-    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+    int N = PerformanceCounterConfigs[PCT].normalisation;
    const char * sn = PerformanceCounterConfigs[N].name ;
    const char * sc = PerformanceCounterConfigs[PCT].name;
      std::printf("tsc = %llu %s = %llu  %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, 
 		  sc, count, sc,sn, (double)count/(double)cycles);
 #else
-    printf("%llu cycles \n", elapsed );
+    std::printf("%llu cycles \n", elapsed );
 #endif
  }
  ~PerformanceCounter()
  {
 #ifdef __linux__
-    close(fd);
+    ::close(fd);    ::close(cyclefd);
 #endif
  }
--- a/lib/Simd.h
+++ b/lib/Simd.h
@ -1,3 +1,33 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/Simd.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H
@ -13,10 +43,13 @@
 // Builders for packed selector immediates.
 // _MM_SELECT_FOUR_FOUR packs four 2-bit fields: A -> bits 7:6 ... D -> bits 1:0.
 // _MM_SELECT_FOUR_FOUR_STRING is the same expression as a string literal.
 #define _MM_SELECT_FOUR_FOUR(A,B,C,D) (((A)<<6)|((B)<<4)|((C)<<2)|(D))
 #define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 // _MM_SELECT_EIGHT_TWO packs eight 1-bit fields: A -> bit 7 ... H -> bit 0.
 // G belongs in bit 1; it was previously shifted by 4 and collided with D.
 #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) (((A)<<7)|((B)<<6)|((C)<<5)|((D)<<4)|((E)<<3)|((F)<<2)|((G)<<1)|(H))
 // The '(' must follow the macro name immediately: with a space in between the
 // preprocessor defines an object-like macro whose body starts with "(A,B,...)".
 #define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 #define _MM_SELECT_TWO_TWO(A,B)      _MM_SELECT_FOUR_TWO(0,0,A,B)
 #define RotateBit (0x100)
 namespace Grid {
  typedef uint32_t Integer;
@ -86,6 +119,14 @@ namespace Grid {
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
  // Projections onto the real and imaginary parts, returned as complex numbers.
  // projReal keeps the real part of r and sets the imaginary component to zero.
  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
  // projImag places the imaginary part of r in the *real* component of the
  // result and sets the imaginary component to zero.
  // NOTE(review): confirm callers expect (Im r) + 0i rather than 0 + i(Im r).
  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
@ -131,8 +172,8 @@ namespace Grid {
 };
-#include <simd/Grid_vector_types.h>
+#include "simd/Grid_vector_types.h"
-#include <simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_unops.h"
 namespace Grid {
  // Default precision
@ -196,6 +237,18 @@ namespace Grid {
    stream<<">";
    return stream;
  }
  inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
    int nn=vInteger::Nsimd();
    std::vector<Integer,alignedAllocator<Integer> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }
 }
--- a/lib/Stat.cc
+++ b/lib/Stat.cc
@ -0,0 +1,247 @@
 #include <Grid.h>
 #include <PerfCount.h>
 #include <Stat.h>
 namespace Grid { 
 bool PmuStat::pmu_initialized=false;
 // Attach the label `regname` to this statistics object and zero its totals.
 // The first call made on any PmuStat also runs the one-time pmu_init() setup.
 // Compiles to an empty function unless __x86_64__ is defined.
 void PmuStat::init(const char *regname)
 {
 #ifdef __x86_64__
   name = regname;
   const bool first_use = !pmu_initialized;
   if (first_use) {
     std::cout << "initialising pmu" << std::endl;
     pmu_initialized = true;   // flag is raised before pmu_init() runs
     pmu_init();
   }
   clear();
 #endif
 }
 // Zero every cumulative total held by this object.
 // Compiles to an empty function unless __x86_64__ is defined.
 void PmuStat::clear(void)
 {
 #ifdef __x86_64__
   // invocation count and timing totals
   count = 0;
   tregion = tcycles = 0;
   // performance counter totals
   pmc0 = pmc1 = 0;
   inst = cyc = ref = 0;
   // memory traffic totals
   reads = writes = 0;
 #endif
 }
 // Write the region label followed by one "label value" line per cumulative
 // total to std::cout. Compiles to an empty function unless __x86_64__ is defined.
 void PmuStat::print(void)
 {
 #ifdef __x86_64__
   // one output row: fixed-width label, value, flush
   auto row = [](const char *label, uint64_t value) {
     std::cout << label << value << std::endl;
   };
   std::cout << "Reg " << std::string(name) << ":\n";
   row("  region ", tregion);
   row("  cycles ", tcycles);
   row("  inst   ", inst);
   row("  cyc    ", cyc);
   row("  ref    ", ref);
   row("  pmc0   ", pmc0);
   row("  pmc1   ", pmc1);
   row("  count  ", count);
   row("  reads  ", reads);
   row("  writes ", writes);
 #endif
 }
 // Open one measured region: bump the invocation count and snapshot the
 // memory counters and the time stamp counter into mrstart/mwstart/tstart.
 // Compiles to an empty function unless __x86_64__ is defined.
 void PmuStat::start(void)
 {
 #ifdef __x86_64__
  pmu_start();
  ++count;
  // starting values for the read/write deltas that accum() computes
  xmemctrs(&mrstart, &mwstart);
  // the TSC is sampled last here and first in accum()
  tstart = __rdtsc();
 #endif
 }
 // Record the starting counter values for slot `t` (column t of counters[][]).
 // Rows 0-1 come from __rdpmc(0) and __rdpmc(1), rows 2-4 from __rdpmc with
 // bit 30 set and index 0..2, row 5 from the time stamp counter.
 // NOTE(review): bit 30 presumably selects the fixed-function counters; accum()
 // sums rows 2-4 into inst, cyc and ref -- confirm against the ISA manual.
 // `t` is used unchecked as an index; the caller must keep it inside the array.
 // Compiles to an empty function unless __x86_64__ is defined.
 void PmuStat::enter(int t)
 {
 #ifdef __x86_64__
  counters[0][t] = __rdpmc(0);
  counters[1][t] = __rdpmc(1);
  counters[2][t] = __rdpmc((1<<30)|0);
  counters[3][t] = __rdpmc((1<<30)|1);
  counters[4][t] = __rdpmc((1<<30)|2);
  counters[5][t] = __rdtsc();
 #endif
 }
 // Replace each starting value stored by enter(t) with the difference
 // "current reading minus stored value", so column t of counters[][] holds
 // the per-slot deltas that accum() later sums.
 // `t` is used unchecked as an index; the caller must keep it inside the array.
 // Compiles to an empty function unless __x86_64__ is defined.
 void PmuStat::exit(int t)
 {
 #ifdef __x86_64__
  counters[0][t] = __rdpmc(0) - counters[0][t];
  counters[1][t] = __rdpmc(1) - counters[1][t];
  counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t];
  counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t];
  counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t];
  counters[5][t] = __rdtsc() - counters[5][t];
 #endif
 }
 void PmuStat::accum(int nthreads)
 {
 #ifdef __x86_64__
  tend = __rdtsc();
  xmemctrs(&mrend, &mwend);
  pmu_stop();
  for (int t = 0; t < nthreads; ++t) {
    pmc0 += counters[0][t];
    pmc1 += counters[1][t];
    inst += counters[2][t];
    cyc += counters[3][t];
    ref += counters[4][t];
    tcycles += counters[5][t];
  }
  uint64_t region = tend - tstart;
  tregion += region;
  uint64_t mreads = mrend - mrstart;
  reads += mreads;
  uint64_t mwrites = mwend - mwstart;
  writes += mwrites;
 #endif
 }
 // Hooks with nothing to do in this implementation.
 void PmuStat::pmu_fini(void)
 {
 }
 void PmuStat::pmu_start(void)
 {
 }
 void PmuStat::pmu_stop(void)
 {
 }
 // One-time setup hook, called from init() the first time any PmuStat is
 // initialised. Opens the KNL uncore events when _KNIGHTS_LANDING_ is defined
 // and does nothing otherwise.
 void PmuStat::pmu_init(void)
 {
 #ifdef _KNIGHTS_LANDING_
  KNLsetup();
 #endif
 }
 // Report the current memory read and write counts through *mr and *mw.
 // With _KNIGHTS_LANDING_ these are the sums of the edcrd[] and edcwr[]
 // entries returned by KNLreadctrs; otherwise both are set to 0.
 void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
 {
 #ifdef _KNIGHTS_LANDING_
   ctrs snapshot;
   KNLreadctrs(snapshot);
   uint64_t total_rd = 0;
   uint64_t total_wr = 0;
   for (int edc = 0; edc < NEDC; ++edc) {
     total_rd += snapshot.edcrd[edc];
     total_wr += snapshot.edcwr[edc];
   }
   *mr = total_rd;
   *mw = total_wr;
 #else
   *mw = 0;
   *mr = 0;
 #endif
 }
 #ifdef _KNIGHTS_LANDING_
 struct knl_gbl_ PmuStat::gbl;
 #define PMU_MEM
 // Open one perf event on the PMU described by the sysfs directory `ename`.
 //   ename        : PMU directory; its "type" file supplies perf_event_attr.type
 //   fd           : receives the descriptor returned by perf_event_open
 //   event, umask : combined into perf_event_attr.config as event | (umask << 8)
 // Any failure is reported on stderr and ends the process with a non-zero
 // exit status, so that scripts driving the program can detect it.
 void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
 {
   char fname[1024];
   snprintf(fname, sizeof(fname), "%s/type", ename);
   FILE *fp = fopen(fname, "r");
   if (fp == 0) {
     ::perror(fname);   // errno text for the failed open
     ::exit(EXIT_FAILURE);
   }
   int type = 0;
   int ret = fscanf(fp, "%d", &type);
   fclose(fp);
   // checked explicitly: an assert() disappears under NDEBUG and would let an
   // unparsed `type` reach perf_event_open
   if (ret != 1) {
     ::fprintf(stderr, "cannot parse PMU type from %s\n", fname);
     ::exit(EXIT_FAILURE);
   }

   struct perf_event_attr hw = {};
   hw.size = sizeof(hw);
   hw.type = type;
   // see /sys/devices/uncore_*/format/*
   // All of the events we are interested in are configured the same way, but
   // that isn't always true. Proper code would parse the format files
   hw.config = event | (umask << 8);
   // PERF_FORMAT_GROUP only works within a single PMU, so every event is
   // opened, and later read, on its own
   int cpu = 0;
   fd = perf_event_open(&hw, -1, cpu, -1, 0);
   if (fd == -1) {
     ::perror("perf_event_open");
     ::fprintf(stderr, "CPU %d, box %s, event 0x%llx\n", cpu, ename,
               (unsigned long long) hw.config);
     ::exit(EXIT_FAILURE);
   }
 }
 // Open every uncore event read by KNLreadctrs and keep the descriptors in gbl:
 // two events per uncore_imc_N box, two per uncore_edc_eclk_N box and four per
 // uncore_edc_uclk_N box.
 void PmuStat::KNLsetup(void){
   char box[1024];   // sysfs directory of the PMU currently being configured

   // MC RPQ inserts and WPQ inserts (reads & writes)
   for (int imc = 0; imc < NMC; ++imc) {
     ::snprintf(box, sizeof(box), "/sys/devices/uncore_imc_%d",imc);
     KNLevsetup(box, gbl.mc_rd[imc], 0x1, 0x1);   // RPQ inserts
     KNLevsetup(box, gbl.mc_wr[imc], 0x2, 0x1);   // WPQ inserts
   }

   // EDC RPQ inserts and WPQ inserts
   for (int box_id = 0; box_id < NEDC; ++box_id) {
     ::snprintf(box, sizeof(box), "/sys/devices/uncore_edc_eclk_%d",box_id);
     KNLevsetup(box, gbl.edc_rd[box_id], 0x1, 0x1);   // RPQ inserts
     KNLevsetup(box, gbl.edc_wr[box_id], 0x2, 0x1);   // WPQ inserts
   }

   // EDC HitE, HitM, MissE, MissM
   for (int box_id = 0; box_id < NEDC; ++box_id) {
     ::snprintf(box, sizeof(box), "/sys/devices/uncore_edc_uclk_%d", box_id);
     KNLevsetup(box, gbl.edc_hite[box_id],  0x2, 0x1);
     KNLevsetup(box, gbl.edc_hitm[box_id],  0x2, 0x2);
     KNLevsetup(box, gbl.edc_misse[box_id], 0x2, 0x4);
     KNLevsetup(box, gbl.edc_missm[box_id], 0x2, 0x8);
   }
 }
 // Read one 64-bit counter value from the perf event descriptor `fd`.
 // A failed or short read is reported on stderr and ends the process with a
 // non-zero exit status.
 uint64_t PmuStat::KNLreadctr(int fd)
 {
   uint64_t data = 0;
   // read() returns a signed count: keep the sign so -1 is seen as an error
   const ssize_t got = ::read(fd, &data, sizeof(data));
   if (got != (ssize_t) sizeof(data)) {
     if (got < 0) ::perror("read counter");
     ::fprintf(stderr, "read counter: got %ld of %lu bytes\n",
               (long) got, (unsigned long) sizeof(data));
     ::exit(EXIT_FAILURE);
   }
   return data;
 }
 // Fill `c` with the current value of every event opened by KNLsetup, read one
 // descriptor at a time in the same order the events were opened.
 void PmuStat::KNLreadctrs(ctrs &c)
 {
   // memory controller reads / writes
   for (int imc = 0; imc < NMC; ++imc) {
     c.mcrd[imc] = KNLreadctr(gbl.mc_rd[imc]);
     c.mcwr[imc] = KNLreadctr(gbl.mc_wr[imc]);
   }
   // EDC reads / writes
   for (int edc = 0; edc < NEDC; ++edc) {
     c.edcrd[edc] = KNLreadctr(gbl.edc_rd[edc]);
     c.edcwr[edc] = KNLreadctr(gbl.edc_wr[edc]);
   }
   // EDC hit / miss events
   for (int edc = 0; edc < NEDC; ++edc) {
     c.edchite[edc]  = KNLreadctr(gbl.edc_hite[edc]);
     c.edchitm[edc]  = KNLreadctr(gbl.edc_hitm[edc]);
     c.edcmisse[edc] = KNLreadctr(gbl.edc_misse[edc]);
     c.edcmissm[edc] = KNLreadctr(gbl.edc_missm[edc]);
   }
 }
 #endif
 }
--- a/lib/Stat.h
+++ b/lib/Stat.h
@ -0,0 +1,104 @@
 #ifndef _GRID_STAT_H
 #define _GRID_STAT_H
 #ifdef AVX512
 #define _KNIGHTS_LANDING_ROOTONLY
 #endif
 namespace Grid { 
 ///////////////////////////////////////////////////////////////////////////////
 // Extra KNL counters from MCDRAM
 ///////////////////////////////////////////////////////////////////////////////
 #ifdef _KNIGHTS_LANDING_
 #define NMC 6
 #define NEDC 8
 // One snapshot of the uncore counter values: NMC entries per memory
 // controller array and NEDC entries per EDC array. Each field mirrors the
 // descriptor array of the matching name in knl_gbl_.
 struct ctrs
 {
    uint64_t mcrd[NMC];       // value read from knl_gbl_::mc_rd
    uint64_t mcwr[NMC];       // value read from knl_gbl_::mc_wr
    uint64_t edcrd[NEDC]; 
    uint64_t edcwr[NEDC];
    uint64_t edchite[NEDC];
    uint64_t edchitm[NEDC];
    uint64_t edcmisse[NEDC];
    uint64_t edcmissm[NEDC];
 };
 // Peter/Azusa:
 // Our modification of a code provided by Larry Meadows from Intel
 // Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
 // so is already public and in the linux kernel for KNL.
 // Integer descriptors for the uncore events, one per memory controller (NMC)
 // or per EDC (NEDC). The fields are ints; the comments on the first two
 // fields in struct ctrs above say which counter value each one feeds.
 struct knl_gbl_
 {
  int mc_rd[NMC];
  int mc_wr[NMC];
  int edc_rd[NEDC];
  int edc_wr[NEDC];
  int edc_hite[NEDC];
  int edc_hitm[NEDC];
  int edc_misse[NEDC];
  int edc_missm[NEDC];
 };
 #endif
 ///////////////////////////////////////////////////////////////////////////////
 // Accumulates performance counter, time stamp counter and memory traffic
 // totals for one named code region. Declared usage, from the member names:
 // init() once, then start(), enter(t)/exit(t) per slot, accum(nthreads), and
 // print() to report.
 class PmuStat
 {
    // per-slot scratch values: counters[row][slot], see the row map below
    uint64_t counters[8][256];
 #ifdef _KNIGHTS_LANDING_
    static struct knl_gbl_ gbl;
 #endif
    const char *name;   // region label passed to init(); the pointer is stored, not copied
    uint64_t reads;     // memory reads
    uint64_t writes;    // memory writes
    uint64_t mrstart;   // memory read counter at start of parallel region
    uint64_t mrend;     // memory read counter at end of parallel region
    uint64_t mwstart;   // memory write counter at start of parallel region
    uint64_t mwend;     // memory write counter at end of parallel region
    // cumulative counters
    uint64_t count;     // number of invocations
    uint64_t tregion;   // total time in parallel region (from thread 0)
    uint64_t tcycles;   // total cycles inside parallel region
    uint64_t inst, ref, cyc;   // fixed counters
    uint64_t pmc0, pmc1;// pmu
    // add memory counters here
    // temp variables
    uint64_t tstart;    // tsc at start of parallel region
    uint64_t tend;      // tsc at end of parallel region
    // row map for counters[row][slot]; each row holds a start value after
    // enter() and a delta after exit()
    // 0 pmc0
    // 1 pmc1
    // 2 inst
    // 3 cyc
    // 4 ref
    // 5 tsc
    static bool pmu_initialized;
 public:
    static bool is_init(void){ return pmu_initialized;}
    static void pmu_init(void);
    static void pmu_fini(void);
    static void pmu_start(void);
    static void pmu_stop(void);
    void accum(int nthreads);
    static void xmemctrs(uint64_t *mr, uint64_t *mw);
    void start(void);
    void enter(int t);
    void exit(int t);
    void print(void);
    void init(const char *regname);
    void clear(void);
 #ifdef _KNIGHTS_LANDING_
    static void     KNLsetup(void);
    static uint64_t KNLreadctr(int fd);
    static void     KNLreadctrs(ctrs &c);
    static void     KNLevsetup(const char *ename, int &fd, int event, int umask);
 #endif
  };
 }
 #endif
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@ -1,22 +1,51 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Tensors.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_MATH_H
 #define GRID_MATH_H
-#include <tensors/Tensor_traits.h>
+#include <Grid/tensors/Tensor_traits.h>
-#include <tensors/Tensor_class.h>
+#include <Grid/tensors/Tensor_class.h>
-#include <tensors/Tensor_arith.h>
+#include <Grid/tensors/Tensor_arith.h>
-#include <tensors/Tensor_inner.h>
+#include <Grid/tensors/Tensor_inner.h>
-#include <tensors/Tensor_outer.h>
+#include <Grid/tensors/Tensor_outer.h>
-#include <tensors/Tensor_transpose.h>
+#include <Grid/tensors/Tensor_transpose.h>
-#include <tensors/Tensor_trace.h>
+#include <Grid/tensors/Tensor_trace.h>
-#include <tensors/Tensor_index.h>
+#include <Grid/tensors/Tensor_index.h>
-#include <tensors/Tensor_Ta.h>
+#include <Grid/tensors/Tensor_Ta.h>
-#include <tensors/Tensor_determinant.h>
+#include <Grid/tensors/Tensor_determinant.h>
-#include <tensors/Tensor_exp.h>
+#include <Grid/tensors/Tensor_exp.h>
-//#include <tensors/Tensor_peek.h>
+//#include <Grid/tensors/Tensor_peek.h>
-//#include <tensors/Tensor_poke.h>
+//#include <Grid/tensors/Tensor_poke.h>
-#include <tensors/Tensor_reality.h>
+#include <Grid/tensors/Tensor_reality.h>
-#include <tensors/Tensor_unary.h>
+#include <Grid/tensors/Tensor_unary.h>
-#include <tensors/Tensor_extract_merge.h>
+#include <Grid/tensors/Tensor_extract_merge.h>
-#include <tensors/Tensor_logical.h>
+#include <Grid/tensors/Tensor_logical.h>
 #endif
--- a/lib/Threads.h
+++ b/lib/Threads.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Threads.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_THREADS_H
 #define GRID_THREADS_H
@ -9,11 +37,20 @@
 #ifdef GRID_OMP
 #include <omp.h>
-#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ")
+#ifdef GRID_NUMA
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
 #else
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(runtime)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
 #endif
 #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #define PARALLEL_REGION       _Pragma("omp parallel")
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_REGION
 #endif
 namespace Grid {
@ -95,6 +132,22 @@ class GridThread {
    ThreadBarrier();
  };
  // Copy `len` bytes from `src` to `dst`. The argument order follows bcopy:
  // source first, destination second.
  // With GRID_OMP every thread of a parallel region copies the slice
  // [myoff, myoff+mywork) reported to it by GetWorkBarrier; otherwise the
  // whole range is copied in one call.
  //
  // The raw copies go through ::memmove (declared in <string.h>). An
  // unqualified bcopy(...) in this scope names this member function itself,
  // not the C library routine, so the previous body recursed without end.
  static void bcopy(const void *src, void *dst, size_t len) {
 #ifdef GRID_OMP
 #pragma omp parallel 
    {
      const char *c_src =(const char *) src;
      char *c_dest=(char *) dst;
      int me,mywork,myoff;
      GridThread::GetWorkBarrier(len,me, mywork,myoff);
      ::memmove(&c_dest[myoff],&c_src[myoff],mywork);
    }
 #else 
    ::memmove(dst,src,len);
 #endif
  }
 };
 }
--- a/lib/Timer.h
+++ b/lib/Timer.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/Timer.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_TIME_H
 #define GRID_TIME_H
@ -11,40 +39,62 @@ namespace Grid {
  // Dress the output; use std::chrono
 // C++11 time facilities better?
-double usecond(void);
+inline double usecond(void) {
  // Wall-clock time in microseconds, built from gettimeofday().
  // tv starts zeroed so that a build without TIMERS_ON returns 0.0 instead of
  // reading an uninitialised struct.
  struct timeval tv = {0, 0};
 #ifdef TIMERS_ON
  gettimeofday(&tv,NULL);
 #endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
 typedef  std::chrono::milliseconds          GridTime;
 typedef  std::chrono::microseconds          GridUsecs;
 // Print a millisecond duration as its tick count followed by " ms".
 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 {
   const std::chrono::milliseconds::rep ticks = time.count();
   return stream << ticks << " ms";
 }
 class GridStopWatch {
 private:
  bool running;
  GridTimePoint start;
-  GridTime accumulator;
+  GridUsecs accumulator;
 public:
  GridStopWatch () { 
    Reset();
  }
  void     Start(void) { 
    assert(running == false);
 #ifdef TIMERS_ON
    start = GridClock::now(); 
 #endif
    running = true;
  }
  void     Stop(void)  { 
    assert(running == true);
-    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start); 
+#ifdef TIMERS_ON
    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start); 
 #endif
    running = false; 
  };
  void     Reset(void){
    running = false;
 #ifdef TIMERS_ON
    start = GridClock::now();
-    accumulator = std::chrono::duration_cast<GridTime>(start-start); 
+#endif
    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
  }
  GridTime Elapsed(void) {
    assert(running == false);
-    return accumulator;
+    return std::chrono::duration_cast<GridTime>( accumulator );
  }
  uint64_t useconds(void){
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
 };
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@ -1,7 +1,36 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/CoarsenedMatrix.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 #include <Grid.h>
 namespace Grid {
@ -117,6 +146,56 @@ namespace Grid {
      }
      Orthogonalise();
    }
    // Fill subspace[0..nn) with vectors returned by an implicitly restarted
    // Lanczos solve of `hermop`, started from a Gaussian random vector drawn
    // from RNG, then orthogonalise the subspace. Norms and residual-style
    // diagnostics are logged for every vector.
    // NOTE(review): evec[b] is copied for every b<nn regardless of Nconv --
    // confirm IRL.calc leaves at least nn usable vectors.
    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
    {
      // Run a Lanczos with sloppy convergence
 	const int Nstop = nn;
 	const int Nk = nn+20;
 	const int Np = nn+20;
 	const int Nm = Nk+Np;
 	const int MaxIt= 10000;
 	RealD resid = 1.0e-3;
 	Chebyshev<FineField> Cheb(0.5,64.0,21);
 	ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
 	//	IRL.lock = 1;
 	FineField noise(FineGrid); gaussian(RNG,noise);
 	FineField tmp(FineGrid); 
 	std::vector<RealD>     eval(Nm);
 	std::vector<FineField> evec(Nm,FineGrid);
 	int Nconv;
 	IRL.calc(eval,evec,
 		 noise,
 		 Nconv);
    	// pull back nn vectors
 	for(int b=0;b<nn;b++){
 	  subspace[b]   = evec[b];
 	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 	  hermop.Op(subspace[b],tmp); 
 	  std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
 	  // `noise` is reused as scratch for the two diagnostics below
 	  noise = tmp -  sqrt(eval[b])*subspace[b] ;
 	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 	  // NOTE(review): this second diagnostic uses "+ eval[b]" (no sqrt) but
 	  // prints the same "[ M - Lambda ]" label as the one above -- confirm intent.
 	  noise = tmp +  eval[b]*subspace[b] ;
 	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 	}
 	Orthogonalise();
 	// log the norms again after orthogonalisation
 	for(int b=0;b<nn;b++){
 	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 	}
    }
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
      RealD scale;
@ -170,11 +249,10 @@ namespace Grid {
    ////////////////////
    Geometry         geom;
    GridBase *       _grid; 
-    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil; 
+    CartesianStencil<siteVector,siteVector> Stencil; 
    std::vector<CoarseMatrix> A;
    std::vector<siteVector,alignedAllocator<siteVector> >   comm_buf;
    ///////////////////////
    // Interface
@ -187,7 +265,7 @@ namespace Grid {
      conformable(in._grid,out._grid);
      SimpleCompressor<siteVector> compressor;
-      Stencil.HaloExchange(in,comm_buf,compressor);
+      Stencil.HaloExchange(in,compressor);
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<Grid()->oSites();ss++){
@ -204,7 +282,7 @@ PARALLEL_FOR_LOOP
 	  } else if(SE->_is_local) { 
 	    nbr = in._odata[SE->_offset];
 	  } else {
-	    nbr = comm_buf[SE->_offset];
+	    nbr = Stencil.CommBuf()[SE->_offset];
 	  }
 	  res = res + A[point]._odata[ss]*nbr;
 	}
@ -228,7 +306,6 @@ PARALLEL_FOR_LOOP
      Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
      A(geom.npoint,&CoarseGrid)
    {
      comm_buf.resize(Stencil._unified_buffer_size);
    };
    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/LinearOperator.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_LINEAR_OP_H
 #define  GRID_ALGORITHM_LINEAR_OP_H
@ -194,6 +222,7 @@ namespace Grid {
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
      virtual  RealD Mpc      (const Field &in, Field &out) {
 	Field tmp(in._grid);
 //	std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
@ -223,10 +252,10 @@ namespace Grid {
      virtual  RealD Mpc      (const Field &in, Field &out) {
 	Field tmp(in._grid);
-	_Mat.Meooe(in,tmp);
+	_Mat.Meooe(in,out);
-	_Mat.MooeeInv(tmp,out);
+	_Mat.MooeeInv(out,tmp);
-	_Mat.Meooe(out,tmp);
+	_Mat.Meooe(tmp,out);
-	_Mat.MooeeInv(tmp,out);
+	_Mat.MooeeInv(out,tmp);
 	return axpy_norm(out,-1.0,tmp,in);
      }
@ -242,6 +271,35 @@ namespace Grid {
      }
    };
    // Schur-type operator built from a wrapped Matrix.
    //
    // Mpc applies, in this order: MooeeInv, Meooe, MooeeInv, Meooe, and then
    // combines the result with the input through axpy_norm(out,-1.0,work,in).
    // MpcDag applies MeooeDag, MooeeInvDag, MeooeDag, MooeeInvDag and combines
    // in the same way.  Both return whatever axpy_norm returns.
    // NOTE(review): axpy_norm is defined elsewhere; presumably it stores
    // (-1.0)*work + in into out and returns its norm -- confirm.
    template <class Matrix, class Field>
    class SchurDiagTwoOperator : public SchurOperatorBase<Field>
    {
    protected:
      // Wrapped matrix, held by reference.
      Matrix &_Mat;

    public:
      SchurDiagTwoOperator(Matrix &Mat) : _Mat(Mat) {}

      // in  : source field (read only); its _grid is used for the scratch field
      // out : overwritten with the result
      virtual RealD Mpc(const Field &in, Field &out)
      {
        Field work(in._grid);

        _Mat.MooeeInv(in, out);
        _Mat.Meooe(out, work);

        _Mat.MooeeInv(work, out);
        _Mat.Meooe(out, work);

        return axpy_norm(out, -1.0, work, in);
      }

      // Same contract as Mpc, using the Dag variants of the matrix calls.
      virtual RealD MpcDag(const Field &in, Field &out)
      {
        Field work(in._grid);

        _Mat.MeooeDag(in, out);
        _Mat.MooeeInvDag(out, work);

        _Mat.MeooeDag(work, out);
        _Mat.MooeeInvDag(out, work);

        return axpy_norm(out, -1.0, work, in);
      }
    };
    /////////////////////////////////////////////////////////////
    // Base classes for functions of operators
--- a/lib/algorithms/Preconditioner.h
+++ b/lib/algorithms/Preconditioner.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/Preconditioner.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PRECONDITIONER_H
 #define GRID_PRECONDITIONER_H
--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@ -1,7 +1,33 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/SparseMatrix.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H
 #include <Grid.h>
 namespace Grid {
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@ -1,8 +1,35 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/Chebyshev.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H
-#include<Grid.h>
+#include <Grid/algorithms/LinearOperator.h>
 #include<algorithms/LinearOperator.h>
 namespace Grid {
@ -30,13 +57,14 @@ namespace Grid {
      Field Mtmp(in._grid);
      AtoN = in;
      out = AtoN*Coeffs[0];
-      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
+//            std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
-      //      std::cout <<"0 " <<norm2(out)<<std::endl;
+//            std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
      for(int n=1;n<Coeffs.size();n++){
 	Mtmp = AtoN;
 	Linop.HermOp(Mtmp,AtoN);
 	out=out+AtoN*Coeffs[n];
-	//	std::cout << n<<" " <<norm2(out)<<std::endl;
+//            std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
 //		std::cout << n<<" " <<norm2(out)<<std::endl;
      }
    };
  };
@ -54,7 +82,8 @@ namespace Grid {
  public:
    void csv(std::ostream &out){
-      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+	RealD diff = hi-lo;
      for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
 	RealD f = approx(x);
 	out<< x<<" "<<f<<std::endl;
      }
@ -71,10 +100,24 @@ namespace Grid {
    // Default constructor: performs no initialisation itself.
    Chebyshev(){};
    // Construct and immediately call Init(_lo,_hi,_order,func).
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
    // Construct and immediately call Init(_lo,_hi,_order) (no function pointer).
    Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 // CJ: the one we need for Lanczos
    // Initialise without sampling a function: store the interval and order,
    // then set every coefficient to zero except the last one (index order-1),
    // which is set to one.
    //   _lo, _hi : interval bounds, copied into lo / hi
    //   _order   : number of coefficients; the program exits with status -1
    //              when it is smaller than 2
    void Init(RealD _lo,RealD _hi,int _order)
    {
      lo=_lo;
      hi=_hi;
      order=_order;
      if(order < 2) exit(-1);
      // assign(count, value): 'order' zeros.  The previous call passed the
      // arguments the other way round (count 0, value order), which for a
      // std::vector empties the container and makes the write below go out of
      // bounds.  (Assumes Coeffs is a std::vector<RealD> -- TODO confirm.)
      Coeffs.assign(order,0.);
      Coeffs[order-1] = 1.;
    };
    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
    {
      lo=_lo;
@ -154,6 +197,8 @@ namespace Grid {
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
      GridBase *grid=in._grid;
 //std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
 //<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
      int vol=grid->gSites();
--- a/lib/algorithms/approx/MultiShiftFunction.cc
+++ b/lib/algorithms/approx/MultiShiftFunction.cc
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/MultiShiftFunction.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
--- a/lib/algorithms/approx/MultiShiftFunction.h
+++ b/lib/algorithms/approx/MultiShiftFunction.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/MultiShiftFunction.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef MULTI_SHIFT_FUNCTION
 #define MULTI_SHIFT_FUNCTION
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@ -16,9 +16,13 @@
 #define INCLUDED_ALG_REMEZ_H
 #include <stddef.h>
 #include <Config.h>
-//#include <algorithms/approx/bigfloat.h>
+#ifdef HAVE_LIBGMP
-#include <algorithms/approx/bigfloat_double.h>
+#include "bigfloat.h"
 #else
 #include "bigfloat_double.h"
 #endif
 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
--- a/lib/algorithms/approx/bigfloat_double.h
+++ b/lib/algorithms/approx/bigfloat_double.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/approx/bigfloat_double.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <math.h>
 typedef double mfloat; 
--- a/lib/algorithms/iterative/AdefGeneric.h
+++ b/lib/algorithms/iterative/AdefGeneric.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/AdefGeneric.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
 #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@ -1,105 +1,168 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/ConjugateGradient.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_H
 #define GRID_CONJUGATE_GRADIENT_H
 namespace Grid {
-    /////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
-    // Base classes for iterative processes based on operators
+// Base classes for iterative processes based on operators
-    // single input vec, single output vec.
+// single input vec, single output vec.
-    /////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
-  template<class Field> 
+template <class Field>
-    class ConjugateGradient : public OperatorFunction<Field> {
+class ConjugateGradient : public OperatorFunction<Field> {
-public:                                                
+ public:
-    RealD   Tolerance;
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
-    Integer MaxIterations;
+                           // Defaults true.
-    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+  RealD Tolerance;
-    };
+  Integer MaxIterations;
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Tolerance(tol),
        MaxIterations(maxit),
        ErrorOnNoConverge(err_on_no_conv){};
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
                  Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD cp, c, a, d, b, ssq, qq, b_pred;
    Field p(src);
    Field mmp(src);
    Field r(src);
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
-    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+    Linop.HermOpAndNorm(psi, mmp, d, b);
      psi.checkerboard = src.checkerboard;
      conformable(psi,src);
-      RealD cp,c,a,d,b,ssq,qq,b_pred;
+    r = src - mmp;
    p = r;
-      Field   p(src);
+    a = norm2(p);
-      Field mmp(src);
+    cp = a;
-      Field   r(src);
+    ssq = norm2(src);
-      //Initial residual computation & set up
+    std::cout << GridLogIterative << std::setprecision(4)
-      RealD guess = norm2(psi);
+              << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:    mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:   mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:  cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:     p " << a << std::endl;
-      Linop.HermOpAndNorm(psi,mmp,d,b);
+    RealD rsq = Tolerance * Tolerance * ssq;
-      r= src-mmp;
+    // Check if guess is really REALLY good :)
-      p= r;
+    if (cp <= rsq) {
-      
+      return;
      a  =norm2(p);
      cp =a;
      ssq=norm2(src);
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
      RealD rsq =  Tolerance* Tolerance*ssq;
      //Check if guess is really REALLY good :)
      if ( cp <= rsq ) {
 	return;
      }
      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
      int k;
      for (k=1;k<=MaxIterations;k++){
 	c=cp;
 	Linop.HermOpAndNorm(p,mmp,d,qq);
 	RealD    qqck = norm2(mmp);
 	ComplexD dck  = innerProduct(p,mmp);
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
 	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
 	// Fuse these loops ; should be really easy
 	psi= a*p+psi;
 	p  = p*b+r;
 	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	// Stopping condition
 	if ( cp <= rsq ) { 
 	  Linop.HermOpAndNorm(psi,mmp,d,qq);
 	  p=mmp-src;
 	  RealD mmpnorm = sqrt(norm2(mmp));
 	  RealD psinorm = sqrt(norm2(psi));
 	  RealD srcnorm = sqrt(norm2(src));
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;
 	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 		   <<" computed residual "<<sqrt(cp/ssq)
 		   <<" true residual     "<<true_residual
 		   <<" target "<<Tolerance<<std::endl;
 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
      assert(0);
    }
-  };
+
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq
              << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      c = cp;
      MatrixTimer.Start();
      Linop.HermOpAndNorm(p, mmp, d, qq);
      MatrixTimer.Stop();
      LinalgTimer.Start();
      //  RealD    qqck = norm2(mmp);
      //  ComplexD dck  = innerProduct(p,mmp);
      a = c / d;
      b_pred = a * (a * qq - d) / c;
      cp = axpy_norm(r, -a, mmp, r);
      b = cp / c;
      // Fuse these loops ; should be really easy
      psi = a * p + psi;
      p = p * b + r;
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
        RealD mmpnorm = sqrt(norm2(mmp));
        RealD psinorm = sqrt(norm2(psi));
        RealD srcnorm = sqrt(norm2(src));
        RealD resnorm = sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage
                  << "ConjugateGradient: Converged on iteration " << k << std::endl;
        std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
                  << " true residual " << true_residual << " target "
                  << Tolerance << std::endl;
        std::cout << GridLogMessage << "Time elapsed: Iterations "
                  << SolverTimer.Elapsed() << " Matrix  "
                  << MatrixTimer.Elapsed() << " Linalg "
                  << LinalgTimer.Elapsed();
        std::cout << std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
        return;
      }
    }
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge) assert(0);
  }
 };
 }
 #endif
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -0,0 +1,142 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 namespace Grid {
  // Mixed precision restarted defect correction CG.
  //
  // Each outer iteration
  //   1. forms the residual src_d = src_d_in - Linop_d.HermOp(sol_d) in double precision,
  //   2. converts it to single precision with precisionChange,
  //   3. runs a single precision ConjugateGradient on it (inner solve),
  //   4. converts the inner result back and adds it to sol_d.
  // After the outer loop a double precision ConjugateGradient is run on the
  // original source, starting from the accumulated sol_d.
  //
  // The template is only enabled when getPrecision<FieldD>::value == 2 and
  // getPrecision<FieldF>::value == 1.
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:                                                
    RealD   Tolerance;           // target passed to the final double precision CG; also seeds the inner tolerance
    Integer MaxInnerIterations;  // iteration cap for each inner CG and for the final double precision CG
    Integer MaxOuterIterations;  // maximum number of restarts
    GridBase* SinglePrecGrid; //Grid for single-precision fields
    // The outer loop stops, and the final double precision solve starts, once the
    // residual value returned by axpy_norm drops below
    // OuterLoopNormMult * Tolerance^2 * norm2(source).
    RealD OuterLoopNormMult;
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;

    // Initialisers are listed in member declaration order, which is the order
    // in which they are actually executed.
    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.),
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      guesser(NULL) {}

    // Register a guess generator; it is invoked on (src_f, sol_f) before every inner CG.
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
    }

    // src_d_in : double precision source (read only)
    // sol_d    : solution; it is read before being written (passed to
    //            Linop_d.HermOp in the first outer iteration), so the caller
    //            must supply an initialised field.
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
      GridStopWatch TotalTimer;
      TotalTimer.Start();

      int cb = src_d_in.checkerboard;
      sol_d.checkerboard = cb;

      RealD src_norm = norm2(src_d_in);
      RealD stop = src_norm * Tolerance*Tolerance;

      GridBase* DoublePrecGrid = src_d_in._grid;
      FieldD tmp_d(DoublePrecGrid);
      tmp_d.checkerboard = cb;

      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation

      RealD inner_tol = Tolerance;

      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;

      FieldF sol_f(SinglePrecGrid);
      sol_f.checkerboard = cb;

      // The inner solver must not assert on non-convergence: the outer loop restarts it.
      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
      CG_f.ErrorOnNoConverge = false;

      GridStopWatch InnerCGtimer;
      GridStopWatch PrecChangeTimer;

      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d, tmp_d);
        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector

        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

        if(norm < OuterLoopNormMult * stop){
          std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
          break;
        }
        // Loosen the inner tolerance (it never tightens again) until
        // norm * inner_tol^2 is no longer below the overall stopping value.
        while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

        PrecChangeTimer.Start();
        precisionChange(src_f, src_d);
        PrecChangeTimer.Stop();

        zeroit(sol_f);

        //Optionally improve inner solver guess (eg using known eigenvectors)
        if(guesser != NULL)
          (*guesser)(src_f, sol_f);

        //Inner CG
        CG_f.Tolerance = inner_tol;
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f, sol_f);
        InnerCGtimer.Stop();

        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f);
        PrecChangeTimer.Stop();

        axpy(sol_d, 1.0, tmp_d, sol_d);
      }

      //Final trial CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;

      // Constructed with the default error behaviour of ConjugateGradient.
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
      CG_d(Linop_d, src_d_in, sol_d);

      TotalTimer.Stop();
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
 }
 #endif
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
 #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
@ -246,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-  assert(0);
+//  assert(0);
 }
  };
--- a/lib/algorithms/iterative/ConjugateResidual.h
+++ b/lib/algorithms/iterative/ConjugateResidual.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateResidual.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_RESIDUAL_H
 #define GRID_CONJUGATE_RESIDUAL_H
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/DenseMatrix.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DENSE_MATRIX_H
 #define GRID_DENSE_MATRIX_H
@ -102,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,
 }
-#include <algorithms/iterative/Householder.h>
+#include "Householder.h"
-#include <algorithms/iterative/Francis.h>
+#include "Francis.h"
 #endif
--- a/lib/algorithms/iterative/EigenSort.h
+++ b/lib/algorithms/iterative/EigenSort.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/EigenSort.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_EIGENSORT_H
 #define GRID_EIGENSORT_H
@ -11,32 +38,34 @@ template<class Field>
 class SortEigen {
 private:
 //hacking for testing for now
 private:
  static bool less_lmd(RealD left,RealD right){
-    return fabs(left) < fabs(right);
+    return left > right;
  }  
-  static bool less_pair(std::pair<RealD,Field>& left,
+  static bool less_pair(std::pair<RealD,Field const*>& left,
-		 std::pair<RealD,Field>& right){
+                        std::pair<RealD,Field const*>& right){
-    return fabs(left.first) < fabs(right.first);
+    return left.first > (right.first);
  }  
 public:
  void push(DenseVector<RealD>& lmd,
-	    DenseVector<Field>& evec,int N) {
+            DenseVector<Field>& evec,int N) {
    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
-    DenseVector<std::pair<RealD, Field> > emod;
+    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
-    typename DenseVector<std::pair<RealD, Field> >::iterator it;
+    for(int i=0;i<lmd.size();++i)
-    
+      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
    for(int i=0;i<lmd.size();++i){
      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
    }
    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
-    it=emod.begin();
+    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
    for(int i=0;i<N;++i){
      lmd[i]=it->first;
-      evec[i]=it->second;
+      evec[i]=*(it->second);
      ++it;
    }
  }
--- a/lib/algorithms/iterative/Francis.h
+++ b/lib/algorithms/iterative/Francis.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Francis.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef FRANCIS_H
 #define FRANCIS_H
--- a/lib/algorithms/iterative/Householder.h
+++ b/lib/algorithms/iterative/Householder.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Householder.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef HOUSEHOLDER_H
 #define HOUSEHOLDER_H
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -1,8 +1,44 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_IRL_H
 #define GRID_IRL_H
-#include <algorithms/iterative/DenseMatrix.h>
+#include <string.h> //memset
-#include <algorithms/iterative/EigenSort.h>
+#ifdef USE_LAPACK
 // Prototype for the LAPACK symmetric-tridiagonal eigensolver dstegr; it is
 // only declared when USE_LAPACK is defined. Every argument, including the
 // scalar ones, is passed by pointer.
 // NOTE(review): no extern "C" linkage is visible here; presumably
 // LAPACK_dstegr is mapped onto the Fortran symbol elsewhere -- confirm.
 void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                   double *vl, double *vu, int *il, int *iu, double *abstol,
                   int *m, double *w, double *z, int *ldz, int *isuppz,
                   double *work, int *lwork, int *iwork, int *liwork,
                   int *info);
 #endif
 #include "DenseMatrix.h"
 #include "EigenSort.h"
 namespace Grid {
@ -21,6 +57,7 @@ public:
    int Niter;   // Max iterations (initialised from the constructor's _Niter)
    int converged;
    int Nstop;   // Number of evecs checked for convergence
    int Nk;      // Number of converged sought
    int Np;      // Np -- Number of spare vecs in Krylov space (set to Nm-Nk)
    int Nm;      // Nm -- total number of vectors
@ -29,6 +66,8 @@ public:
    SortEigen<Field> _sort;
 //    GridCartesian &_fgrid;
    LinearOperatorBase<Field> &_Linop;
    OperatorFunction<Field>   &_poly;
@ -39,7 +78,27 @@ public:
    void init(void){};
    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
-    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
+    ImplicitlyRestartedLanczos(
 				LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynomial
 			       int _Nstop, // number of evecs checked for convergence
 			       int _Nk, // sought vecs
 			       int _Nm, // total number of vecs (Nk sought + Np spare)
 			       RealD _eresid, // residual tolerance; compared as eresid*eresid against |H v - eval v|^2
 			       int _Niter) : // Max iterations
      _Linop(Linop),
      _poly(poly),
      Nstop(_Nstop),
      Nk(_Nk),
      Nm(_Nm),
      eresid(_eresid),
      Niter(_Niter)
    { 
      // Spare vectors are whatever is left of Nm after the Nk sought ones;
      // there must be at least one.
      Np = Nm-Nk; assert(Np>0);
    };
    ImplicitlyRestartedLanczos(
 				LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynmial
 			       int _Nk, // sought vecs
 			       int _Nm, // spare vecs
@ -47,6 +106,7 @@ public:
 			       int _Niter) : // Max iterations
      _Linop(Linop),
      _poly(poly),
      Nstop(_Nk),
      Nk(_Nk),
      Nm(_Nm),
      eresid(_eresid),
@ -114,10 +174,11 @@ public:
      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
                                 // 7. vk+1 := wk/βk+1
 //	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
      const RealD tiny = 1.0e-20;
      if ( beta < tiny ) { 
 	std::cout << " beta is tiny "<<beta<<std::endl;
-      }
+     }
      lmd[k] = alph;
      lme[k]  = beta;
@ -191,15 +252,122 @@ public:
      }
    }
 #ifdef USE_LAPACK
    // Diagonalise the symmetric tridiagonal matrix with diagonal lmd and
    // off-diagonal lme by calling LAPACK dstegr. Each processor requests a
    // contiguous slice [il,iu] of the eigenvalue index range, moves its results
    // to their global positions, and the slices are combined with
    // GlobalSumVector.
    //   lmd  : in: diagonal entries; out: eigenvalues in the reverse of LAPACK's order
    //   lme  : in: off-diagonal entries (only read here)
    //   N1   : dimension of the tridiagonal problem (copied into NN)
    //   N2   : row stride used when writing eigenvectors into Qt
    //   Qt   : out: row (NN-1-i) receives evec_tmp[i]
    //   grid : supplies _Nprocessors, _processor and GlobalSumVector
    // NOTE(review): the info value returned by LAPACK is never inspected.
    void diagonalize_lapack(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
 		     int N1,
 		     int N2,
 		     DenseVector<RealD>& Qt,
 		     GridBase *grid){
  const int size = Nm; // not referenced by any live code below
 //  tevals.resize(size);
 //  tevecs.resize(size);
  int NN = N1;
  // NN is a run-time value, so the arrays below are variable-length arrays
  // (a compiler extension in C++).
  double evals_tmp[NN];
  double evec_tmp[NN][NN];
  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
 //  double AA[NN][NN];
  double DD[NN];
  double EE[NN];
  // Copy the diagonal into DD and the off-diagonal into EE. EE[NN-1] is never
  // written. The copy into evals_tmp is overwritten by the memset that
  // precedes the LAPACK call.
  for (int i = 0; i< NN; i++)
    for (int j = i - 1; j <= i + 1; j++)
      if ( j < NN && j >= 0 ) {
        if (i==j) DD[i] = lmd[i];
        if (i==j) evals_tmp[i] = lmd[i];
        if (j==(i-1)) EE[j] = lme[j];
      }
  int evals_found;
  // Workspace sizes: lwork is the larger of 18*NN and 1+4*NN+NN*NN.
  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
  int liwork =  3+NN*10 ;
  int iwork[liwork];
  double work[lwork];
  int isuppz[2*NN];
  char jobz = 'V'; // calculate evals & evecs
  char range = 'I'; // compute only the il-th through iu-th evals
  //    char range = 'A'; // calculate all evals
  char uplo = 'U'; // refer to upper half of original matrix (not passed to dstegr below)
  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix (not passed to dstegr below)
  int ifail[NN]; // not referenced below
  int info;
 //  int total = QMP_get_number_of_nodes();
 //  int node = QMP_get_node_number();
 //  GridBase *grid = evec[0]._grid;
  int total = grid->_Nprocessors;
  int node = grid->_processor;
  // Split the 1-based index range 1..NN into blocks of `interval` indices,
  // one block per processor; the last block is clipped to NN.
  int interval = (NN/total)+1;
  double vl = 0.0, vu = 0.0;
  int il = interval*node+1 , iu = interval*(node+1);
  if (iu > NN)  iu=NN;
  double tol = 0.0;
    if (1) {
      memset(evals_tmp,0,sizeof(double)*NN);
      // Processors whose block starts beyond NN skip the solve and contribute
      // only zeros to the global sums.
      if ( il <= NN){
        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
        LAPACK_dstegr(&jobz, &range, &NN,
            (double*)DD, (double*)EE,
            &vl, &vu, &il, &iu, // these four are ignored if the second parameter is 'A'
            &tol, // tolerance
            &evals_found, evals_tmp, (double*)evec_tmp, &NN,
            isuppz,
            work, &lwork, iwork, &liwork,
            &info);
        // The results sit at local slots 0..(iu-il). Move them, last first, to
        // global slots il-1..iu-1 and zero the vacated slots so that the sum
        // over processors assembles the full set.
        for (int i = iu-1; i>= il-1; i--){
          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
          evals_tmp[i] = evals_tmp[i - (il-1)];
          if (il>1) evals_tmp[i-(il-1)]=0.;
          for (int j = 0; j< NN; j++){
            evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
            if (il>1) evec_tmp[i-(il-1)][j]=0.;
          }
        }
      }
      {
 //        QMP_sum_double_array(evals_tmp,NN);
 //        QMP_sum_double_array((double *)evec_tmp,NN*NN);
         grid->GlobalSumVector(evals_tmp,NN);
         grid->GlobalSumVector((double*)evec_tmp,NN*NN);
      }
    } 
 // Cheating a bit: it would be better to sort than simply to reverse, but the
 // routine's documentation says evals are returned in increasing order, while
 // qr gives evals in decreasing order.
 // NOTE: the inner j loop has no braces, so only the Qt assignment is inside
 // it; the lmd assignment runs once per i despite its indentation.
  for(int i=0;i<NN;i++){
    for(int j=0;j<NN;j++)
      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
      lmd [NN-1-i]=evals_tmp[i];
  }
 }
 #endif
    void diagonalize(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
-		     int Nm2,
+		     int N2,
-		     int Nm,
+		     int N1,
-		     DenseVector<RealD>& Qt)
+		     DenseVector<RealD>& Qt,
 		     GridBase *grid)
    {
-      int Niter = 100*Nm;
+
 #ifdef USE_LAPACK
    const int check_lapack=0; // just use lapack if 0, check against lapack if 1
    if(!check_lapack)
 	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
 	DenseVector <RealD> lmd2(N1);
 	DenseVector <RealD> lme2(N1);
 	DenseVector<RealD> Qt2(N1*N1);
         for(int k=0; k<N1; ++k){
 	    lmd2[k] = lmd[k];
 	    lme2[k] = lme[k];
 	  }
         for(int k=0; k<N1*N1; ++k)
 	Qt2[k] = Qt[k];
 //	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
 #endif
      int Niter = 100*N1;
      int kmin = 1;
-      int kmax = Nk;
+      int kmax = N2;
      // (this should be more sophisticated)
      for(int iter=0; iter<Niter; ++iter){
@ -211,7 +379,7 @@ public:
 	// (Dsh: shift)
 	// transformation
-	qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
+	qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
 	// Convergence criterion (redefinition of kmin and kmax)
 	for(int j=kmax-1; j>= kmin; --j){
@ -222,6 +390,23 @@ public:
 	  }
 	}
 	Niter = iter;
 #ifdef USE_LAPACK
    if(check_lapack){
 	const double SMALL=1e-8;
 	diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
 	DenseVector <RealD> lmd3(N2);
         for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
        _sort.push(lmd3,N2);
        _sort.push(lmd2,N2);
         for(int k=0; k<N2; ++k){
 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
 //	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
 	  }
         for(int k=0; k<N1*N1; ++k){
 //	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 	}
    }
 #endif
 	return;
      continued:
@ -237,6 +422,7 @@ public:
      abort();
    }
 #if 1
    static RealD normalise(Field& v) 
    {
      RealD nn = norm2(v);
@ -298,6 +484,7 @@ until convergence
      {
 	GridBase *grid = evec[0]._grid;
 	assert(grid == src._grid);
 	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 	std::cout << " -- Nm = " << Nm << std::endl;
@ -328,11 +515,21 @@ until convergence
 	// (uniform vector) Why not src??
 	//	evec[0] = 1.0;
 	evec[0] = src;
 	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
 // << src._grid  << std::endl;
 	normalise(evec[0]);
 	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
 // << evec[0]._grid << std::endl;
 	// Initial Nk steps
 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
 //	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
 //	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 	RitzMatrix(evec,Nk);
 	for(int k=0; k<Nk; ++k){
 //	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
 //	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
 	}
 	// Restarting loop begins
 	for(int iter = 0; iter<Niter; ++iter){
@ -354,20 +551,24 @@ until convergence
 	    lme2[k] = lme[k+k1-1];
 	  }
 	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nm,Nm,Qt);
+	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
 	  // sorting
 	  _sort.push(eval2,Nm);
 	  // Implicitly shifted QR transformations
 	  setUnit_Qt(Nm,Qt);
-	  for(int ip=k2; ip<Nm; ++ip) 
+	  for(int ip=k2; ip<Nm; ++ip){ 
 	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
 	}
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 	  for(int j=k1-1; j<k2+1; ++j){
 	    for(int k=0; k<Nm; ++k){
 	    B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+Nm*j] * evec[k];
 	    }
 	  }
@ -390,21 +591,25 @@ until convergence
 	    lme2[k] = lme[k];
 	  }
 	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nk,Nm,Qt);
+	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
 	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
 	  for(int j = 0; j<Nk; ++j){
 	    for(int k = 0; k<Nk; ++k){
 	    B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
 //	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 	  }
 //	_sort.push(eval2,B,Nk);
 	  Nconv = 0;
 	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 	  for(int i=0; i<Nk; ++i){
-	    _poly(_Linop,B[i],v);
+//	    _poly(_Linop,B[i],v);
 	    _Linop.HermOp(B[i],v);
 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 	    RealD vden = norm2(B[i]);
@ -412,11 +617,13 @@ until convergence
 	    v -= eval2[i]*B[i];
 	    RealD vv = norm2(v);
 	    std::cout.precision(13);
 	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
-	    if(vv<eresid*eresid){
+	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
 	    if((vv<eresid*eresid) && (i == Nconv) ){
 	      Iconv[Nconv] = i;
 	      ++Nconv;
 	    }
@ -427,7 +634,7 @@ until convergence
 	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
-	  if( Nconv>=Nk ){
+	  if( Nconv>=Nstop ){
 	    goto converged;
 	  }
 	} // end of iter loop
@ -436,21 +643,20 @@ until convergence
 	abort();
      converged:
-	// Sorting
+       // Sorting
       eval.resize(Nconv);
       evec.resize(Nconv,grid);
       for(int i=0; i<Nconv; ++i){
         eval[i] = eval2[Iconv[i]];
         evec[i] = B[Iconv[i]];
       }
      _sort.push(eval,evec,Nconv);
-	eval.clear();
+      std::cout << "\n Converged\n Summary :\n";
-	evec.clear();
+      std::cout << " -- Iterations  = "<< Nconv  << "\n";
-	for(int i=0; i<Nconv; ++i){
+      std::cout << " -- beta(k)     = "<< beta_k << "\n";
-	  eval.push_back(eval2[Iconv[i]]);
+      std::cout << " -- Nconv       = "<< Nconv  << "\n";
-	  evec.push_back(B[Iconv[i]]);
+     }
 	}
 	_sort.push(eval,evec,Nconv);
 	std::cout << "\n Converged\n Summary :\n";
 	std::cout << " -- Iterations  = "<< Nconv  << "\n";
 	std::cout << " -- beta(k)     = "<< beta_k << "\n";
 	std::cout << " -- Nconv       = "<< Nconv  << "\n";
      }
    /////////////////////////////////////////////////
    // Adapted from Rudy's lanczos factor routine
@ -997,6 +1203,7 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx
  }
 }
 #endif
 };
--- a/lib/algorithms/iterative/Matrix.h
+++ b/lib/algorithms/iterative/Matrix.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Matrix.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef MATRIX_H
 #define MATRIX_H
--- a/lib/algorithms/iterative/MatrixUtils.h
+++ b/lib/algorithms/iterative/MatrixUtils.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/MatrixUtils.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_MATRIX_UTILS_H
 #define GRID_MATRIX_UTILS_H
--- a/lib/algorithms/iterative/NormalEquations.h
+++ b/lib/algorithms/iterative/NormalEquations.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/NormalEquations.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_NORMAL_EQUATIONS_H
 #define GRID_NORMAL_EQUATIONS_H
--- a/lib/algorithms/iterative/PrecConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecConjugateResidual.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/PrecConjugateResidual.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
 #define GRID_PREC_CONJUGATE_RESIDUAL_H
--- a/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PREC_GCR_H
 #define GRID_PREC_GCR_H
@ -19,6 +47,10 @@ namespace Grid {
    int mmax;
    int nstep;
    int steps;
    GridStopWatch PrecTimer;
    GridStopWatch MatTimer;
    GridStopWatch LinalgTimer;
    LinearFunction<Field> &Preconditioner;
   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
@ -40,14 +72,24 @@ namespace Grid {
      Field r(src._grid);
        PrecTimer.Reset();
         MatTimer.Reset();
      LinalgTimer.Reset();
      GridStopWatch SolverTimer;
      SolverTimer.Start();
      steps=0;
      for(int k=0;k<MaxIterations;k++){
 	cp=GCRnStep(Linop,src,psi,rsq);
-	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+	std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 	if(cp<rsq) {
 	  SolverTimer.Stop();
 	  Linop.HermOp(psi,r);
 	  axpy(r,-1.0,src,r);
 	  RealD tr = norm2(r);
@ -55,6 +97,11 @@ namespace Grid {
 		   << " computed residual "<<sqrt(cp/ssq)
 	           << " true residual "    <<sqrt(tr/ssq)
 	           << " target "           <<Tolerance <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
 	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 	  return;
 	}
@ -62,6 +109,7 @@ namespace Grid {
      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
      assert(0);
    }
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
      RealD cp;
@ -88,24 +136,25 @@ namespace Grid {
      // initial guess x0 is taken as nonzero.
      // r0=src-A x0 = src
      //////////////////////////////////
      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
      MatTimer.Stop();
      r=src-Az;
      /////////////////////
      // p = Prec(r)
      /////////////////////
      PrecTimer.Start();
      Preconditioner(r,z);
      PrecTimer.Stop();
-      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
+      MatTimer.Start();
      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
      Linop.HermOp(z,tmp); 
      MatTimer.Stop();
      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
      ttmp=tmp;
      tmp=tmp-r;
      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
      /*
      std::cout<<GridLogMessage<<r<<std::endl;
      std::cout<<GridLogMessage<<z<<std::endl;
@ -113,7 +162,9 @@ namespace Grid {
      std::cout<<GridLogMessage<<tmp<<std::endl;
      */
      MatTimer.Start();
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
      MatTimer.Stop();
      //p[0],q[0],qq[0] 
      p[0]= z;
@ -137,18 +188,22 @@ namespace Grid {
 	cp = axpy_norm(r,-a,q[peri_k],r);  
 	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
 	}
 	std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 
 	PrecTimer.Start();
 	Preconditioner(r,z);// solve Az = r
 	PrecTimer.Stop();
 	MatTimer.Start();
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 	Linop.HermOp(z,tmp);
 	MatTimer.Stop();
        tmp=tmp-r;
-	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 	q[peri_kp]=Az;
 	p[peri_kp]=z;
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_SCHUR_RED_BLACK_H
 #define GRID_SCHUR_RED_BLACK_H
@ -75,6 +102,8 @@ namespace Grid {
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@ -1,7 +1,34 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_base.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H
 #include <Grid.h>
 namespace Grid{
@ -50,15 +77,12 @@ public:
    // GridCartesian / GridRedBlackCartesian
    ////////////////////////////////////////////////////////////////
    virtual int CheckerBoarded(int dim)=0;
-    virtual int CheckerBoard(std::vector<int> site)=0;
+    virtual int CheckerBoard(std::vector<int> &site)=0;
    virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
-    int  CheckerBoardFromOindex (int Oindex){
+    virtual int CheckerBoardFromOindex (int Oindex)=0;
-      std::vector<int> ocoor;
+    virtual int CheckerBoardFromOindexTable (int Oindex)=0;
      oCoorFromOindex(ocoor,Oindex); 
      return CheckerBoard(ocoor);
    }
    //////////////////////////////////////////////////////////////////////////////////////////////
    // Local layout calculations
@ -79,6 +103,12 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
    // Inner (SIMD-lane) index of a local coordinate: dimension d contributes
    // _istride[d] * (lcoor[d] / _rdimensions[d]) to the result.
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int lane = 0;
        for(int mu=0; mu<_ndimension; mu++) {
            lane += _istride[mu] * (lcoor[mu] / _rdimensions[mu]);
        }
        return lane;
    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@ -87,45 +117,42 @@ public:
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
      return idx;
    }
    // Decompose a lexicographic index into per-dimension coordinates, with
    // dimension 0 running fastest. coor is resized to dims.size().
    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
      const int ndim = dims.size();
      coor.resize(ndim);
      int mu = 0;
      while ( mu < ndim ) {
        coor[mu] = index % dims[mu];  // coordinate along this dimension
        index   /= dims[mu];          // what is left for the slower dimensions
        ++mu;
      }
    }
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
-      CoorFromIndex(coor,Oindex,_rdimensions);
+      Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
    }
    // Build a lexicographic index from coordinates: index = sum_d coor[d]*w_d,
    // where w_0 = 1 and w_{d+1} = w_d * dims[d] (dimension 0 runs fastest).
    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
      const int ndim = dims.size();
      index = 0;
      int weight = 1;
      for(int mu=0; mu<ndim; ++mu){
        index  += weight*coor[mu];
        weight *= dims[mu];
      }
    }
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
    // SIMD lane index for a local coordinate: accumulates
    // _istride[d] * (lcoor[d] / _rdimensions[d]) over all dimensions.
    inline int iIndex(std::vector<int> &lcoor)
    {
        int lane = 0;
        int mu   = 0;
        while ( mu < _ndimension ) {
            lane += _istride[mu] * (lcoor[mu] / _rdimensions[mu]);
            mu++;
        }
        return lane;
    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
-      CoorFromIndex(coor,lane,_simd_layout);
+      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
    }
    // Returns 1 when the SIMD layout has more than one lane along `dimension`,
    // 0 otherwise.
    inline int PermuteDim(int dimension){
      const bool multi_lane = ( _simd_layout[dimension] > 1 );
      return multi_lane ? 1 : 0;
    }
    inline int PermuteType(int dimension){
      int permute_type=0;
      //
      // FIXME:
      //
      // Best way to encode this would be to present a mask 
      // for which simd dimensions are rotated, and the rotation
      // size. If there is only one simd dimension rotated, this is just 
      // a permute. 
      //
      // Cases: PermuteType == 1,2,4,8
      // Distance should be either 0,1,2..
      //
      if ( _simd_layout[dimension] > 2 ) { 
 	for(int d=0;d<_ndimension;d++){
 	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
 	}
 	permute_type = RotateBit; // How to specify distance; this is not just direction.
 	return permute_type;
      }
      for(int d=_ndimension-1;d>dimension;d--){
 	if (_simd_layout[d]>1 ) permute_type++;
      }
@ -135,12 +162,12 @@ public:
    // Array sizing queries
    ////////////////////////////////////////////////////////////////
-    inline int iSites(void) { return _isites; };
+    inline int iSites(void) const { return _isites; };
-    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
+    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
-    inline int oSites(void) { return _osites; };
+    inline int oSites(void) const { return _osites; };
-    inline int lSites(void) { return _isites*_osites; }; 
+    inline int lSites(void) const { return _isites*_osites; }; 
-    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
+    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
-    inline int Nd    (void) { return _ndimension;};
+    inline int Nd    (void) const { return _ndimension;};
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
@ -151,7 +178,10 @@ public:
    // Global addressing
    ////////////////////////////////////////////////////////////////
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
-      CoorFromIndex(gcoor,gidx,_gdimensions);
+      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
    }
    // Convert a local lexicographic index into a local coordinate, using the
    // local dimensions (_ldimensions) as the extents.
    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor)
    {
      Lexicographic::CoorFromIndex(lcoor, lidx, _ldimensions);
    }
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
      gidx=0;
@ -186,7 +216,7 @@ public:
      }
      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
+      o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
    }
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/cartesian/Cartesian_full.h
+++ b/lib/cartesian/Cartesian_full.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_full.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_FULL_H
 #define GRID_CARTESIAN_FULL_H
@ -12,10 +39,17 @@ class GridCartesian: public GridBase {
 public:
    // Checkerboard queries for this grid: every outer index reports 0 and no
    // dimension is reported as checkerboarded. The arguments are ignored.
    virtual int  CheckerBoardFromOindexTable (int Oindex) { return 0; }
    virtual int  CheckerBoardFromOindex (int Oindex)      { return 0; }
    virtual int  CheckerBoarded(int dim)                  { return 0; }
-    virtual int CheckerBoard(std::vector<int> site){
+    virtual int CheckerBoard(std::vector<int> &site){
        return 0;
    }
    virtual int CheckerBoardDestination(int cb,int shift,int dim){
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@ -1,19 +1,41 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cartesian/Cartesian_red_black.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CARTESIAN_RED_BLACK_H
 #define GRID_CARTESIAN_RED_BLACK_H
 namespace Grid {
-    static const int CbRed  =0;
+  static const int CbRed  =0;
-    static const int CbBlack=1;
+  static const int CbBlack=1;
-    static const int Even   =CbRed;
+  static const int Even   =CbRed;
-    static const int Odd    =CbBlack;
+  static const int Odd    =CbBlack;
    // Perhaps these are misplaced and 
    // should be in sparse matrix.
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
@ -21,12 +43,13 @@ class GridRedBlackCartesian : public GridBase
 public:
    std::vector<int> _checker_dim_mask;
    int              _checker_dim;
    std::vector<int> _checker_board;
    // Returns 1 only for the dimension stored in _checker_dim, 0 for any other.
    virtual int CheckerBoarded(int dim){
      return ( dim==_checker_dim ) ? 1 : 0;
    }
-    virtual int CheckerBoard(std::vector<int> site){
+    virtual int CheckerBoard(std::vector<int> &site){
      int linear=0;
      assert(site.size()==_ndimension);
      for(int d=0;d<_ndimension;d++){ 
@ -50,12 +73,20 @@ public:
      // or by looping over x,y,z and multiply rather than computing checkerboard.
      if ( (source_cb+ocb)&1 ) {
 	return (shift)/2;
      } else {
 	return (shift+1)/2;
      }
    }
    // Look up the checkerboard of an outer site in the _checker_board table.
    // Oindex is used as-is; no bounds check is performed here.
    virtual int  CheckerBoardFromOindexTable (int Oindex)
    {
      const int parity = _checker_board[Oindex];
      return parity;
    }
    // Compute the checkerboard of an outer site directly: expand the outer
    // index into a coordinate, then ask CheckerBoard() for that coordinate.
    virtual int  CheckerBoardFromOindex (int Oindex)
    {
      std::vector<int> outer_coor;
      oCoorFromOindex(outer_coor,Oindex);
      const int parity = CheckerBoard(outer_coor);
      return parity;
    }
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
      if(dim != _checker_dim) return shift;
@ -142,9 +173,15 @@ public:
 	// Use a reduced simd grid
 	_simd_layout[d] = simd_layout[d];
 	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
 	assert(_rdimensions[d]>0);
 	// all elements of a simd vector must have same checkerboard.
-	if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0); 
+	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
 	if ( _simd_layout[d]>1 ) {
 	  if ( checker_dim_mask[d] ) { 
 	    assert( (_rdimensions[d]&0x1) == 0 );
 	  }
 	}
 	_osites *= _rdimensions[d];
 	_isites *= _simd_layout[d];
@ -157,6 +194,8 @@ public:
 	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
 	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
 	}
      }
      ////////////////////////////////////////////////////////////////////////////////////////////
@ -178,6 +217,18 @@ public:
 	block = block*_rdimensions[d];
      }
      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      int rvol = 1;
      for(int d=0;d<_ndimension;d++){
 	rvol=rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
      for(int osite=0;osite<_osites;osite++){
 	_checker_board[osite] = CheckerBoardFromOindex (osite);
      }
    };
 protected:
    virtual int oIndex(std::vector<int> &coor)
@ -190,9 +241,21 @@ protected:
 	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
 	}
      }
-        return idx;
+      return idx;
    };
    // Inner index of a local coordinate on the red-black grid. Along
    // _checker_dim the coordinate is divided by 2*_rdimensions[d]; along every
    // other dimension it is divided by _rdimensions[d]. Each quotient is then
    // weighted by _istride[d].
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int lane = 0;
        for(int mu=0; mu<_ndimension; mu++) {
          const int extent = ( mu==_checker_dim ) ? (2*_rdimensions[mu])
                                                  : _rdimensions[mu];
          lane += _istride[mu]*(lcoor[mu]/extent);
        }
        return lane;
    }
 };
 }
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@ -0,0 +1,124 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_base.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout
 ///////////////////////////////////////////////////////////////
 // Base address of the comms buffer; assigned in ShmInitGeneric() for the
 // builds that compile that function.
 void *              CartesianCommunicator::ShmCommBuf;
 // Capacity of the comms buffer in bytes (default 128*1024*1024). The overflow
 // message in ShmBufferMalloc says it can be raised with the --shm <MB> flag.
 uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 // Bump allocator over the comms buffer: returns the current heap top and
 // advances it by `bytes`. No alignment rounding is applied (the align-up step
 // below is disabled). When the running total reaches MAX_MPI_SHM_BYTES a
 // diagnostic is printed and the assert fires.
 void *CartesianCommunicator::ShmBufferMalloc(size_t bytes)
 {
   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
   void *allocation = (void *)heap_top;
   heap_top   = heap_top   + bytes;
   heap_bytes = heap_bytes + bytes;

   const bool exhausted = !(heap_bytes < MAX_MPI_SHM_BYTES);
   if ( exhausted ) {
     std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
     std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
     assert(heap_bytes<MAX_MPI_SHM_BYTES);
   }
   return allocation;
 }
 // Release every allocation at once: rewind the heap top to the address
 // returned by ShmBufferSelf() and zero the byte counter.
 void CartesianCommunicator::ShmBufferFreeAll(void)
 {
   void *base = ShmBufferSelf();
   heap_top   = (size_t)base;
   heap_bytes = 0;
 }
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
 // Rank 0 is the boss rank.
 int CartesianCommunicator::IsBoss(void)
 {
   return _processor==0;
 }
 int CartesianCommunicator::BossRank(void)
 {
   return 0;
 }
 // Linear rank of this process (_processor).
 int CartesianCommunicator::ThisRank(void)
 {
   return _processor;
 }
 // Coordinate of this process (_processor_coor).
 const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
 {
   return _processor_coor;
 }
 // Per-dimension processor counts (_processors).
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
 {
   return _processors;
 }
 // Total number of processes (_Nprocessors).
 int CartesianCommunicator::ProcessorCount(void)
 {
   return _Nprocessors;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
 // Complex reductions are forwarded to the real-valued vector sums by viewing
 // each complex number as two consecutive reals, hence the factor of 2.
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
   float *reals = (float *)&c;
   GlobalSumVector(reals,2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
   float *reals = (float *)c;
   GlobalSumVector(reals,2*N);
 }
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
   double *reals = (double *)&c;
   GlobalSumVector(reals,2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
   double *reals = (double *)c;
   GlobalSumVector(reals,2*N);
 }
 #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
 // Compiled only when neither GRID_COMMS_MPI3 nor GRID_COMMS_MPI3L is defined:
 // the stencil variants forward unchanged to the ordinary begin/complete calls.
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                        void *xmit, int xmit_to_rank,
                                                        void *recv, int recv_from_rank,
                                                        int bytes)
 {
   SendToRecvFromBegin(list, xmit, xmit_to_rank, recv, recv_from_rank, bytes);
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
 {
   SendToRecvFromComplete(waitall);
 }
 // Intentionally empty in this configuration.
 void CartesianCommunicator::StencilBarrier(void)
 {
 }
 // Backing storage for the comms buffer in this configuration.
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;

 // This rank's own buffer is the static ShmCommBuf pointer.
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
   return ShmCommBuf;
 }
 // This configuration never hands out another rank's buffer: always NULL.
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
   return NULL;
 }
 // Likewise no pointer translation is available here: always NULL.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
   return NULL;
 }
 // Size the storage vector to MAX_MPI_SHM_BYTES and point ShmCommBuf at its
 // first element.
 void CartesianCommunicator::ShmInitGeneric(void)
 {
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf = (void *)&ShmBufStorageVector[0];
 }
 #endif
 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_base.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_COMMUNICATOR_BASE_H
 #define GRID_COMMUNICATOR_BASE_H
@ -7,118 +35,196 @@
 #ifdef GRID_COMMS_MPI
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_MPI3L
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
 namespace Grid {
 class CartesianCommunicator {
  public:    
  // 65536 ranks per node adequate for now
  // 128MB shared memory for comms enough for 48^4 local vol comms
  // Give external control (command line override?) of this
  static const int      MAXLOG2RANKSPERNODE = 16;            
  static uint64_t MAX_MPI_SHM_BYTES;
  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // coordinate of this rank in the processor grid
  unsigned long _ndimension;
-    int              _Nprocessors;     // How many in all
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
-    std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
+  static MPI_Comm communicator_world;
-    int              _processor;       // linear processor rank
+         MPI_Comm communicator;
-    std::vector<int> _processor_coor;  // linear processor coordinate
+  typedef MPI_Request CommsRequest_t;
    unsigned long _ndimension;
 #ifdef GRID_COMMS_MPI
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
 #else 
-    typedef int CommsRequest_t;
+  typedef int CommsRequest_t;
 #endif
-    // Constructor
+  ////////////////////////////////////////////////////////////////////
-    CartesianCommunicator(const std::vector<int> &pdimensions_in);
+  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
  // Longer term; drop this in favour of a master / slave model with 
  // cartesian communicator on a subset of ranks, slave ranks controlled
  // by group leader with data xfer via shared memory
  ////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_MPI3
-    // Wraps MPI_Cart routines
+  static int ShmRank;
-    void ShiftedRanks(int dim,int shift,int & source, int & dest);
+  static int ShmSize;
-    int  RankFromProcessorCoor(std::vector<int> &coor);
+  static int GroupRank;
-    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
+  static int GroupSize;
  static int WorldRank;
  static int WorldSize;
-    /////////////////////////////////
+  std::vector<int>  WorldDims;
-    // Grid information queries
+  std::vector<int>  GroupDims;
-    /////////////////////////////////
+  std::vector<int>  ShmDims;
    int                      IsBoss(void)            { return _processor==0; };
    int                      BossRank(void)          { return 0; };
    int                      ThisRank(void)          { return _processor; };
    const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
    const std::vector<int> & ProcessorGrid(void)     { return _processors; };
    int                      ProcessorCount(void)    { return _Nprocessors; };
-    ////////////////////////////////////////////////////////////
+  std::vector<int> GroupCoor;
-    // Reduction
+  std::vector<int> ShmCoor;
-    ////////////////////////////////////////////////////////////
+  std::vector<int> WorldCoor;
    void GlobalSum(RealF &);
    void GlobalSumVector(RealF *,int N);
-    void GlobalSum(RealD &);
+  static std::vector<int> GroupRanks; 
-    void GlobalSumVector(RealD *,int N);
+  static std::vector<int> MyGroup;
  static int ShmSetup;
  static MPI_Win ShmWindow; 
  static MPI_Comm ShmComm;
-    void GlobalSum(uint32_t &);
+  std::vector<int>  LexicographicToWorldRank;
-    void GlobalSum(ComplexF &c)
+  static std::vector<void *> ShmCommBufs;
    {
      GlobalSumVector((float *)&c,2);
    }
    void GlobalSumVector(ComplexF *c,int N)
    {
      GlobalSumVector((float *)c,2*N);
    }
-    void GlobalSum(ComplexD &c)
+#else 
-    {
+  static void ShmInitGeneric(void);
-      GlobalSumVector((double *)&c,2);
+  static commVector<uint8_t> ShmBufStorageVector;
-    }
+#endif 
    void GlobalSumVector(ComplexD *c,int N)
    {
      GlobalSumVector((double *)c,2*N);
    }
-    template<class obj> void GlobalSum(obj &o){
+  /////////////////////////////////
-      typedef typename obj::scalar_type scalar_type;
+  // Grid information and queries
-      int words = sizeof(obj)/sizeof(scalar_type);
+  // Implemented in Communicator_base.cc
-      scalar_type * ptr = (scalar_type *)& o;
+  /////////////////////////////////
-      GlobalSumVector(ptr,words);
+  static void * ShmCommBuf;
-    }
+  size_t heap_top;
-    ////////////////////////////////////////////////////////////
+  size_t heap_bytes;
    // Face exchange, buffer swap in translational invariant way
    ////////////////////////////////////////////////////////////
    void SendToRecvFrom(void *xmit,
 			int xmit_to_rank,
 			void *recv,
 			int recv_from_rank,
 			int bytes);
-    void RecvFrom(void *recv,
+  void *ShmBufferSelf(void);
-		  int recv_from_rank,
+  void *ShmBuffer(int rank);
-		  int bytes);
+  void *ShmBufferTranslate(int rank,void * local_p);
-    void SendTo(void *xmit,
+  void *ShmBufferMalloc(size_t bytes);
-		int xmit_to_rank,
+  void ShmBufferFreeAll(void) ;
 		int bytes);
-    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  ////////////////////////////////////////////////
-			 void *xmit,
+  // Must call in Grid startup
-			 int xmit_to_rank,
+  ////////////////////////////////////////////////
-			 void *recv,
+  static void Init(int *argc, char ***argv);
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-    ////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////
-    // Barrier
+  // Constructor of any given grid
-    ////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////
-    void Barrier(void);
+  CartesianCommunicator(const std::vector<int> &pdimensions_in);
-    ////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////
-    // Broadcast a buffer and composite larger
+  // Wraps MPI_Cart routines, or implements equivalent on other impls
-    ////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////
-    void Broadcast(int root,void* data, int bytes);
+  void ShiftedRanks(int dim,int shift,int & source, int & dest);
-    template<class obj> void Broadcast(int root,obj &data)
+  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
  int                      IsBoss(void)            ;
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
  void GlobalSumVector(ComplexD *c,int N);
  // Generic reduction: treats `o` as a flat array of obj::scalar_type, holding
  // sizeof(obj)/sizeof(scalar_type) words, and sums it via GlobalSumVector.
  template<class obj> void GlobalSum(obj &o)
  {
    typedef typename obj::scalar_type scalar_type;
    scalar_type *first = (scalar_type *)& o;
    int count = sizeof(obj)/sizeof(scalar_type);
    GlobalSumVector(first,count);
  }
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
 		      int recv_from_rank,
 		      int bytes);
  // Point-to-point transfer of `bytes` bytes between two ranks. In the MPI
  // implementation the rank equal to `sender` sends from `xmit`, the rank equal
  // to `receiver` receives into `recv`, and sender != receiver is asserted.
  // The rank parameters are named to match that implementation's order
  // (sender first, then receiver).
  void SendRecvPacket(void *xmit,
                      void *recv,
                      int sender,
                      int receiver,
                      int bytes);
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			   void *xmit,
 			   int xmit_to_rank,
 			   void *recv,
 			   int recv_from_rank,
 			   int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				  void *xmit,
 				  int xmit_to_rank,
 				  void *recv,
 				  int recv_from_rank,
 				  int bytes);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  void StencilBarrier(void);
  ////////////////////////////////////////////////////////////
  // Barrier
  ////////////////////////////////////////////////////////////
  void Barrier(void);
  ////////////////////////////////////////////////////////////
  // Broadcast a buffer and composite larger
  ////////////////////////////////////////////////////////////
  void Broadcast(int root,void* data, int bytes);
  template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };
    static void BroadcastWorld(int root,void* data, int bytes);
 }; 
 }
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -1,9 +1,51 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
 namespace Grid {
-  // Should error check all MPI calls.
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
@ -14,7 +56,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  _processors = processors;
  _processor_coor.resize(_ndimension);
-  MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
+  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -27,11 +69,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  assert(Size==_Nprocessors);
 }
 // In-place sum of a single 32-bit unsigned value over `communicator`.
 void CartesianCommunicator::GlobalSum(uint32_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // In-place sum of a single 64-bit unsigned value over `communicator`.
 void CartesianCommunicator::GlobalSum(uint64_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_SUM, communicator);
  assert(rc == 0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
@ -81,21 +126,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
 }
-void CartesianCommunicator::RecvFrom(void *recv,
+
-				     int from,
+void CartesianCommunicator::SendRecvPacket(void *xmit,
-				     int bytes) 
+					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
-  int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
+  assert(sender != receiver);
-  assert(ierr==0);
+  int tag = sender;
-}
+  if ( _processor == sender ) {
-void CartesianCommunicator::SendTo(void *xmit,
+    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-				   int dest,
+  }
-				   int bytes)
+  if ( _processor == receiver ) { 
-{
+    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  int rank = _processor; // used for tag; must know who it comes from
+  }
  int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
  assert(ierr==0);
 }
 // Basic Halo comms primitive
@ -123,7 +169,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
@ -142,14 +187,22 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 		     communicator);
  assert(ierr==0);
 }
-
+  ///////////////////////////////////////////////////////
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
 // Rank of the calling process within communicator_world.
 int CartesianCommunicator::RankWorld(void)
 {
  int world_rank;
  MPI_Comm_rank(communicator_world, &world_rank);
  return world_rank;
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
 		      root,
-		      MPI_COMM_WORLD);
+		      communicator_world);
  assert(ierr==0);
 }
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@ -0,0 +1,580 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi3.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 int CartesianCommunicator::ShmSetup = 0;
 int CartesianCommunicator::ShmRank;
 int CartesianCommunicator::ShmSize;
 int CartesianCommunicator::GroupRank;
 int CartesianCommunicator::GroupSize;
 int CartesianCommunicator::WorldRank;
 int CartesianCommunicator::WorldSize;
 MPI_Comm CartesianCommunicator::communicator_world;
 MPI_Comm CartesianCommunicator::ShmComm;
 MPI_Win  CartesianCommunicator::ShmWindow;
 std::vector<int> CartesianCommunicator::GroupRanks;  
 std::vector<int> CartesianCommunicator::MyGroup;
 std::vector<void *> CartesianCommunicator::ShmCommBufs;
 // Base address of the calling rank's own entry in ShmCommBufs.
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
  void *mine = ShmCommBufs[ShmRank];
  return mine;
 }
 // Shared buffer base of world rank `rank`, or NULL when GroupRanks marks
 // that rank as MPI_UNDEFINED (i.e. it is not in our shared-memory group).
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
  const int peer = GroupRanks[rank];
  if ( peer != MPI_UNDEFINED ) return ShmCommBufs[peer];
  return NULL;
 }
 // Re-base `local_p` (an address inside our own ShmCommBufs entry) onto the
 // entry belonging to world rank `rank`, keeping the same byte offset.
 // Returns NULL when GroupRanks marks `rank` as MPI_UNDEFINED.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
  const int peer = GroupRanks[rank];
  if ( peer == MPI_UNDEFINED ) return NULL;
  const uint64_t my_base   = (uint64_t)ShmCommBufs[ShmRank];
  const uint64_t peer_base = (uint64_t)ShmCommBufs[peer];
  const uint64_t displacement = (uint64_t)local_p - my_base;
  return (void *)(peer_base + displacement);
 }
 // One-time MPI-3 start-up for the shared-memory communicator.
 //  1. Initialise MPI if needed and duplicate MPI_COMM_WORLD into communicator_world.
 //  2. Split the world by MPI_COMM_TYPE_SHARED into ShmComm (ranks that can share memory).
 //  3. Work out which world ranks are in our group (GroupRanks / MyGroup), elect the
 //     lowest world rank of each group as leader and derive GroupRank from the leader list.
 //  4. Allocate the shared window and record every group member's base pointer in ShmCommBufs.
 //  5. Print a summary of the layout.
 // Sets ShmSetup to 1 at the end; asserts that it was 0 beforehand.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Comm_rank(communicator_world,&WorldRank);
  MPI_Comm_size(communicator_world,&WorldSize);
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  // NOTE(review): integer division - presumably every group is expected to hold the same
  // number of ranks; leaders_group below is sized from this value. Confirm.
  GroupSize = WorldSize/ShmSize;
  /////////////////////////////////////////////////////////////////////
  // find world ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (communicator_world, &WorldGroup); 
  MPI_Comm_group (ShmComm, &ShmGroup);
  std::vector<int> world_ranks(WorldSize); 
  GroupRanks.resize(WorldSize); 
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  // GroupRanks[w] becomes the ShmComm rank of world rank w, or MPI_UNDEFINED if w is not in our group
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); 
  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and nominate the leader
  ///////////////////////////////////////////////////////////////////
  int g=0;
  MyGroup.resize(ShmSize);
  for(int rank=0;rank<WorldSize;rank++){
    if(GroupRanks[rank]!=MPI_UNDEFINED){
      assert(g<ShmSize);
      MyGroup[g++] = rank;
    }
  }
  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  // leader = smallest world rank in the group
  int myleader = MyGroup[0];
  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(GroupSize,0);
  leaders_1hot [ myleader ] = 1;
  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  // after the sum, leaders_1hot[w] is non-zero exactly for world ranks that some group elected
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
  assert(ierr==0);
  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      leaders_group[group++] = l;
    }
  }
  ///////////////////////////////////////////////////////////////////
  // Identify the rank of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  GroupRank=-1;
  for(int g=0;g<GroupSize;g++){
    if (myleader == leaders_group[g]){
      GroupRank=g;
    }
  }
  assert(GroupRank!=-1);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared window for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // ShmCommBuf is not defined in this file; it receives this rank's base address in the window
  ShmCommBuf = 0;
  ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
  assert(ierr==0);
  // KNL hack -- force to numa-domain 1 in flat
 #if 0
  //#include <numaif.h>
  for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
    void *pages = (void *) ( page + ShmCommBuf );
    int status;
    int flags=MPOL_MF_MOVE_ALL;
    int nodes=1; // numa domain == MCDRAM
    unsigned long count=1;
    ierr= move_pages(0,count, &pages,&nodes,&status,flags);
    if (ierr && (page==0)) perror("numa relocate command failed");
  }
 #endif
  MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    // only the base pointer is kept; sz and dsp_unit are discarded
    MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Verbose for now
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  if (WorldRank == 0){
    std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
    std::cout<< WorldSize << " Ranks " ;
    std::cout<< GroupSize << " Nodes " ;
    std::cout<<  ShmSize  << " with ranks-per-node "<<std::endl;
    std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size ";
    std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
    for(int g=0;g<GroupSize;g++){
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
    }
    std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
    for(int g=0;g<ShmSize;g++){
      std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
      if(g!=ShmSize-1) std::cout<<",";
      else std::cout<<"}"<<std::endl;
    }
  }
  // Each group's ShmRank 0 prints its member list in turn; every rank joins the world
  // barrier once per (group, member) iteration.
  for(int g=0;g<GroupSize;g++){
    if ( (ShmRank == 0) && (GroupRank==g) )  std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
    for(int r=0;r<ShmSize;r++){
      if ( (ShmRank == 0) && (GroupRank==g) ) {
 	std::cout<<MyGroup[r];
 	if(r<ShmSize-1) std::cout<<",";
 	else std::cout<<"}"<<std::endl;
      }
      MPI_Barrier(communicator_world);
    }
  }
  assert(ShmSetup==0);  ShmSetup=1;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Want to implement some magic ... Group sub-cubes into those on same node
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // For a displacement `shift` along processor dimension `dim`, report the world
 // ranks of the +shift neighbour (source) and the -shift neighbour (dest),
 // with periodic wrap-around. |shift| must be smaller than the extent of `dim`.
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  const int extent = _processors[dim];
  const int here   = _processor_coor[dim];
  std::vector<int> neighbour = _processor_coor;
  assert(std::abs(shift) < extent);
  // +shift side: lexicographic index, then table lookup to a world rank
  neighbour[dim] = (here + shift + extent) % extent;
  Lexicographic::IndexFromCoor(neighbour,source,_processors);
  source = LexicographicToWorldRank[source];
  // -shift side
  neighbour[dim] = (here - shift + extent) % extent;
  Lexicographic::IndexFromCoor(neighbour,dest,_processors);
  dest = LexicographicToWorldRank[dest];
 }
 // World rank of the process sitting at processor-grid coordinate `coor`.
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int lexico;
  Lexicographic::IndexFromCoor(coor,lexico,_processors);
  return LexicographicToWorldRank[lexico];
 }
 // Inverse of RankFromProcessorCoor: fill `coor` with the processor-grid
 // coordinate of world rank `rank`.
 //
 // LexicographicToWorldRank maps lexicographic index -> world rank, so the
 // world rank has to be mapped back to its lexicographic index BEFORE it is
 // decoded into a coordinate. The previous code decoded the world rank
 // directly and then overwrote the by-value parameter, which had no effect.
 // Asserts if `rank` does not appear in the table.
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  int lexico = -1;
  const int entries = (int)LexicographicToWorldRank.size();
  for(int l=0;l<entries;l++){
    if ( LexicographicToWorldRank[l] == rank ) {
      lexico = l;
      break;
    }
  }
  assert(lexico != -1);
  Lexicographic::CoorFromIndex(coor,lexico,_processors);
 }
 // Build the processor-grid layout for a grid of `processors` ranks per dimension.
 // Uses communicator_world directly (no cartesian communicator is created here).
 // Steps: require ShmSize to be a power of two, spread that factor across the
 // dimensions as ShmDims, derive GroupDims, compute this rank's coordinate from
 // (GroupRank, ShmRank), and all-reduce a lexicographic-index -> WorldRank table.
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
  int ierr;
  communicator=communicator_world;
  _ndimension = processors.size();
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){  
    if ( (0x1<<i) == ShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int dim = 0;
  std::vector<int> WorldDims = processors;
  ShmDims.resize(_ndimension,1);
  GroupDims.resize(_ndimension);
  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
  // Hand out one factor of two per iteration, round-robin over the dimensions,
  // skipping any dimension where WorldDims/ShmDims is already <= 1.
  // NOTE(review): the skip loop has no exit if no dimension qualifies, and the
  // WorldSize==_Nprocessors check only happens further down - confirm callers
  // always pass a grid that can absorb ShmSize.
  for(int l2=0;l2<log2size;l2++){
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%_ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<_ndimension;d++){
    // integer division; NOTE(review): presumably WorldDims[d] is a multiple of ShmDims[d] - confirm
    GroupDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }
  assert(WorldSize==_Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  // 
  ////////////////////////////////////////////////////////////////
  LexicographicToWorldRank.resize(WorldSize,0);
  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
  for(int d=0;d<_ndimension;d++){
    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
  }
  _processor_coor = WorldCoor;
  int lexico;
  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
  // each rank fills in only its own slot; the all-reduce below merges the slots
  LexicographicToWorldRank[lexico]=WorldRank;
  // note: _processor holds the lexicographic index, not WorldRank
  _processor = lexico;
  ///////////////////////////////////////////////////////////////////
  // global sum Lexico to World mapping
  ///////////////////////////////////////////////////////////////////
  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);
 };
 // In-place sum of a single 32-bit unsigned value over `communicator`.
 void CartesianCommunicator::GlobalSum(uint32_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // In-place sum of a single 64-bit unsigned value over `communicator`.
 void CartesianCommunicator::GlobalSum(uint64_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // In-place sum of a single float over `communicator`.
 void CartesianCommunicator::GlobalSum(float &f)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // Element-wise in-place sum of the N floats at `f` over `communicator`.
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, f, N, MPI_FLOAT, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // In-place sum of a single double over `communicator`.
 void CartesianCommunicator::GlobalSum(double &d)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // Element-wise in-place sum of the N doubles at `d` over `communicator`.
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE, d, N, MPI_DOUBLE, MPI_SUM, communicator);
  assert(rc == 0);
 }
 // Basic Halo comms primitive
 // Blocking exchange: start the send to `dest` and the receive from `from`
 // (both `bytes` long) and wait for both before returning.
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
                                            int from,
                                            int bytes)
 {
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(pending);
 }
 // Point-to-point transfer of `bytes` bytes from rank `sender` to rank
 // `receiver`, tagged with the sender's rank. Every rank may call this: only
 // the rank whose _processor equals `sender` sends, and only the rank whose
 // _processor equals `receiver` receives; all others do nothing.
 // `sender` and `receiver` must differ.
 // The MPI return codes are now checked, in line with the other MPI calls in
 // this file.
 void CartesianCommunicator::SendRecvPacket(void *xmit,
                                            void *recv,
                                            int sender,
                                            int receiver,
                                            int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    int ierr = MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
    assert(ierr==0);
  }
  if ( _processor == receiver ) {
    int ierr = MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
    assert(ierr==0);
  }
 }
 // Basic Halo comms primitive
 // Start a non-blocking exchange: send `bytes` from `xmit` to `dest` and
 // receive `bytes` into `recv` from `from`. The MPI requests that were started
 // are appended to `list`; the caller finishes them with SendToRecvFromComplete.
 //
 // Only the #else branch is compiled. The "#if 0" branch is a disabled
 // variant that copies through the shared-memory buffers for on-node peers.
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
 #if 0
  // ---- disabled: shared-memory copy for on-node peers, MPI otherwise ----
  this->StencilBarrier();
  MPI_Request xrq;
  MPI_Request rrq;
  static int sequence;
  int ierr;
  int tag;
  int check;
  assert(dest != _processor);
  assert(from != _processor);
  int gdest = GroupRanks[dest];
  int gfrom = GroupRanks[from];
  int gme   = GroupRanks[_processor];
  sequence++;
  char *from_ptr = (char *)ShmCommBufs[ShmRank];
  int small = (bytes<MAX_MPI_SHM_BYTES);
  typedef uint64_t T;
  int words = bytes/sizeof(T);
  assert(((size_t)bytes &(sizeof(T)-1))==0);
  assert(gme == ShmRank);
  if ( small && (gdest !=MPI_UNDEFINED) ) {
    char *to_ptr   = (char *)ShmCommBufs[gdest];
    assert(gme != gdest);
    T *ip = (T *)xmit;
    T *op = (T *)to_ptr;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      op[w]=ip[w];
    }
    // trailer after the payload: sender id, then sequence number
    bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
    bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence));
  } else { 
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
  }
  this->StencilBarrier();
  if (small && (gfrom !=MPI_UNDEFINED) ) {
    T *ip = (T *)from_ptr;
    T *op = (T *)recv;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      op[w]=ip[w];
    }
    bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag));
    bcopy(&from_ptr[bytes+4],&check,sizeof(check));
    assert(check==sequence);
    assert(tag==from);
  } else { 
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    assert(ierr==0);
    list.push_back(rrq);
  }
  this->StencilBarrier();
 #else
  // ---- active path: plain MPI_Isend / MPI_Irecv ----
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor; // NOTE(review): not read anywhere in this branch
  int ierr;
  // send is tagged with our own _processor; the receive expects tag == from
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 #endif
 }
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						       void *xmit,
 						       int dest,
 						       void *recv,
 						       int from,
 						       int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  assert(dest != _processor);
  assert(from != _processor);
  int gdest = GroupRanks[dest];
  int gfrom = GroupRanks[from];
  int gme   = GroupRanks[_processor];
  assert(gme == ShmRank);
  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
  }
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    assert(ierr==0);
    list.push_back(rrq);
  }
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  SendToRecvFromComplete(list);
 }
 // Synchronise the ranks of ShmComm around the shared window:
 // MPI_Win_sync on ShmWindow, a barrier over ShmComm, then MPI_Win_sync again.
 // NOTE(review): presumably the sync / barrier / sync sequence is what makes
 // peers' stores into the shared segment visible - confirm against the MPI-3
 // shared-memory window rules before reordering.
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Win_sync (ShmWindow);   
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   
 }
 // Wait for every request in `list` to finish.
 //
 // An empty list returns immediately. StencilSendToRecvFromBegin queues
 // nothing when both peers are in our shared-memory group, and taking
 // &list[0] / &status[0] of an empty vector is undefined behaviour.
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  if ( nreq == 0 ) return; // nothing outstanding
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
 // Barrier over `communicator`.
 void CartesianCommunicator::Barrier(void)
 {
  const int rc = MPI_Barrier(communicator);
  assert(rc == 0);
 }
 // Broadcast `bytes` raw bytes at `data` from rank `root` over `communicator`.
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  const int rc = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator);
  assert(rc == 0);
 }
 // Broadcast `bytes` raw bytes at `data` from rank `root` over communicator_world.
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  const int rc = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator_world);
  assert(rc == 0);
 }
 }
--- a/lib/communicator/Communicator_mpi3_leader.cc
+++ b/lib/communicator/Communicator_mpi3_leader.cc
@ -0,0 +1,874 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi3_leader.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /// Workarounds:
 /// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
 ///    darwin dispatch semaphores don't seem to be multiprocess.
 ///
 /// ii) openmpi under --mca shmem posix works with two squadrons per node; 
 ///     openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
 ///     memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
 ///
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 #include <semaphore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 typedef sem_t *Grid_semaphore;
 // Named-semaphore helpers. All of them expect a char buffer called `sem_name`
 // in the enclosing scope (SEM_INIT / SEM_INIT_EXCL read it).
 //   SEM_INIT(S)      open an existing semaphore called sem_name into S
 //   SEM_INIT_EXCL(S) unlink any stale semaphore, then create it exclusively (initial value 0)
 //   SEM_POST(S)      sem_post, asserting success
 //   SEM_WAIT(S)      sem_wait, asserting success
 // The sem_post / sem_wait calls are made OUTSIDE assert(): an expression inside
 // assert() is not evaluated when NDEBUG is defined, which would silently drop
 // the semaphore operation itself. Each macro is a single braced statement so
 // it also behaves as one unit under an unbraced `if`.
 #define SEM_INIT(S)      { S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED ); }
 #define SEM_INIT_EXCL(S) { sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED ); }
 #define SEM_POST(S) { int sem_rc_ = sem_post(S); assert ( sem_rc_ == 0 ); (void)sem_rc_; }
 #define SEM_WAIT(S) { int sem_rc_ = sem_wait(S); assert ( sem_rc_ == 0 ); (void)sem_rc_; }
 #include <sys/mman.h>
 namespace Grid {
 // Command codes carried in Descriptor::command.
 enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
 // One queued communication command.
 // NOTE(review): field meanings below are inferred from the QueueCommand
 // signature (command, buf, bytes, hashtag, comm, u_rank) - confirm against
 // its definition.
 struct Descriptor {
  uint64_t buf;        // buffer location, stored as an integer rather than a pointer
  size_t bytes;        // transfer length
  int rank;            // peer rank
  int tag;             // message tag
  int command;         // one of the COMMAND_* values
  MPI_Request request; // request handle associated with this command
 };
 // Number of Descriptor slots in each SlaveState.
 const int pool = 48;
 // Descriptor queue state for one Slave. All members are volatile.
 // NOTE(review): presumably this lives in memory shared between processes and
 // head/start/tail index into Descrs as a circular buffer - confirm where it is
 // mapped and how the indices advance.
 class SlaveState {
 public:
  volatile int head;
  volatile int start;
  volatile int tail;
  volatile Descriptor Descrs[pool];
 };
 // Handle on one data-mover ("slave"): a pointer to its SlaveState plus the two
 // named semaphores used to hand work over and to signal completion.
 //   sem_head : posted by WakeUpDMA(),     waited on by WaitForCommand()
 //   sem_tail : posted by WakeUpCompute(), waited on by WaitForComplete()
 // Semaphore names are derived from universe_rank.
 class Slave {
 public:
  Grid_semaphore  sem_head;
  Grid_semaphore  sem_tail;
  SlaveState *state;
  MPI_Comm squadron;
  uint64_t     base;
  int universe_rank;
  int vertical_rank;
  char sem_name [NAME_MAX];  // scratch buffer read by the SEM_INIT* macros
  ////////////////////////////////////////////////////////////
  // Descriptor circular pointers
  ////////////////////////////////////////////////////////////
  Slave() {};
  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
  // Open both semaphores, which must already exist (sem_open without O_CREAT).
  void SemInit(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    //    printf("SEM_NAME: %s \n",sem_name);
    SEM_INIT(sem_head);
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    //    printf("SEM_NAME: %s \n",sem_name);
    SEM_INIT(sem_tail);
  }  
  // Unlink any stale semaphores of the same name, then create both exclusively.
  void SemInitExcl(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    //    printf("SEM_INIT_EXCL: %s \n",sem_name);
    SEM_INIT_EXCL(sem_head);
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    //    printf("SEM_INIT_EXCL: %s \n",sem_name);
    SEM_INIT_EXCL(sem_tail);
  }  
  void WakeUpDMA(void) { 
    SEM_POST(sem_head);
  };
  void WakeUpCompute(void) { 
    SEM_POST(sem_tail);
  };
  void WaitForCommand(void) { 
    SEM_WAIT(sem_head);
  };
  void WaitForComplete(void) { 
    SEM_WAIT(sem_tail);
  };
  // Never returns: block on sem_head, then process one Event(), forever.
  void EventLoop (void) {
    //    std::cout<< " Entering event loop "<<std::endl;
    while(1){
      WaitForCommand();
      //      std::cout << "Getting command "<<std::endl;
      Event();
    }
  }
  int Event (void) ;
  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
  // Queue a COMMAND_WAITALL, wake the data mover, block on sem_tail, then
  // assert that the queue has drained (tail == head).
  void WaitAll() {
    //    std::cout << "Queueing WAIT command  "<<std::endl;
    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
    //    std::cout << "Waking up DMA "<<std::endl;
    WakeUpDMA();
    //    std::cout << "Waiting from semaphore "<<std::endl;
    WaitForComplete();
    //    std::cout << "Checking FIFO is empty "<<std::endl;
    assert ( state->tail == state->head );
  }
 };
 ////////////////////////////////////////////////////////////////////////
 // One instance of a data mover.
 // Master and Slave must agree on location in shared memory
 ////////////////////////////////////////////////////////////////////////
 // Static-only front end through which the master process hands sends and
 // receives to the Slaves. Slave index 0 is never used by the routines below:
 // WaitAll starts at 1 and the multiplexed queues target s+1.
 class MPIoffloadEngine { 
 public:
  static std::vector<Slave> Slaves;
  static int ShmSetup;
  static int UniverseRank;
  static int UniverseSize;
  static MPI_Comm communicator_universe;
  static MPI_Comm communicator_cached;
  static MPI_Comm HorizontalComm;
  static int HorizontalRank;
  static int HorizontalSize;
  static MPI_Comm VerticalComm;
  static MPI_Win  VerticalWindow; 
  static int VerticalSize;
  static int VerticalRank;
  static std::vector<void *> VerticalShmBufs;
  static std::vector<std::vector<int> > UniverseRanks;
  static std::vector<int> UserCommunicatorToWorldRanks; 
  static MPI_Group WorldGroup, CachedGroup;
  static void CommunicatorInit (MPI_Comm &communicator_world,
 				MPI_Comm &ShmComm,
 				void * &ShmCommBuf);
  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
  /////////////////////////////////////////////////////////
  // routines for master proc must handle any communicator
  /////////////////////////////////////////////////////////
  // Queue a COMMAND_ISEND on Slaves[slave] and post its sem_head.
  static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
     //    std::cout<< " Queueing send  "<< bytes<< " slave "<< slave << " to comm "<<rank  <<std::endl;
    Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
    //    std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
    Slaves[slave].WakeUpDMA();
    //    std::cout << "Waking up DMA "<< slave<<std::endl;
  };
  // Queue a COMMAND_IRECV on Slaves[slave] and post its sem_head.
  static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl;
    Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
    //    std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
    Slaves[slave].WakeUpDMA();
    //    std::cout << "Waking up DMA "<< slave<<std::endl;
  };
  // Call Slave::WaitAll on Slaves[1] .. Slaves[VerticalSize-1], one after another.
  static void WaitAll() {
    for(int s=1;s<VerticalSize;s++) {
      //      std::cout << "Waiting for slave "<< s<<std::endl;
      Slaves[s].WaitAll();
    }
    //    std::cout << " Wait all Complete "<<std::endl;
  };
  // Split `nwork` items over `units` workers. Worker `me` gets `mywork` items
  // starting at offset `myoff`. Every worker gets nwork/units items and the
  // highest-numbered (nwork % units) workers get one extra; `backfill` is the
  // index of the first worker that receives an extra item. A worker with
  // me >= units gets nothing.
  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
    int basework = nwork/units;
    int backfill = units-(nwork%units);
    if ( me >= units ) { 
      mywork = myoff = 0;
    } else { 
      mywork = (nwork+me)/units;
      myoff  = basework * me;
      if ( me > backfill ) 
 	myoff+= (me-backfill);
    }
    return;
  };
  // Split one send of `bytes` bytes into VerticalSize-1 contiguous pieces
  // (sized by GetWork) and queue piece s on Slaves[s+1].
  static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    uint8_t * cbuf = (uint8_t *) buf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
    }
  };
  // Receive-side counterpart of QueueMultiplexedSend, using the same split.
  static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    uint8_t * cbuf = (uint8_t *) buf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
    }
  };
 };
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 std::vector<Slave> MPIoffloadEngine::Slaves;
 int MPIoffloadEngine::UniverseRank;
 int MPIoffloadEngine::UniverseSize;
 MPI_Comm  MPIoffloadEngine::communicator_universe;
 MPI_Comm  MPIoffloadEngine::communicator_cached;
 MPI_Group MPIoffloadEngine::WorldGroup;
 MPI_Group MPIoffloadEngine::CachedGroup;
 MPI_Comm MPIoffloadEngine::HorizontalComm;
 int      MPIoffloadEngine::HorizontalRank;
 int      MPIoffloadEngine::HorizontalSize;
 MPI_Comm MPIoffloadEngine::VerticalComm;
 int      MPIoffloadEngine::VerticalSize;
 int      MPIoffloadEngine::VerticalRank;
 MPI_Win  MPIoffloadEngine::VerticalWindow; 
 std::vector<void *>            MPIoffloadEngine::VerticalShmBufs;
 std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
 std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks; 
 int MPIoffloadEngine::ShmSetup = 0;
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // One-off initialisation of the MPI offload engine.
 //
 //  - duplicates MPI_COMM_WORLD into communicator_universe
 //  - splits the universe into "vertical" groups (ranks that can share memory) and
 //    "horizontal" groups (ranks with the same vertical rank)
 //  - builds the UniverseRanks[world rank][vertical rank] -> universe rank table
 //  - maps one shared buffer per vertical rank into VerticalShmBufs
 //  - vertical rank 0 initialises its Slaves table and returns to the caller; every other
 //    vertical rank enters Slave::EventLoop() and is not expected to return from this call
 //
 // Outputs:
 //   communicator_world  set to HorizontalComm
 //   ShmComm             set to VerticalComm
 //   ShmCommBuf          set to VerticalShmBufs[0]
 //
 // Must be called exactly once (asserts ShmSetup==0 on entry, sets ShmSetup=1 on completion).
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
                                          MPI_Comm &ShmComm,
                                          void * &ShmCommBuf)
 {
   assert(ShmSetup==0);
   //////////////////////////////////////////////////////////////////////
   // Universe is all nodes prior to squadron grouping
   //////////////////////////////////////////////////////////////////////
   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
   MPI_Comm_rank(communicator_universe,&UniverseRank);
   MPI_Comm_size(communicator_universe,&UniverseSize);
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory (Verticals)
   /////////////////////////////////////////////////////////////////////
 #undef MPI_SHARED_MEM_DEBUG
 #ifdef  MPI_SHARED_MEM_DEBUG
   MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
 #else 
   MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
 #endif
   MPI_Comm_rank(VerticalComm     ,&VerticalRank);
   MPI_Comm_size(VerticalComm     ,&VerticalSize);
   //////////////////////////////////////////////////////////////////////
   // Split into horizontal groups by rank in squadron
   //////////////////////////////////////////////////////////////////////
   MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
   MPI_Comm_rank(HorizontalComm,&HorizontalRank);
   MPI_Comm_size(HorizontalComm,&HorizontalSize);
   // Every vertical group must have the same size for the rank table below to be rectangular
   assert(HorizontalSize*VerticalSize==UniverseSize);
   ////////////////////////////////////////////////////////////////////////////////
   // What is my place in the world
   // Only the leader contributes a non-zero value, so the sum over the vertical
   // group hands every member its leader's horizontal rank.
   ////////////////////////////////////////////////////////////////////////////////
   int WorldRank=0;
   if(VerticalRank==0) WorldRank = HorizontalRank;
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
   assert(ierr==0);
   ////////////////////////////////////////////////////////////////////////////////
   // Where is the world in the universe?
   // Each rank fills in its own slot of a zeroed table; summing row by row over
   // the universe assembles the complete table on every rank.
   ////////////////////////////////////////////////////////////////////////////////
   UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
   UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
   for(int w=0;w<HorizontalSize;w++){
     ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
     assert(ierr==0);
   }
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared window for our group, pass back Shm info to CartesianCommunicator
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   VerticalShmBufs.resize(VerticalSize);
 #undef MPI_SHARED_MEM
 #ifdef MPI_SHARED_MEM
   ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
   ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
   assert(ierr==0);
   for(int r=0;r<VerticalSize;r++){
     MPI_Aint sz;
     int dsp_unit;
     MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
   }
 #else 
   char shm_name [NAME_MAX];
   MPI_Barrier(VerticalComm);
   if ( VerticalRank == 0 ) {
     // The leader (re)creates one POSIX shared memory object per vertical rank:
     // segment 0 is the MAX_MPI_SHM_BYTES comms buffer, segments r>0 hold a SlaveState.
     for(int r=0;r<VerticalSize;r++){
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
       if ( r>0 ) size = sizeof(SlaveState);
       snprintf(shm_name,sizeof(shm_name),"/Grid_mpi3_shm_%d_%d",WorldRank,r);
       shm_unlink(shm_name); // discard any stale object left by a previous run
       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
       if ( fd < 0 ) {
         perror("failed shm_open");
         assert(0);
       }
       if ( ftruncate(fd, size) != 0 ) {
         perror("failed ftruncate");
         assert(0);
       }
       VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( VerticalShmBufs[r] == MAP_FAILED ) { 
         perror("failed mmap");
         assert(0);
       }
       close(fd); // the mapping keeps the object alive; the descriptor is no longer needed
       // Stamp the segment so the other ranks can verify they attached to the right one
       uint64_t * check = (uint64_t *) VerticalShmBufs[r];
       check[0] = WorldRank;
       check[1] = r;
     }
   }
   MPI_Barrier(VerticalComm);
   if ( VerticalRank != 0 ) { 
     // Non-leaders attach to the segments created above. O_CREAT is deliberately not
     // used here: a missing object is an error, not something to create empty.
     for(int r=0;r<VerticalSize;r++){
       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
       if ( r>0 ) size = sizeof(SlaveState);
       snprintf(shm_name,sizeof(shm_name),"/Grid_mpi3_shm_%d_%d",WorldRank,r);
       int fd=shm_open(shm_name,O_RDWR,0600);
       if ( fd<0 ) {
         perror("failed shm_open");
         assert(0);
       }
       VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( VerticalShmBufs[r] == MAP_FAILED ) {
         perror("failed mmap");
         assert(0);
       }
       close(fd);
       uint64_t * check = (uint64_t *) VerticalShmBufs[r];
       assert(check[0]== (uint64_t)WorldRank);
       assert(check[1]== (uint64_t)r);
       std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
     }
   }
 #endif
   MPI_Barrier(VerticalComm);
   //////////////////////////////////////////////////////////////////////
   // Hand the caller the reduced "world": the horizontal communicator,
   // the shared-memory communicator and the buffer of vertical rank 0
   //////////////////////////////////////////////////////////////////////
   communicator_world = HorizontalComm;
   ShmComm            = VerticalComm;
   ShmCommBuf         = VerticalShmBufs[0];
   MPI_Comm_group (communicator_world, &WorldGroup); 
   ///////////////////////////////////////////////////////////
   // Start the slave data movers
   // The barrier pairs on the two branches must stay matched.
   ///////////////////////////////////////////////////////////
   if ( VerticalRank != 0 ) {
     Slave indentured;
     indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
     indentured.SemInitExcl();// init semaphore in shared memory
     MPI_Barrier(VerticalComm);
     MPI_Barrier(VerticalComm);
     indentured.EventLoop();
     assert(0); // the event loop is not expected to return
   } else {
     Slaves.resize(VerticalSize);
     for(int i=1;i<VerticalSize;i++){
       Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
     }
     MPI_Barrier(VerticalComm);
     for(int i=1;i<VerticalSize;i++){
       Slaves[i].SemInit();// init semaphore in shared memory
     }
     MPI_Barrier(VerticalComm);
   }
   ///////////////////////////////////////////////////////////
   // Verbose for now
   ///////////////////////////////////////////////////////////
   ShmSetup=1;
   if (UniverseRank == 0){
     std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
     std::cout<<UniverseSize   << " Ranks " ;
     std::cout<<HorizontalSize << " Nodes " ;
     std::cout<<VerticalSize   << " with ranks-per-node "<<std::endl;
     std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
     std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
     for(int g=0;g<HorizontalSize;g++){
       std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
     }
     for(int g=0;g<HorizontalSize;g++){
       std::cout<<GridLogMessage<<" { ";
       for(int s=0;s<VerticalSize;s++){
         std::cout<< UniverseRanks[g][s];
         if ( s<VerticalSize-1 ) {
           std::cout<<",";
         }
       }
       std::cout<<" } "<<std::endl;
     }
   }
 };
  ///////////////////////////////////////////////////////////////////////////////////////////////
  // Map the communicator into communicator_world, and find the neighbour.
  // Cache the mappings; cache size is 1.
  ///////////////////////////////////////////////////////////////////////////////////////////////
 void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
   // Translate `rank` (a rank in `comm`) into a rank of the horizontal "world" communicator.
   // The translation table of the last user communicator seen is kept, so only a change
   // of communicator triggers a rebuild.
   if ( comm == HorizontalComm ) {
     // Already a world rank
     comm_world_peer = rank;
   } else {
     const bool cache_hit = ( comm == communicator_cached );
     if ( !cache_hit ) {
       // Rebuild the table: translate every rank 0..size-1 of `comm` into WorldGroup
       int nranks;
       MPI_Comm_size(comm,&nranks);
       UserCommunicatorToWorldRanks.resize(nranks);
       std::vector<int> identity(nranks); 
       for(int i=0;i<nranks;i++) {
         identity[i]=i;
       }
       communicator_cached=comm;
       MPI_Comm_group(communicator_cached, &CachedGroup);
       MPI_Group_translate_ranks(CachedGroup,nranks,&identity[0],WorldGroup, &UserCommunicatorToWorldRanks[0]); 
     }
     comm_world_peer = UserCommunicatorToWorldRanks[rank];
     if ( !cache_hit ) {
       // Only a freshly built table is validated
       assert(comm_world_peer != MPI_UNDEFINED);
     }
   }
   // The tag must fit in 16 bits
   assert( (tag & (~0xFFFFL)) ==0); 
   // 16-bit fold of the communicator handle; computed but currently not mixed into the tag
   uint64_t handle_bits = (uint64_t)comm;
   int comm_hash = ((handle_bits>>0 )&0xFFFF)^((handle_bits>>16)&0xFFFF)
                 ^ ((handle_bits>>32)&0xFFFF)^((handle_bits>>48)&0xFFFF);
   //  hashtag = (comm_hash<<15) | tag;      
   hashtag = tag;      
 };
 void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
 {
   // Remember where this slave lives and which shared state block it services
   state         = _state;
   squadron      = _squadron;
   universe_rank = _universe_rank;
   vertical_rank = _vertical_rank;

   // Reset the command FIFO indices held in the shared state block
   state->start = 0;
   state->tail  = 0;
   state->head  = 0;

   // Descriptor buffer offsets are stored relative to the start of shared buffer 0
   base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];

   int squadron_rank;
   MPI_Comm_rank(_squadron,&squadron_rank);
 }
 // Next index in the circular descriptor FIFO; `pool` must be visible at the point of use.
 // The argument is parenthesised so that low-precedence expressions expand correctly.
 #define PERI_PLUS(A) ( ((A)+1)%pool )
 //////////////////////////////////////////////////////////////////////////////
 // Service at most one queued command from the descriptor FIFO.
 // Returns 1 if a command at state->start was processed, 0 if the FIFO had
 // nothing pending (start == head).
 //////////////////////////////////////////////////////////////////////////////
 int Slave::Event (void) {
   int ierr;
   ////////////////////////////////////////////////////
   // Try to advance the start pointers
   ////////////////////////////////////////////////////
   int s=state->start;
   if ( s == state->head ) {
     return 0; // nothing queued
   }
   switch ( state->Descrs[s].command ) {
   case COMMAND_ISEND:
     // Descriptor holds an offset; add `base` to recover the local address
     ierr = MPI_Isend((void *)(state->Descrs[s].buf+base), 
                      state->Descrs[s].bytes, 
                      MPI_CHAR,
                      state->Descrs[s].rank,
                      state->Descrs[s].tag,
                      MPIoffloadEngine::communicator_universe,
                      (MPI_Request *)&state->Descrs[s].request);
     assert(ierr==0);
     state->start = PERI_PLUS(s);
     return 1;
   case COMMAND_IRECV:
     ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base), 
                    state->Descrs[s].bytes, 
                    MPI_CHAR,
                    state->Descrs[s].rank,
                    state->Descrs[s].tag,
                    MPIoffloadEngine::communicator_universe,
                    (MPI_Request *)&state->Descrs[s].request);
     assert(ierr==0);
     state->start = PERI_PLUS(s);
     return 1;
   case COMMAND_WAITALL:
     // Complete every request issued since the previous WAITALL (tail .. s-1)
     for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
       ierr = MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
       assert(ierr==0);
     }
     // Retire the WAITALL descriptor itself and release the drained slots
     s=PERI_PLUS(s);
     state->start = s;
     state->tail  = s;
     WakeUpCompute();
     return 1;
   default:
     assert(0); // unknown command
     break;
   }
   return 0;
 }
  //////////////////////////////////////////////////////////////////////////////
  // External interaction with the queue
  //////////////////////////////////////////////////////////////////////////////
 uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank) 
 {
   // Append one command descriptor to this slave's FIFO. Always returns 0.
   const int slot      = state->head;
   const int next_slot = PERI_PLUS(slot);

   // Resolve the peer: communicator rank -> world rank (the tag is passed through as hashtag)
   int peer_world;
   int wire_tag;
   MPIoffloadEngine::MapCommRankToWorldRank(wire_tag,peer_world,tag,comm,commrank);

   // Fill in the descriptor; the buffer is recorded as an offset from `base`,
   // the peer as the universe rank sharing our vertical rank in the peer's row
   state->Descrs[slot].buf    = (uint64_t)buf - base;
   state->Descrs[slot].bytes  = bytes;
   state->Descrs[slot].rank   = MPIoffloadEngine::UniverseRanks[peer_world][vertical_rank];
   state->Descrs[slot].tag    = wire_tag;
   state->Descrs[slot].command= command;

   // Spin while the FIFO is full, i.e. while advancing head would collide with tail
   while( state->tail==next_slot );

   // Msync on weak order architectures
   // Publish the descriptor by advancing head
   state->head = next_slot;
   return 0;
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 void CartesianCommunicator::Init(int *argc, char ***argv) 
 {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  communicator_world = MPI_COMM_WORLD;
  MPI_Comm ShmComm;
  MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
 }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   // Thin wrapper over MPI_Cart_shift on the Cartesian communicator
   const int status = MPI_Cart_shift(communicator,dim,shift,&source,&dest);
   assert(status==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
   // Ask MPI for the rank sitting at processor-grid coordinate `coor`
   int found;
   const int status = MPI_Cart_rank  (communicator, &coor[0], &found);
   assert(status==0);
   return found;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
   // Size the output first, then let MPI fill in the grid coordinate of `rank`
   coor.resize(_ndimension);
   const int status = MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
   assert(status==0);
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
   // Record the requested processor grid and its total volume
   _ndimension  = processors.size();
   _processors  = processors;
   _Nprocessors = 1;
   for(int d=0;d<_ndimension;d++){
     _Nprocessors*=_processors[d];
   }

   // The grid must use every rank of communicator_world exactly
   int world_size; 
   MPI_Comm_size(communicator_world,&world_size);
   assert(world_size==_Nprocessors);

   // Build a Cartesian communicator, periodic in every dimension, reordering allowed,
   // then look up our own rank and coordinate in it
   std::vector<int> wrap_around(_ndimension,1);
   _processor_coor.resize(_ndimension);
   MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&wrap_around[0],1,&communicator);
   MPI_Comm_rank  (communicator,&_processor);
   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
 };
 // In-place MPI_SUM reductions over the Cartesian communicator, one overload per
 // element type; the *Vector forms reduce N contiguous elements.
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(status==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(status==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
   const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(status==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   assert(status==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   assert(status==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(status==0);
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
                                            int from,
                                            int bytes)
 {
   // Blocking exchange: post the send/receive pair, then wait for both
   std::vector<CommsRequest_t> pending;
   SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
   SendToRecvFromComplete(pending);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 }
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                        void *xmit,
                                                        int dest,
                                                        void *recv,
                                                        int from,
                                                        int bytes)
 {
   // Queue a send/receive pair on the offload engine; `list` is not used by this backend.
   const uint64_t shm_lo  = (uint64_t) ShmCommBuf;
   const uint64_t xmit_lo = (uint64_t) xmit;
   const uint64_t recv_lo = (uint64_t) recv;
   // Both buffers must lie wholly inside the shared memory region
   assert( (xmit_lo >= shm_lo) && (xmit_lo+bytes <= shm_lo+MAX_MPI_SHM_BYTES) );
   assert( (recv_lo >= shm_lo) && (recv_lo+bytes <= shm_lo+MAX_MPI_SHM_BYTES) );
   // Self-communication is not handled on this path
   assert(from!=_processor);
   assert(dest!=_processor);
   // Tags follow the sender's rank: ours for the send, the peer's for the receive
   MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
   MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
 }
 // Completes the transfers queued by StencilSendToRecvFromBegin by delegating to
 // MPIoffloadEngine::WaitAll(); the request list is not consulted.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  MPIoffloadEngine::WaitAll();
 }
 // Intentionally empty in this backend.
 void CartesianCommunicator::StencilBarrier(void)
 {
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
 void CartesianCommunicator::Barrier(void)
 {
   // Synchronise all ranks of the Cartesian communicator
   const int status = MPI_Barrier(communicator);
   assert(status==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
   // Broadcast `bytes` raw bytes from `root` over the Cartesian communicator
   const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
   assert(status==0);
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   // Same as Broadcast, but over communicator_world rather than the Cartesian communicator
   const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
   assert(status==0);
 }
 // Shared-memory buffer accessors. Only the local buffer is exposed by this backend;
 // the per-rank lookups always report "no buffer".
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
   return ShmCommBuf;
 }
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
   return nullptr;
 }
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
   return nullptr;
 }
 };
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -1,6 +1,42 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_none.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Single-process backend: the command line is not inspected; only the generic
 // shared-memory setup is performed.
 void CartesianCommunicator::Init(int *argc, char *** argv)
 {
   ShmInitGeneric();
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _processors = processors;
@ -20,17 +56,14 @@ void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}
-void CartesianCommunicator::RecvFrom(void *recv,
+void CartesianCommunicator::SendRecvPacket(void *xmit,
-				     int recv_from_rank,
+					   void *recv,
-				     int bytes) 
+					   int xmit_to_rank,
-{
+					   int recv_from_rank,
-  assert(0);
+					   int bytes)
 }
 void CartesianCommunicator::SendTo(void *xmit,
 				   int xmit_to_rank,
 				   int bytes)
 {
  assert(0);
 }
@ -59,30 +92,17 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  assert(0);
 }
-void CartesianCommunicator::Barrier(void)
+int  CartesianCommunicator::RankWorld(void){return 0;}
-{
+void CartesianCommunicator::Barrier(void){}
-}
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
-
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-{
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
 }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  source =0;
  dest=0;
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  return 0;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
 }
 }
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -0,0 +1,337 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_shmem.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 #include <mpp/shmem.h>
 namespace Grid {
  // Should error check all MPI calls.
 // SHMEM_VET(addr) is the production form and expands to nothing.
 // SHMEM_VET_DEBUG(addr) reports (with a backtrace) any address that
 // shmem_addr_accessible() rejects for this PE. The address is printed with %p:
 // passing a pointer to a %lx conversion is a format/argument mismatch.
 #define SHMEM_VET(addr) 
 #define SHMEM_VET_DEBUG(addr) {				\
   if ( ! shmem_addr_accessible(addr,_processor) ) {\
     std::fprintf(stderr,"%d Inaccessible shmem address %p %s %s\n",_processor,(void *)(addr),__FUNCTION__,#addr); \
     BACKTRACEFILE();		   \
   }\
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Pair of sequence counters kept per peer PE and used by SendRecvPacket:
 // seq_local is advanced by the owning PE, seq_remote is written by the peer
 // through shmem_putmem.
 typedef struct HandShake_t { 
  uint64_t seq_local;
  uint64_t seq_remote;
 } HandShake;
 std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
  array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
  ret.fill(SHMEM_SYNC_VALUE);
  return ret;
 }
 static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
 static Vector< HandShake > XConnections;
 static Vector< HandShake > RConnections;
 void CartesianCommunicator::Init(int *argc, char ***argv) {
   // Bring up SHMEM, then give every PE a zeroed handshake slot for each peer
   shmem_init();
   const int npes = shmem_n_pes();
   XConnections.resize(npes);
   RConnections.resize(npes);
   for(int peer=0; peer<npes; peer++){
     XConnections[peer].seq_local = 0;
     XConnections[peer].seq_remote= 0;
     RConnections[peer].seq_local = 0;
     RConnections[peer].seq_remote= 0;
   }
   // All PEs finish resetting their counters before anyone proceeds
   shmem_barrier_all();
   ShmInitGeneric();
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
   // Record the processor grid and compute its volume
   _ndimension  = processors.size();
   _processors  = processors;
   _Nprocessors = 1;
   for(int d=0;d<_ndimension;d++){
     _Nprocessors*=_processors[d];
   }

   // Our rank is the SHMEM PE number; its grid coordinate follows lexicographically
   _processor = shmem_my_pe();
   _processor_coor.resize(_ndimension);
   Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);

   // The grid must account for every PE
   const int npes = shmem_n_pes(); 
   assert(npes==_Nprocessors);
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  static long long source ;
  static long long dest   ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  //  int nreduce=1;
  //  int pestart=0;
  //  int logStride=0;
  source = u;
  dest   = 0;
  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
  shmem_barrier_all(); // necessary?
  u = dest;
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
  static long long source ;
  static long long dest   ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  //  int nreduce=1;
  //  int pestart=0;
  //  int logStride=0;
  source = u;
  dest   = 0;
  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
  shmem_barrier_all(); // necessary?
  u = dest;
 }
 void CartesianCommunicator::GlobalSum(float &f){
  static float source ;
  static float dest   ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  source = f;
  dest   =0.0;
  shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
  f = dest;
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  static float source ;
  static float dest   = 0 ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  if ( shmem_addr_accessible(f,_processor)  ){
    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
    return;
  }
  for(int i=0;i<N;i++){
    dest   =0.0;
    source = f[i];
    shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
    f[i] = dest;
  }
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  static double source;
  static double dest  ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  source = d;
  dest   = 0;
  shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
  d = dest;
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  static double source ;
  static double dest   ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  if ( shmem_addr_accessible(d,_processor)  ){
    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
    return;
  }
  for(int i=0;i<N;i++){
    source = d[i];
    dest   =0.0;
    shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
    d[i] = dest;
  }
 }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   // Neighbours along `dim` with periodic wrap: `source` sits at +shift, `dest` at -shift
   const int extent = _processors[dim];
   assert(std::abs(shift) <extent);

   std::vector<int> neighbour = _processor_coor;

   neighbour[dim] = (_processor_coor[dim] + shift + extent)%extent;
   Lexicographic::IndexFromCoor(neighbour,source,_processors);

   neighbour[dim] = (_processor_coor[dim] - shift + extent)%extent;
   Lexicographic::IndexFromCoor(neighbour,dest,_processors);
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
   // Rank is the lexicographic index of `coor` within the processor grid
   int index;
   Lexicographic::IndexFromCoor(coor,index,_processors);
   return index;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
   // Inverse of RankFromProcessorCoor: lexicographic index -> grid coordinate
   Lexicographic::CoorFromIndex(coor,rank,_processors);
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
                                            int from,
                                            int bytes)
 {
   // Blocking exchange built from the Begin/Complete pair
   SHMEM_VET(xmit);
   SHMEM_VET(recv);
   std::vector<CommsRequest_t> pending;
   SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
   SendToRecvFromComplete(pending);
 }
 // One-way transfer of `bytes` from PE `sender` to PE `receiver` using a sequence-number
 // handshake; PEs that are neither sender nor receiver do nothing.
 //   receiver: bumps its local receive count and puts it into the sender's
 //             XConnections[receiver].seq_remote ("receive posted"), then spins until
 //             its own seq_remote catches up with seq_local.
 //   sender:   spins until a receive has been posted, bumps its send count, puts the
 //             payload into `recv` on the receiver, fences, then puts the new count
 //             into the receiver's RConnections[sender].seq_remote ("data delivered").
 // `recv` and `xmit` must be distinct (asserted).
 // NOTE(review): assumes `recv` and the Connections storage are remotely accessible
 // (symmetric) addresses - confirm against the Vector allocator.
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  // Source buffer for the sequence-number puts
  static uint64_t seq;
  assert(recv!=xmit);
  // Accessed through volatile pointers because the peer updates seq_remote remotely
  volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
  volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
  if ( _processor == sender ) {
    // Check he has posted a receive
    while(SendSeq->seq_remote == SendSeq->seq_local);
    // Advance our send count
    seq = ++(SendSeq->seq_local);
    // Send this packet 
    SHMEM_VET(recv);
    shmem_putmem(recv,xmit,bytes,receiver);
    // Order the payload ahead of the notification that follows
    shmem_fence();
    //Notify him we're done
    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
    shmem_fence();
  }
  if ( _processor == receiver ) {
    // Post a receive
    seq = ++(RecvSeq->seq_local);
    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
    // Now wait until he has advanced our reception counter
    while(RecvSeq->seq_remote != RecvSeq->seq_local);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,
                                                 void *recv,
                                                 int from,
                                                 int bytes)
 {
   // One-sided implementation: our `xmit` data is put into the `recv` address on PE `dest`.
   // Neither `from` nor `list` is used; completion is a global barrier in
   // SendToRecvFromComplete.
   SHMEM_VET(xmit);
   SHMEM_VET(recv);
   //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
   shmem_putmem(recv,xmit,bytes,dest);
 }
 // Completes the puts issued by SendToRecvFromBegin with a barrier over all PEs;
 // the request list is not consulted.
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
  shmem_barrier_all();// He's done too
 }
 // Barrier over all PEs.
 void CartesianCommunicator::Barrier(void)
 {
  shmem_barrier_all();
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
  int words = bytes/4;
  if ( shmem_addr_accessible(data,_processor)  ){
    shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
    return;
  }
  for(int w=0;w<words;w++){
    word = array[w];
    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
    if ( shmem_my_pe() != root ) {
      array[w] = word;
    }
    shmem_barrier_all();
  }
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
  int words = bytes/4;
  for(int w=0;w<words;w++){
    word = array[w];
    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
    if ( shmem_my_pe() != root ) {
      array[w]= word;
    }
    shmem_barrier_all();
  }
 }
 }
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cshift/Cshift_common.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_COMMON_H_
 #define _GRID_CSHIFT_COMMON_H_
@ -8,7 +36,7 @@ class SimpleCompressor {
 public:
  void Point(int) {};
-  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
+  vobj operator() (const vobj &arg) {
    return arg;
  }
 };
@ -17,7 +45,7 @@ public:
 // Gather for when there is no need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@ -30,26 +58,32 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
  if ( cbmask == 0x3 ) { 
 PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-	int o  = n*rhs._grid->_slice_stride[dimension];
+	int o  = n*stride;
-	int bo = n*rhs._grid->_slice_block[dimension];
+	int bo = n*e2;
-	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
      }
    }
  } else { 
     int bo=0;
     std::vector<std::pair<int,int> > table;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
-	 int o  = n*rhs._grid->_slice_stride[dimension];
+	 int o  = n*stride;
-	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	 int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
 	 if ( ocb &cbmask ) {
-	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	   table.push_back(std::pair<int,int> (bo++,o+b));
 	 }
       }
     }
 PARALLEL_FOR_LOOP     
     for(int i=0;i<table.size();i++){
       buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
     }
  }
 }
@ -70,16 +104,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  
+  int n1=rhs._grid->_slice_stride[dimension];
  int n2=rhs._grid->_slice_block[dimension];
  if ( cbmask ==0x3){
 PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-	int o=n*rhs._grid->_slice_stride[dimension];
+	int o      =   n*n1;
-	int offset = b+n*rhs._grid->_slice_block[dimension];
+	int offset = b+n*n2;
 	cobj temp =compress(rhs._odata[so+o+b]);
 	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
      }
@ -87,6 +122,7 @@ PARALLEL_NESTED_LOOP2
  } else { 
    assert(0); //Fixme think this is buggy
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o=n*rhs._grid->_slice_stride[dimension];
@ -94,7 +130,7 @@ PARALLEL_NESTED_LOOP2
 	int offset = b+n*rhs._grid->_slice_block[dimension];
 	if ( ocb & cbmask ) {
-	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	  cobj temp =compress(rhs._odata[so+o+b]);
 	  extract<cobj>(temp,pointers,offset);
 	}
      }
@ -105,7 +141,7 @@ PARALLEL_NESTED_LOOP2
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
+template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  SimpleCompressor<vobj> dontcompress;
  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@ -123,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@ -216,13 +252,13 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
-
+  int stride = rhs._grid->_slice_stride[dimension];
  if(cbmask == 0x3 ){
 PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-        int o =n*rhs._grid->_slice_stride[dimension]+b;
+        int o =n*stride+b;
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
      }
@ -232,7 +268,7 @@ PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-        int o =n*rhs._grid->_slice_stride[dimension]+b;
+        int o =n*stride+b;
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
@ -258,11 +294,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block [dimension];
  int stride = rhs._grid->_slice_stride[dimension];
 PARALLEL_NESTED_LOOP2
  for(int n=0;n<e1;n++){
  for(int b=0;b<e2;b++){
-      int o  =n*rhs._grid->_slice_stride[dimension];
+      int o  =n*stride;
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
      if ( ocb&cbmask ) {
 	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
@ -296,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int gd = grid->_gdimensions[dimension];
  int ly = grid->_simd_layout[dimension];
  // Map to always positive shift modulo global full dimension.
  shift = (shift+fd)%fd;
@ -304,6 +342,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
  // the permute type
  int permute_dim =grid->PermuteDim(dimension);
  int permute_type=grid->PermuteType(dimension);
  int permute_type_dist;
  for(int x=0;x<rd;x++){       
@ -315,15 +354,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
    int sx     = (x+sshift)%rd;
    // FIXME : This must change where we have a
    // rotate slice.
    // How this works (should have been documented when first written):
    //  wrap is sshift / rd, i.e. how many whole times the shift wraps around rd.
    //  num  is sshift mod rd.
    // 
    int permute_slice=0;
    if(permute_dim){
      int wrap = sshift/rd;
      int  num = sshift%rd;
      if ( x< rd-num ) permute_slice=wrap;
-      else permute_slice = 1-wrap;
+      else permute_slice = (wrap+1)%ly;
      if ( (ly>2) && (permute_slice) ) {
 	assert(permute_type & RotateBit);
 	permute_type_dist = permute_type|permute_slice;
      } else {
 	permute_type_dist = permute_type;
      }
    }
-    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
+    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cshift/Cshift_mpi.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_
@ -91,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
+  commVector<vobj> send_buf(buffer_size);
-  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
+  commVector<vobj> recv_buf(buffer_size);
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -163,11 +191,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  int words = sizeof(vobj)/sizeof(vector_type);
-  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
-  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
  int bytes = buffer_size*sizeof(scalar_object);
-  std::vector<scalar_object *>  pointers(Nsimd);  // 
+  std::vector<scalar_object *>  pointers(Nsimd); // 
  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
  ///////////////////////////////////////////
--- a/lib/cshift/Cshift_none.h
+++ b/lib/cshift/Cshift_none.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/cshift/Cshift_none.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_CSHIFT_NONE_H_
 #define _GRID_CSHIFT_NONE_H_
 namespace Grid {
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@ -1,44 +1,74 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/lattice/Lattice_ET.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H
 #include <iostream>
 #include <vector>
 #include <tuple>
 #include <typeinfo>
 #include <vector>
 namespace Grid {
-  ////////////////////////////////////////////////////
+////////////////////////////////////////////////////
-  // Predicated where support
+// Predicated where support
-  ////////////////////////////////////////////////////
+////////////////////////////////////////////////////
-  template<class iobj,class vobj,class robj>
+template <class iobj, class vobj, class robj>
-    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
+inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
                            const robj &iffalse) {
  typename std::remove_const<vobj>::type ret;
-    typename std::remove_const<vobj>::type ret;
+  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
-    typedef typename vobj::scalar_object scalar_object;
+  const int Nsimd = vobj::vector_type::Nsimd();
-    typedef typename vobj::scalar_type scalar_type;
+  const int words = sizeof(vobj) / sizeof(vector_type);
    typedef typename vobj::vector_type vector_type;
-    const int Nsimd = vobj::vector_type::Nsimd();
+  std::vector<Integer> mask(Nsimd);
-    const int words = sizeof(vobj)/sizeof(vector_type);
+  std::vector<scalar_object> truevals(Nsimd);
  std::vector<scalar_object> falsevals(Nsimd);
-    std::vector<Integer> mask(Nsimd);
+  extract(iftrue, truevals);
-    std::vector<scalar_object> truevals (Nsimd);
+  extract(iffalse, falsevals);
-    std::vector<scalar_object> falsevals(Nsimd);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);
-    extract(iftrue   ,truevals);
+  for (int s = 0; s < Nsimd; s++) {
-    extract(iffalse  ,falsevals);
+    if (mask[s]) falsevals[s] = truevals[s];
    extract<vInteger,Integer>(TensorRemove(predicate),mask);
    for(int s=0;s<Nsimd;s++){
      if (mask[s]) falsevals[s]=truevals[s];
    }
    merge(ret,falsevals);
    return ret;
  }
  merge(ret, falsevals);
  return ret;
 }
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@ -46,303 +76,353 @@ namespace Grid {
 // from tuple is hideous; C++14 introduces std::make_index_sequence for this
 ////////////////////////////////////////////
 // leaf eval of lattice ; should enable if protect using traits
-//leaf eval of lattice ; should enable if protect using traits
+template <typename T>
 using is_lattice = std::is_base_of<LatticeBase, T>;
-template <typename T> using is_lattice      = std::is_base_of<LatticeBase,T >;
+template <typename T>
 using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 // getVectorType specialised for Lattice<T>: the nested `type` names the
 // lattice's vector_object.
 template <typename T>
 struct getVectorType<Lattice<T> > {
  using type = typename Lattice<T>::vector_object;
 };
 // Leaf evaluation of a plain (non-Lattice) operand in an expression tree.
 // The site index `ss` is not used; the operand is returned by value.
 template <class sobj>
 inline sobj eval(const unsigned int ss, const sobj &arg) {
  return arg;
 }
-template<class lobj>
+template <class lobj>
-inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg)
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
-{
+  return arg._odata[ss];
    return arg._odata[ss];
 }
 // handle nodes in syntax tree
 template <typename Op, typename T1>
-auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand
+auto inline eval(
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second))))
+    const unsigned int ss,
-{
+    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
-  return expr.first.func(eval(ss,std::get<0>(expr.second)));
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)));
 }
 template <typename Op, typename T1, typename T2>
-auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands
+auto inline eval(
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))))
+    const unsigned int ss,
-{
+    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)));
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
                                eval(ss, std::get<1>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands
+auto inline eval(const unsigned int ss,
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second))))
+                 const LatticeTrinaryExpression<Op, T1, T2, T3>
-{
+                     &expr)  // eval three operands
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) );
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
                                eval(ss, std::get<1>(expr.second)),
                                eval(ss, std::get<2>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)),
                         eval(ss, std::get<2>(expr.second)));
 }
 //////////////////////////////////////////////////////////////////////////
-// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the grid from an expression, ensuring conformable. This must follow a
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
+template <class T1,
-inline void GridFromExpression(GridBase * &grid,const T1& lat)   // Lattice leaf
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
 {
-  if ( grid ) {
+  if (grid) {
-    conformable(grid,lat._grid);
+    conformable(grid, lat._grid);
  }
-  grid=lat._grid;
+  grid = lat._grid;
 }
 template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 inline void GridFromExpression(GridBase * &grid,const T1& notlat)   // non-lattice leaf
 {
 }
 template <class T1,
          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void GridFromExpression(GridBase *&grid,
                               const T1 &notlat)  // non-lattice leaf
 {}
 template <typename Op, typename T1>
-inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr)
+inline void GridFromExpression(GridBase *&grid,
-{
+                               const LatticeUnaryExpression<Op, T1> &expr) {
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse 
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
 }
 template <typename Op, typename T1, typename T2>
-inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr) 
+inline void GridFromExpression(
-{
+    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<1>(expr.second));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
+inline void GridFromExpression(
-{
+    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<1>(expr.second));
-  GridFromExpression(grid,std::get<2>(expr.second));
+  GridFromExpression(grid, std::get<2>(expr.second));
 }
 //////////////////////////////////////////////////////////////////////////
-// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the CB from an expression, ensuring conformable. This must follow a
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
+template <class T1,
-inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
-  if ( (cb==Odd) || (cb==Even) ) {
+  if ((cb == Odd) || (cb == Even)) {
-    assert(cb==lat.checkerboard);
+    assert(cb == lat.checkerboard);
  }
-  cb=lat.checkerboard;
+  cb = lat.checkerboard;
  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
-template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
+template <class T1,
-inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
 {
  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
-inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
+inline void CBFromExpression(int &cb,
-{
+                             const LatticeUnaryExpression<Op, T1> &expr) {
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2>
-inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
+inline void CBFromExpression(int &cb,
-{
+                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<1>(expr.second));
  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
+inline void CBFromExpression(
-{
+    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<1>(expr.second));
-  CBFromExpression(cb,std::get<2>(expr.second));
+  CBFromExpression(cb, std::get<2>(expr.second));
  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
-#define GridUnopClass(name,ret)\
+#define GridUnopClass(name, ret)                                          \
-template <class arg> struct name\
+  template <class arg>                                                    \
-{\
+  struct name {                                                           \
-  static auto inline func(const arg a)-> decltype(ret) { return ret; } \
+    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
-};
+  };
-GridUnopClass(UnarySub,-a);
+GridUnopClass(UnarySub, -a);
-GridUnopClass(UnaryNot,Not(a));
+GridUnopClass(UnaryNot, Not(a));
-GridUnopClass(UnaryAdj,adj(a));
+GridUnopClass(UnaryAdj, adj(a));
-GridUnopClass(UnaryConj,conjugate(a));
+GridUnopClass(UnaryConj, conjugate(a));
-GridUnopClass(UnaryTrace,trace(a));
+GridUnopClass(UnaryTrace, trace(a));
-GridUnopClass(UnaryTranspose,transpose(a));
+GridUnopClass(UnaryTranspose, transpose(a));
-GridUnopClass(UnaryTa,Ta(a));
+GridUnopClass(UnaryTa, Ta(a));
-GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
-GridUnopClass(UnaryReal,real(a));
+GridUnopClass(UnaryReal, real(a));
-GridUnopClass(UnaryImag,imag(a));
+GridUnopClass(UnaryImag, imag(a));
-GridUnopClass(UnaryToReal,toReal(a));
+GridUnopClass(UnaryToReal, toReal(a));
-GridUnopClass(UnaryToComplex,toComplex(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
-GridUnopClass(UnaryAbs,abs(a));
+GridUnopClass(UnaryTimesI, timesI(a));
-GridUnopClass(UnarySqrt,sqrt(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
-GridUnopClass(UnaryRsqrt,rsqrt(a));
+GridUnopClass(UnaryAbs, abs(a));
-GridUnopClass(UnarySin,sin(a));
+GridUnopClass(UnarySqrt, sqrt(a));
-GridUnopClass(UnaryCos,cos(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
-GridUnopClass(UnaryLog,log(a));
+GridUnopClass(UnarySin, sin(a));
-GridUnopClass(UnaryExp,exp(a));
+GridUnopClass(UnaryCos, cos(a));
 GridUnopClass(UnaryAsin, asin(a));
 GridUnopClass(UnaryAcos, acos(a));
 GridUnopClass(UnaryLog, log(a));
 GridUnopClass(UnaryExp, exp(a));
 ////////////////////////////////////////////
 // Binary operators
 ////////////////////////////////////////////
-#define GridBinOpClass(name,combination)\
+#define GridBinOpClass(name, combination)                      \
-template <class left,class right>\
+  template <class left, class right>                           \
-struct name\
+  struct name {                                                \
-{\
+    static auto inline func(const left &lhs, const right &rhs) \
-  static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \
+        -> decltype(combination) const {                       \
-    {\
+      return combination;                                      \
-      return combination;\
+    }                                                          \
-    }\
+  }
-}
+GridBinOpClass(BinaryAdd, lhs + rhs);
-GridBinOpClass(BinaryAdd,lhs+rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
-GridBinOpClass(BinarySub,lhs-rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);
-GridBinOpClass(BinaryMul,lhs*rhs);
+GridBinOpClass(BinaryDiv, lhs /rhs);
-GridBinOpClass(BinaryAnd   ,lhs&rhs);
+GridBinOpClass(BinaryAnd, lhs &rhs);
-GridBinOpClass(BinaryOr    ,lhs|rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
-GridBinOpClass(BinaryAndAnd,lhs&&rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
-GridBinOpClass(BinaryOrOr  ,lhs||rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);
 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
-#define GridTrinOpClass(name,combination)\
+#define GridTrinOpClass(name, combination)                                     \
-template <class predicate,class left, class right>	\
+  template <class predicate, class left, class right>                          \
-struct name\
+  struct name {                                                                \
-{\
+    static auto inline func(const predicate &pred, const left &lhs,            \
-  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
+                            const right &rhs) -> decltype(combination) const { \
-    {\
+      return combination;                                                      \
-      return combination;\
+    }                                                                          \
-    }\
+  }
 }
-GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
+GridTrinOpClass(
-			       typename std::remove_reference<left>::type, \
+    TrinaryWhere,
-			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
+    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
                     typename std::remove_reference<right>::type>(pred, lhs,
                                                                  rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
-#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
+#define GRID_UNOP(name) name<decltype(eval(0, arg))>
-#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) \
  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_DEF_UNOP(op, name)\
+#define GRID_DEF_UNOP(op, name)                                             \
-template <typename T1,\
+  template <typename T1,                                                    \
-  typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \
+            typename std::enable_if<is_lattice<T1>::value ||                \
-  -> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \
+                                        is_lattice_expr<T1>::value,         \
-{ return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); }
+                                    T1>::type * = nullptr>                  \
  inline auto op(const T1 &arg)                                             \
      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
  }
-#define GRID_BINOP_LEFT(op, name)\
+#define GRID_BINOP_LEFT(op, name)                                             \
-template <typename T1,typename T2,\
+  template <typename T1, typename T2,                                         \
-          typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\
+            typename std::enable_if<is_lattice<T1>::value ||                  \
-inline auto op(const T1 &lhs,const T2&rhs) \
+                                        is_lattice_expr<T1>::value,           \
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+                                    T1>::type * = nullptr>                    \
-											    std::forward_as_tuple(lhs, rhs)))) \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
-{\
+      ->decltype(                                                             \
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
-									  std::forward_as_tuple(lhs, rhs))); \
+              std::make_pair(GRID_BINOP(name)(),                              \
-}
+                             std::forward_as_tuple(lhs, rhs)))) {             \
    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
-#define GRID_BINOP_RIGHT(op, name)\
+#define GRID_BINOP_RIGHT(op, name)                                            \
- template <typename T1,typename T2,\
+  template <typename T1, typename T2,                                         \
-           typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\
+            typename std::enable_if<!is_lattice<T1>::value &&                 \
-           typename std::enable_if< is_lattice<T2>::value ||  is_lattice_expr<T2>::value, T2>::type* = nullptr> \
+                                        !is_lattice_expr<T1>::value,          \
-inline auto op(const T1 &lhs,const T2&rhs)			\
+                                    T1>::type * = nullptr,                    \
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+            typename std::enable_if<is_lattice<T2>::value ||                  \
-											    std::forward_as_tuple(lhs, rhs)))) \
+                                        is_lattice_expr<T2>::value,           \
-{\
+                                    T2>::type * = nullptr>                    \
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
-								          std::forward_as_tuple(lhs, rhs))); \
+      ->decltype(                                                             \
-}
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
              std::make_pair(GRID_BINOP(name)(),                              \
                             std::forward_as_tuple(lhs, rhs)))) {             \
    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
-#define GRID_DEF_BINOP(op, name)\
+#define GRID_DEF_BINOP(op, name) \
- GRID_BINOP_LEFT(op,name);\
+  GRID_BINOP_LEFT(op, name);     \
- GRID_BINOP_RIGHT(op,name);
+  GRID_BINOP_RIGHT(op, name);
-
+#define GRID_DEF_TRINOP(op, name)                                              \
-#define GRID_DEF_TRINOP(op, name)\
+  template <typename T1, typename T2, typename T3>                             \
-template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
-  -> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\
+      ->decltype(                                                              \
-										   std::forward_as_tuple(pred,lhs,rhs)))) \
+          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
-{\
+                                   const T3 &>(std::make_pair(                 \
-  return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \
+              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
-										 std::forward_as_tuple(pred,lhs, rhs))); \
+    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
-}
+                                    const T3 &>(std::make_pair(                \
        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
  }
 ////////////////////////
-//Operator definitions
+// Operator definitions
 ////////////////////////
-GRID_DEF_UNOP(operator -,UnarySub);
+GRID_DEF_UNOP(operator-, UnarySub);
-GRID_DEF_UNOP(Not,UnaryNot);
+GRID_DEF_UNOP(Not, UnaryNot);
-GRID_DEF_UNOP(operator !,UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
-GRID_DEF_UNOP(adj,UnaryAdj);
+GRID_DEF_UNOP(adj, UnaryAdj);
-GRID_DEF_UNOP(conjugate,UnaryConj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
-GRID_DEF_UNOP(trace,UnaryTrace);
+GRID_DEF_UNOP(trace, UnaryTrace);
-GRID_DEF_UNOP(transpose,UnaryTranspose);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
-GRID_DEF_UNOP(Ta,UnaryTa);
+GRID_DEF_UNOP(Ta, UnaryTa);
-GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
-GRID_DEF_UNOP(real,UnaryReal);
+GRID_DEF_UNOP(real, UnaryReal);
-GRID_DEF_UNOP(imag,UnaryImag);
+GRID_DEF_UNOP(imag, UnaryImag);
-GRID_DEF_UNOP(toReal,UnaryToReal);
+GRID_DEF_UNOP(toReal, UnaryToReal);
-GRID_DEF_UNOP(toComplex,UnaryToComplex);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
-GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(timesI, UnaryTimesI);
-GRID_DEF_UNOP(sqrt ,UnarySqrt);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
-GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
-GRID_DEF_UNOP(sin  ,UnarySin);
+                               // abs-fabs-dabs-labs thing
-GRID_DEF_UNOP(cos  ,UnaryCos);
+GRID_DEF_UNOP(sqrt, UnarySqrt);
-GRID_DEF_UNOP(log  ,UnaryLog);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
-GRID_DEF_UNOP(exp  ,UnaryExp);
+GRID_DEF_UNOP(sin, UnarySin);
 GRID_DEF_UNOP(cos, UnaryCos);
 GRID_DEF_UNOP(asin, UnaryAsin);
 GRID_DEF_UNOP(acos, UnaryAcos);
 GRID_DEF_UNOP(log, UnaryLog);
 GRID_DEF_UNOP(exp, UnaryExp);
-GRID_DEF_BINOP(operator+,BinaryAdd);
+GRID_DEF_BINOP(operator+, BinaryAdd);
-GRID_DEF_BINOP(operator-,BinarySub);
+GRID_DEF_BINOP(operator-, BinarySub);
-GRID_DEF_BINOP(operator*,BinaryMul);
+GRID_DEF_BINOP(operator*, BinaryMul);
 GRID_DEF_BINOP(operator/, BinaryDiv);
-GRID_DEF_BINOP(operator&,BinaryAnd);
+GRID_DEF_BINOP(operator&, BinaryAnd);
-GRID_DEF_BINOP(operator|,BinaryOr);
+GRID_DEF_BINOP(operator|, BinaryOr);
-GRID_DEF_BINOP(operator&&,BinaryAndAnd);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
-GRID_DEF_BINOP(operator||,BinaryOrOr);
+GRID_DEF_BINOP(operator||, BinaryOrOr);
-GRID_DEF_TRINOP(where,TrinaryWhere);
+GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
-template<class Op,class T1>
+template <class Op, class T1>
-  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
-{
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
+      expr);
  return ret;
 }
-template<class Op,class T1, class T2>
+template <class Op, class T1, class T2>
-  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second))))>
+                                        eval(0, std::get<1>(expr.second))))> {
-{
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second))))>
-				   eval(0,std::get<1>(expr.second))))> ret(expr);
+      ret(expr);
  return ret;
 }
-template<class Op,class T1, class T2, class T3>
+template <class Op, class T1, class T2, class T3>
-  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second)),
+                                        eval(0, std::get<1>(expr.second)),
-				      eval(0,std::get<2>(expr.second))))>
+                                        eval(0, std::get<2>(expr.second))))> {
-{
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second)),
-				   eval(0,std::get<1>(expr.second)),
+                                   eval(0, std::get<2>(expr.second))))>
-				   eval(0,std::get<2>(expr.second))))> ret(expr);
+      ret(expr);
  return ret;
 }
@ -353,7 +433,6 @@ template<class Op,class T1, class T2, class T3>
 #undef GRID_DEF_UNOP
 #undef GRID_DEF_BINOP
 #undef GRID_DEF_TRINOP
 }
 #if 0
@ -368,7 +447,7 @@ using namespace Grid;
   BinaryAdd<double,double> tmp;
   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
     expr(std::make_pair(tmp,
-	  std::forward_as_tuple(v1,v2)));
+    std::forward_as_tuple(v1,v2)));
   tmp.func(eval(0,v1),eval(0,v2));
   auto var = v1+v2;
--- a/lib/lattice/Lattice_arith.h
+++ b/lib/lattice/Lattice_arith.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_arith.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_ARITH_H
 #define GRID_LATTICE_ARITH_H
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@ -1,3 +1,33 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/lattice/Lattice_base.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H
@ -26,11 +56,14 @@ extern int GridCshiftPermuteMap[4][16];
 // Basic expressions used in Expression Template
 ////////////////////////////////////////////////
-class LatticeBase {};
+class LatticeBase
-class LatticeExpressionBase {};
+{
 public:
    virtual ~LatticeBase(void) = default;
    GridBase *_grid;
 };
-template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
+class LatticeExpressionBase {};
 // Matrix<T>: a std::vector of rows, each row being a std::vector<T> that uses
 // alignedAllocator<T>. The outer vector uses the default allocator.
 // NOTE(review): the trailing "??" looks like an open question about whether
 // the outer container should also use the aligned allocator -- confirm.
 template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
 template <typename Op, typename T1>                           
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
@ -59,8 +92,6 @@ template<class vobj>
 class Lattice : public LatticeBase
 {
 public:
    GridBase *_grid;
    int checkerboard;
    Vector<vobj> _odata;
@ -68,13 +99,13 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
    const vobj & operator[](int i) const { return _odata[i]; };
 public:
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
    typedef vobj vector_object;
  ////////////////////////////////////////////////////////////////////////////////
  // Expression Template closure support
  ////////////////////////////////////////////////////////////////////////////////
@ -149,8 +180,8 @@ PARALLEL_FOR_LOOP
  }
  //GridFromExpression is tricky to do
  template<class Op,class T1>
-    Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
+    Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
-
+    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
@ -171,7 +202,8 @@ PARALLEL_FOR_LOOP
    }
  };
  template<class Op,class T1, class T2>
-  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
+  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
@ -192,7 +224,8 @@ PARALLEL_FOR_LOOP
    }
  };
  template<class Op,class T1, class T2, class T3>
-  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
+  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
    _grid = nullptr;
    GridFromExpression(_grid,expr);
    assert(_grid!=nullptr);
@ -212,14 +245,29 @@ PARALLEL_FOR_LOOP
    // Constructor requires "grid" passed.
    // what about a default grid?
    //////////////////////////////////////////////////////////////////
-    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
+    Lattice(GridBase *grid) : _odata(grid->oSites()) {
-      //        _odata.reserve(_grid->oSites());
+        _grid = grid;
-      //        _odata.resize(_grid->oSites());
+    //        _odata.reserve(_grid->oSites());
    //        _odata.resize(_grid->oSites());
    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
        assert((((uint64_t)&_odata[0])&0xF) ==0);
        checkerboard=0;
    }
    Lattice(const Lattice& r){ // copy constructor
    	_grid = r._grid;
    	checkerboard = r.checkerboard;
    	_odata.resize(_grid->oSites());// essential
  		PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            _odata[ss]=r._odata[ss];
        }  	
    }
    virtual ~Lattice(void) = default;
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
@ -230,7 +278,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
+      
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
@ -252,17 +300,6 @@ PARALLEL_FOR_LOOP
        *this = (*this)+r;
        return *this;
    }
    // Element-wise division of two lattices:
    //   ret[ss] = lhs[ss] * pow(rhs[ss], -1.0)
    // The operands are passed through conformable() first, and the result is
    // allocated on lhs's grid.
    strong_inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
        conformable(lhs,rhs);
        Lattice<vobj> ret(lhs._grid);
        // The grid-pointer constructor sets checkerboard to 0, so carry the
        // operand's value over explicitly; the copy constructor and
        // operator= propagate it in the same way.
        ret.checkerboard = lhs.checkerboard;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = lhs._odata[ss]*pow(rhs._odata[ss],-1.0);
        }
        return ret;
    }
 }; // class Lattice
  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
@ -287,27 +324,27 @@ PARALLEL_FOR_LOOP
-#include <lattice/Lattice_conformable.h>
+#include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include <lattice/Lattice_ET.h>
+#include "Lattice_ET.h"
 #else 
-#include <lattice/Lattice_overload.h>
+#include "Lattice_overload.h"
 #endif
-#include <lattice/Lattice_arith.h>
+#include "Lattice_arith.h"
-#include <lattice/Lattice_trace.h>
+#include "Lattice_trace.h"
-#include <lattice/Lattice_transpose.h>
+#include "Lattice_transpose.h"
-#include <lattice/Lattice_local.h>
+#include "Lattice_local.h"
-#include <lattice/Lattice_reduction.h>
+#include "Lattice_reduction.h"
-#include <lattice/Lattice_peekpoke.h>
+#include "Lattice_peekpoke.h"
-#include <lattice/Lattice_reality.h>
+#include "Lattice_reality.h"
-#include <lattice/Lattice_comparison_utils.h>
+#include "Lattice_comparison_utils.h"
-#include <lattice/Lattice_comparison.h>
+#include "Lattice_comparison.h"
-#include <lattice/Lattice_coordinate.h>
+#include "Lattice_coordinate.h"
-#include <lattice/Lattice_where.h>
+#include "Lattice_where.h"
-#include <lattice/Lattice_rng.h>
+#include "Lattice_rng.h"
-#include <lattice/Lattice_unary.h>
+#include "Lattice_unary.h"
-#include <lattice/Lattice_transfer.h>
+#include "Lattice_transfer.h"
 #endif
--- a/lib/lattice/Lattice_comparison.h
+++ b/lib/lattice/Lattice_comparison.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_comparison.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_COMPARISON_H
 #define GRID_LATTICE_COMPARISON_H
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@ -1,3 +1,31 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_comparison_utils.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_COMPARISON_H
 #define GRID_COMPARISON_H
--- a/lib/lattice/Lattice_conformable.h
+++ b/lib/lattice/Lattice_conformable.h
@ -1,3 +1,30 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_conformable.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_CONFORMABLE_H
 #define GRID_LATTICE_CONFORMABLE_H
--- a/Show More
+++ b/Show More
		`@ -1,4 +0,0 @@`

			HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./PerfCount.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h 
./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Avx512Asm.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h 
./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h

			CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./PerfCount.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc