Merge branch 'feature/new-build' into feature/hadrons

# Conflicts: # Makefile.am # scripts/copyright
2026-03-09 22:16:12 +00:00 · 2016-08-03 16:49:16 +01:00
parent e0b7004f96 3b376ed54e
commit 2485ef9c9c
215 changed files with 8454 additions and 4613 deletions
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -29,27 +29,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H

-#include <algorithms/SparseMatrix.h>
-#include <algorithms/LinearOperator.h>
-#include <algorithms/Preconditioner.h>
+#include <Grid/algorithms/SparseMatrix.h>
+#include <Grid/algorithms/LinearOperator.h>
+#include <Grid/algorithms/Preconditioner.h>

-#include <algorithms/approx/Zolotarev.h>
-#include <algorithms/approx/Chebyshev.h>
-#include <algorithms/approx/Remez.h>
-#include <algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>

-#include <algorithms/iterative/ConjugateGradient.h>
-#include <algorithms/iterative/ConjugateResidual.h>
-#include <algorithms/iterative/NormalEquations.h>
-#include <algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>

-#include <algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>

 // Lanczos support
-#include <algorithms/iterative/MatrixUtils.h>
-#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/MatrixUtils.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>

-#include <algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>

 // Eigen/lanczos
 // EigCg
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H

-#include <cartesian/Cartesian_base.h>
-#include <cartesian/Cartesian_full.h>
-#include <cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/Cartesian_base.h>
+#include <Grid/cartesian/Cartesian_full.h>
+#include <Grid/cartesian/Cartesian_red_black.h> 

 #endif
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H

-#include <communicator/Communicator_base.h>
+#include <Grid/communicator/Communicator_base.h>

 #endif
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@@ -28,17 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_

-#include <cshift/Cshift_common.h>
+#include <Grid/cshift/Cshift_common.h>

 #ifdef GRID_COMMS_NONE
-#include <cshift/Cshift_none.h>
+#include <Grid/cshift/Cshift_none.h>
 #endif

 #ifdef GRID_COMMS_MPI
-#include <cshift/Cshift_mpi.h>
+#include <Grid/cshift/Cshift_mpi.h>
 #endif 

 #ifdef GRID_COMMS_SHMEM
-#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -59,29 +59,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ///////////////////
 // Grid headers
 ///////////////////
-#include <serialisation/Serialisation.h>
-#include <Config.h>
-#include <Timer.h>
-#include <PerfCount.h>
-#include <Log.h>
-#include <AlignedAllocator.h>
-#include <Simd.h>
-#include <Threads.h>
-#include <Lexicographic.h>
-#include <Communicator.h> 
-#include <Cartesian.h>    
-#include <Tensors.h>      
-#include <Lattice.h>      
-#include <Cshift.h>       
-#include <Stencil.h>      
-#include <Algorithms.h>   
-#include <parallelIO/BinaryIO.h>
-#include <qcd/QCD.h>
-#include <parallelIO/NerscIO.h>
-#include <Init.h>
+#include <Grid/serialisation/Serialisation.h>
+#include "Config.h"
+#include <Grid/Timer.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Log.h>
+#include <Grid/AlignedAllocator.h>
+#include <Grid/Simd.h>
+#include <Grid/Threads.h>
+#include <Grid/Lexicographic.h>
+#include <Grid/Communicator.h> 
+#include <Grid/Cartesian.h>    
+#include <Grid/Tensors.h>      
+#include <Grid/Lattice.h>      
+#include <Grid/Cshift.h>       
+#include <Grid/Stencil.h>      
+#include <Grid/Algorithms.h>   
+#include <Grid/parallelIO/BinaryIO.h>
+#include <Grid/qcd/QCD.h>
+#include <Grid/parallelIO/NerscIO.h>
+#include <Grid/Init.h>

-#include <qcd/hmc/NerscCheckpointer.h>
-#include <qcd/hmc/HmcRunner.h>
+#include <Grid/qcd/hmc/NerscCheckpointer.h>
+#include <Grid/qcd/hmc/HmcRunner.h>



--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -193,7 +193,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
-    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
+    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    exit(EXIT_SUCCESS);
  }

@@ -234,26 +234,34 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }

+  std::string COL_RED    = GridLogColours.colour["RED"];
+  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
+  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
+  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
+  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
+  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
+  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
+
+  
  std::cout <<std::endl;
-  std::cout <<Logger::RED  << "__|__|__|__|__"<<             "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|__|__|__|__"<<             "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|__|  |  |  "<<             "|  |  | "<<Logger::PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|__         "<<             "        "<<Logger::PURPLE<<"        "<<                "          _|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<" GGGG   "<<Logger::RED<<" RRRR   "<<Logger::BLUE  <<" III    "<<Logger::PURPLE<<"DDDD  "<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<"G       "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D   D "<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<"G       "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D    D"<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<"G  GG   "<<Logger::RED<<" RRRR   "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D    D"<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<"G   G   "<<Logger::RED<<" R  R   "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D   D "<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<" GGGG   "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<" III    "<<Logger::PURPLE<<"DDDD  "<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|__         "<<             "        "<<Logger::GREEN <<"        "<<                "          _|__"<<std::endl; 
-  std::cout <<Logger::BLUE << "__|__|__|__|__"<<             "|__|__|_"<<Logger::GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::BLUE << "__|__|__|__|__"<<             "|__|__|_"<<Logger::GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::BLUE << "  |  |  |  |  "<<             "|  |  | "<<Logger::GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|  |  |  "<<             "|  |  | "<<COL_PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__         "<<             "        "<<COL_PURPLE<<"        "<<                "          _|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G  GG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G   G   "<<COL_RED<<" R  R   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" R   R  "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|__         "<<             "        "<<COL_GREEN <<"        "<<                "          _|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_BLUE << "  |  |  |  |  "<<             "|  |  | "<<COL_GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
  std::cout << std::endl;
  std::cout << std::endl;
-  std::cout <<Logger::YELLOW<< std::endl;
+  std::cout <<COL_YELLOW<< std::endl;
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
-  std::cout << "Colours by Tadahito Boyle "<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
@@ -264,7 +272,8 @@ void Grid_init(int *argc,char ***argv)
  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
  std::cout << "GNU General Public License for more details."<<std::endl;
-  std::cout << Logger::BLACK <<std::endl;
+  std::cout << COL_BACKGROUND <<std::endl;
+  std::cout << std::endl;
 }

  
--- a/lib/Lattice.h
+++ b/lib/Lattice.h
@@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H

-#include <lattice/Lattice_base.h>
+#include <Grid/lattice/Lattice_base.h>

 #endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -1,126 +1,92 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/Log.cc
+Source file: ./lib/Log.cc

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid.h>

 namespace Grid {

 GridStopWatch Logger::StopWatch;
-std::ostream  Logger::devnull(0);
-std::string Logger::BLACK("\033[30m");
-std::string Logger::RED("\033[31m");
-std::string Logger::GREEN("\033[32m");
-std::string Logger::YELLOW("\033[33m");
-std::string Logger::BLUE("\033[34m");
-std::string Logger::PURPLE("\033[35m");
-std::string Logger::CYAN("\033[36m");
-std::string Logger::WHITE("\033[37m");
-std::string Logger::NORMAL("\033[0;39m");
-std::string EMPTY("");
+std::ostream Logger::devnull(0);

-#if 0  
-  GridLogger GridLogError      (1,"Error",Logger::RED);
-  GridLogger GridLogWarning    (1,"Warning",Logger::YELLOW);
-  GridLogger GridLogMessage    (1,"Message",Logger::BLACK);
-  GridLogger GridLogDebug      (1,"Debug",Logger::PURPLE);
-  GridLogger GridLogPerformance(1,"Performance",Logger::GREEN);
-  GridLogger GridLogIterative  (1,"Iterative",Logger::BLUE);
-  GridLogger GridLogIntegrator (1,"Integrator",Logger::BLUE);
-#else
-  GridLogger GridLogError      (1,"Error",EMPTY);
-  GridLogger GridLogWarning    (1,"Warning",EMPTY);
-  GridLogger GridLogMessage    (1,"Message",EMPTY);
-  GridLogger GridLogDebug      (1,"Debug",EMPTY);
-  GridLogger GridLogPerformance(1,"Performance",EMPTY);
-  GridLogger GridLogIterative  (1,"Iterative",EMPTY);
-  GridLogger GridLogIntegrator (1,"Integrator",EMPTY);
-#endif
+Colours GridLogColours(0); // shared colour table, constructed with colours off; defined ahead of the loggers below, which each keep a reference to it
+GridLogger GridLogError(1, "Error", GridLogColours, "RED"); // arguments: initially active, stream name, colour table, colour key
+GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
+GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
+GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
+GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
+GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE"); // shares the "BLUE" key with Integrator
+GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");

-void GridLogConfigure(std::vector<std::string> &logstreams)
-{
+void GridLogConfigure(std::vector<std::string> &logstreams) { // Resets every stream to its default state, then applies each name found in logstreams.
   GridLogError.Active(0);
   GridLogWarning.Active(0);
-  GridLogMessage.Active(0);
+  GridLogMessage.Active(1); // Message is the only stream that defaults to on; it is switched off only by "NoMessage" below
   GridLogIterative.Active(0);
   GridLogDebug.Active(0);
   GridLogPerformance.Active(0);
   GridLogIntegrator.Active(0);
+  GridLogColours.Active(0); // colours default to off: every colour name maps to an empty string

-  int blackAndWhite = 1;
-  if(blackAndWhite){
-    Logger::BLACK = std::string("");
-    Logger::RED    =Logger::BLACK;
-    Logger::GREEN  =Logger::BLACK;
-    Logger::YELLOW =Logger::BLACK;
-    Logger::BLUE   =Logger::BLACK;
-    Logger::PURPLE =Logger::BLACK;
-    Logger::CYAN   =Logger::BLACK;
-    Logger::WHITE  =Logger::BLACK;
-    Logger::NORMAL =Logger::BLACK;
-  }
-
-  for(int i=0;i<logstreams.size();i++){
-    if ( logstreams[i]== std::string("Error")       ) GridLogError.Active(1);
-    if ( logstreams[i]== std::string("Warning")     ) GridLogWarning.Active(1);
-    if ( logstreams[i]== std::string("Message")     ) GridLogMessage.Active(1);
-    if ( logstreams[i]== std::string("Iterative")   ) GridLogIterative.Active(1);
-    if ( logstreams[i]== std::string("Debug")       ) GridLogDebug.Active(1);
-    if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
-    if ( logstreams[i]== std::string("Integrator" ) ) GridLogIntegrator.Active(1);
+  for (int i = 0; i < logstreams.size(); i++) { // names are matched exactly (case-sensitive); unrecognised names are silently ignored
+    if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
+    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
+    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); // NOTE(review): the --log help text in Init.cc still lists "Message", which this loop no longer recognises
+    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
+    if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
+    if (logstreams[i] == std::string("Performance"))
+      GridLogPerformance.Active(1);
+    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); // refills the shared colour table with ANSI escape sequences
   }
 }

 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
-void Grid_quiesce_nodes(void)
-{
-  int me=0;
+void Grid_quiesce_nodes(void) { // Silences std::cout on every process whose rank / PE number is non-zero.
+  int me = 0; // stays 0 (output kept) when neither comms macro below is defined
 #ifdef GRID_COMMS_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&me);
+  MPI_Comm_rank(MPI_COMM_WORLD, &me); // rank within MPI_COMM_WORLD; the return code is not checked
 #endif
 #ifdef GRID_COMMS_SHMEM
   me = shmem_my_pe();
 #endif
-  if ( me ) { 
+  if (me) { // any process other than number 0
     std::cout.setstate(std::ios::badbit);
   }
 }

-void Grid_unquiesce_nodes(void)
-{
+void Grid_unquiesce_nodes(void) { // Clears std::cout's error state so output resumes; compiled to a no-op unless GRID_COMMS_MPI is defined.
 #ifdef GRID_COMMS_MPI
-    std::cout.clear();
+  std::cout.clear(); // NOTE(review): Grid_quiesce_nodes also sets badbit under GRID_COMMS_SHMEM, but nothing here clears it in that build - confirm intended
 #endif
 }
-
-
 }
-
--- a/lib/Log.h
+++ b/lib/Log.h
@@ -6,9 +6,9 @@

    Copyright (C) 2015

-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Antonin Portelli <antonin.portelli@me.com>
+    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -27,6 +27,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#include <map>
+
 #ifndef GRID_LOG_H
 #define GRID_LOG_H

@@ -34,56 +37,99 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <execinfo.h>
 #endif

-namespace Grid {
+    namespace Grid {

 // Dress the output; use std::chrono for time stamping via the StopWatch class
 int Rank(void); // used for early stage debug before library init


+class Colours{ // Table of named terminal colour strings that can be switched on or off as a whole.
+protected:
+  bool is_active; // last value passed to Active()
+public:
+  std::map<std::string, std::string> colour; // colour name ("RED", "NORMAL", ...) -> string to stream; public and indexed directly by users of the class
+
+  Colours(bool activate=false){ // fills the table via Active(); colours are off unless activate is true
+    Active(activate);
+  };
+
+  void Active(bool activate){ // (re)writes all nine entries: ANSI escape sequences when activate is true, empty strings otherwise
+    is_active=activate;
+
+    if (is_active){
+     colour["BLACK"]  ="\033[30m"; // ESC[30m .. ESC[37m: the eight ANSI foreground colours
+     colour["RED"]    ="\033[31m";
+     colour["GREEN"]  ="\033[32m";
+     colour["YELLOW"] ="\033[33m";
+     colour["BLUE"]   ="\033[34m";
+     colour["PURPLE"] ="\033[35m";
+     colour["CYAN"]   ="\033[36m";
+     colour["WHITE"]  ="\033[37m";
+     colour["NORMAL"] ="\033[0;39m"; // ESC[0;39m: reset attributes and select the default foreground colour
+   } else {
+    colour["BLACK"] =""; // colours off: every name maps to an empty string, so streaming it emits nothing
+    colour["RED"]   ="";
+    colour["GREEN"] ="";
+    colour["YELLOW"]="";
+    colour["BLUE"]  ="";
+    colour["PURPLE"]="";
+    colour["CYAN"]  ="";
+    colour["WHITE"] ="";
+    colour["NORMAL"]="";
+  }
+
+
+}; // end of Active() (the trailing ';' is an empty declaration)
+
+}; // end of class Colours
+
+
 class Logger {
 protected:
-    int active;
-    std::string name, topName, COLOUR;
-public:
-    static GridStopWatch StopWatch;
-    static std::ostream devnull;
+  Colours &Painter; // colour table looked up on every print; held by reference, so a later Active() call on the table changes this logger's output
+  int active; // non-zero: operator<< writes the prefix; zero: operator<< returns devnull
+  std::string name, topName; // stream name (e.g. "Message") and top-level label (e.g. "Grid") shown in the prefix
+  std::string COLOUR; // key into Painter.colour used for this logger's name and message text

-    static std::string BLACK;
-    static std::string RED  ;
-    static std::string GREEN;
-    static std::string YELLOW;
-    static std::string BLUE  ;
-    static std::string PURPLE;
-    static std::string CYAN  ;
-    static std::string WHITE ;
-    static std::string NORMAL;
-    
- Logger(std::string topNm, int on, std::string nm,std::string col)
-   : active(on), name(nm), topName(topNm), COLOUR(col) {};
-    
-    void Active(int on) {active = on;};
-    int  isActive(void) {return active;};
-    
-    friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
-        if ( log.active ) {
-            StopWatch.Stop();
-            GridTime now = StopWatch.Elapsed();
-            StopWatch.Start();
-            stream << BLACK <<std::setw(8) << std::left << log.topName << BLACK<< " : ";
-            stream << log.COLOUR <<std::setw(11)  << log.name << BLACK << " : ";
-            stream << YELLOW <<std::setw(6) << now <<BLACK << " : " ;
-            stream << log.COLOUR;
-            return stream;
-        } else { 
-            return devnull;
-        }
+public:
+  static GridStopWatch StopWatch; // one stopwatch shared by all loggers; supplies the time shown in the prefix
+  static std::ostream devnull; // stream returned by operator<< when the logger is inactive
+
+  std::string background() {return Painter.colour["NORMAL"];} // colour string used for separators; non-const because std::map::operator[] is non-const
+  std::string evidence() {return Painter.colour["YELLOW"];} // colour string used for the timestamp
+  std::string colour() {return Painter.colour[COLOUR];} // this logger's own colour string
+
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) // col is a key into col_class.colour, not an escape sequence
+  : active(on),
+  name(nm),
+  topName(topNm),
+  Painter(col_class), // NOTE(review): Painter is declared first, so it is initialised first regardless of its position in this list (compilers may warn about the order)
+  COLOUR(col){} ;
+  
+  void Active(int on) {active = on;}; // switch this logger on (non-zero) or off (zero)
+  int  isActive(void) {return active;};
+  
+  friend std::ostream& operator<< (std::ostream& stream, Logger& log){ // writes the "topName : name : time : " prefix; takes a non-const Logger because the colour accessors are non-const
+
+    if ( log.active ) {
+      StopWatch.Stop(); // stop, read, restart - presumably samples the elapsed time without resetting it; confirm against GridStopWatch
+      GridTime now = StopWatch.Elapsed();
+      StopWatch.Start();
+      stream << log.background()<< log.topName << log.background()<< " : ";
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : "; // name left-justified in a 14-column field
+      stream << log.evidence()<< now << log.background() << " : " << log.colour(); // ends with the logger's colour, so the caller's message text is printed in it; nothing here restores the colour afterwards
+      return stream;
+    } else { 
+      return devnull; // the caller's remaining << operands go to devnull instead of stream
     }
-    
+  }
+
 };
-    
+
 class GridLogger: public Logger {
 public:
- GridLogger(int on, std::string nm, std::string col = Logger::BLACK): Logger("Grid", on, nm, col){};
+  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"): // Logger whose top-level label is fixed to "Grid"; col_key names an entry of col_class.colour
+  Logger("Grid", on, nm, col_class, col_key){};
 };

 void GridLogConfigure(std::vector<std::string> &logstreams);
@@ -95,38 +141,40 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
+extern Colours    GridLogColours;


 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];

 #define BACKTRACEFILE() {\
-    char string[20];					\
-    std::sprintf(string,"backtrace.%d",Rank());				\
-    std::FILE * fp = std::fopen(string,"w");				\
-    BACKTRACEFP(fp)\
-    std::fclose(fp);	    \
+char string[20];					\
+std::sprintf(string,"backtrace.%d",Rank());				\
+std::FILE * fp = std::fopen(string,"w");				\
+BACKTRACEFP(fp)\
+std::fclose(fp);	    \
 }


 #ifdef HAVE_EXECINFO_H
 #define BACKTRACEFP(fp) { \
-  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
-  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
-  for (int i = 0; i < symbols; i++){\
-    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
-  }\
+int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
+char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+for (int i = 0; i < symbols; i++){\
+  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+}\
 }
 #else 
 #define BACKTRACEFP(fp) { \
-    std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 }
 #endif

 #define BACKTRACE() BACKTRACEFP(stdout) 

+
 }
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -1,6 +1,3 @@
-# additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
-
 extra_sources=
 if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
@@ -17,16 +14,11 @@ endif
 #
 # Libraries
 #
-
 include Make.inc
+include Eigen.inc

-lib_LIBRARIES = libGrid.a
-libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
-
-
-#	qcd/action/fermion/PartialFractionFermion5D.cc\	\
-#
-# Include files
-#
-nobase_include_HEADERS=$(HFILES)
+lib_LTLIBRARIES = libGrid.la

+libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
+libGrid_ladir                  = $(pkgincludedir)
+nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/Simd.h
+Source file: ./lib/Simd.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H

@@ -118,6 +119,14 @@ namespace Grid {
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+
+  // define projections to real and imaginary parts
+  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));} // keeps std::real(r); second component set to 0.0
+  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
+  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));} // NOTE(review): std::imag(r) is passed as the FIRST constructor argument, so (assuming ComplexF is std::complex-like) it ends up in the real slot - confirm intended
+  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));} // same layout as the ComplexF overload
+
+  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
@@ -163,8 +172,8 @@ namespace Grid {

 };

-#include <simd/Grid_vector_types.h>
-#include <simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_types.h"
+#include "simd/Grid_vector_unops.h"

 namespace Grid {
  // Default precision
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -30,7 +30,7 @@

 #include <thread>

- #include <stencil/Lebesgue.h>   // subdir aggregate
+ #include <Grid/stencil/Lebesgue.h>   // subdir aggregate

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_H
 #define GRID_MATH_H

-#include <tensors/Tensor_traits.h>
-#include <tensors/Tensor_class.h>
-#include <tensors/Tensor_arith.h>
-#include <tensors/Tensor_inner.h>
-#include <tensors/Tensor_outer.h>
-#include <tensors/Tensor_transpose.h>
-#include <tensors/Tensor_trace.h>
-#include <tensors/Tensor_index.h>
-#include <tensors/Tensor_Ta.h>
-#include <tensors/Tensor_determinant.h>
-#include <tensors/Tensor_exp.h>
-//#include <tensors/Tensor_peek.h>
-//#include <tensors/Tensor_poke.h>
-#include <tensors/Tensor_reality.h>
-#include <tensors/Tensor_unary.h>
-#include <tensors/Tensor_extract_merge.h>
-#include <tensors/Tensor_logical.h>
+#include <Grid/tensors/Tensor_traits.h>
+#include <Grid/tensors/Tensor_class.h>
+#include <Grid/tensors/Tensor_arith.h>
+#include <Grid/tensors/Tensor_inner.h>
+#include <Grid/tensors/Tensor_outer.h>
+#include <Grid/tensors/Tensor_transpose.h>
+#include <Grid/tensors/Tensor_trace.h>
+#include <Grid/tensors/Tensor_index.h>
+#include <Grid/tensors/Tensor_Ta.h>
+#include <Grid/tensors/Tensor_determinant.h>
+#include <Grid/tensors/Tensor_exp.h>
+//#include <Grid/tensors/Tensor_peek.h>
+//#include <Grid/tensors/Tensor_poke.h>
+#include <Grid/tensors/Tensor_reality.h>
+#include <Grid/tensors/Tensor_unary.h>
+#include <Grid/tensors/Tensor_extract_merge.h>
+#include <Grid/tensors/Tensor_logical.h>

 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H

-#include <Grid.h>

 namespace Grid {

--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H

-#include <Grid.h>

 namespace Grid {

--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H

-#include<Grid.h>
-#include<algorithms/LinearOperator.h>
+#include <Grid/algorithms/LinearOperator.h>

 namespace Grid {

--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -18,10 +18,10 @@
 #include <stddef.h>
 #include <Config.h>

-#ifdef HAVE_GMP_H
-#include <algorithms/approx/bigfloat.h>
+#ifdef HAVE_LIBGMP
+#include "bigfloat.h"
 #else
-#include <algorithms/approx/bigfloat_double.h>
+#include "bigfloat_double.h"
 #endif

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -40,9 +40,10 @@ namespace Grid {
  template<class Field> 
    class ConjugateGradient : public OperatorFunction<Field> {
 public:                                                
+    bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
    RealD   Tolerance;
    Integer MaxIterations;
-    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+  ConjugateGradient(RealD tol,Integer maxit, bool err_on_no_conv = true) : ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit) { // initialisers written in member declaration order, which is the order in which they run
    };


@@ -137,13 +138,15 @@ public:
 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 	  std::cout<<std::endl;
 	  
-	  assert(true_residual/Tolerance < 1000.0);
+	  if(ErrorOnNoConverge)
+	    assert(true_residual/Tolerance < 1000.0);

 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
-      assert(0);
+      if(ErrorOnNoConverge)	
+	assert(0);
    }
  };
 }
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -0,0 +1,142 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+
+namespace Grid {
+
+  //Mixed precision restarted defect correction CG: corrections are solved for in single precision (FieldF) and accumulated into the double precision (FieldD) solution
+  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
+  public:
+    RealD   Tolerance;          //Target residual relative to the source (used squared: stop = norm2(src) * Tolerance^2)
+    Integer MaxInnerIterations; //Iteration limit handed to every inner single prec CG and to the final double prec CG
+    Integer MaxOuterIterations; //Maximum number of outer (restart) iterations
+    GridBase* SinglePrecGrid; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the outer residual norm drops below OuterLoopNormMult * stop
+    LinearOperatorBase<FieldF> &Linop_f; //Operator applied in the inner single prec solves
+    LinearOperatorBase<FieldD> &Linop_d; //Operator applied for the double prec residual and the final solve
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+    LinearFunction<FieldF> *guesser;
+
+    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+      //Initialisers are written in member declaration order, which is the order in which they run
+      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      OuterLoopNormMult(100.), Linop_f(_Linop_f), Linop_d(_Linop_d), guesser(nullptr){ };
+
+    //Install a guess generator for the inner solves; only the address of g is kept, so g must outlive this solver
+    void useGuesser(LinearFunction<FieldF> &g){
+      guesser = &g;
+    }
+
+    //Solve Linop_d.HermOp(sol_d) = src_d_in; the contents of sol_d on entry are used as the starting guess
+    void operator() (const FieldD &src_d_in, FieldD &sol_d){
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+
+      int cb = src_d_in.checkerboard;
+      sol_d.checkerboard = cb;
+
+      RealD src_norm = norm2(src_d_in);
+      RealD stop = src_norm * Tolerance*Tolerance; //threshold that the outer loop compares against the axpy_norm result
+
+      GridBase* DoublePrecGrid = src_d_in._grid;
+      FieldD tmp_d(DoublePrecGrid); //double prec scratch: receives HermOp(sol_d), later the converted inner solution
+      tmp_d.checkerboard = cb;
+
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+
+      RealD inner_tol = Tolerance;
+      FieldF src_f(SinglePrecGrid);
+      src_f.checkerboard = cb;
+
+      FieldF sol_f(SinglePrecGrid);
+      sol_f.checkerboard = cb;
+
+      //The inner solver must not assert on non-convergence: its result is still added and the outer loop carries on
+      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      CG_f.ErrorOnNoConverge = false;
+
+      GridStopWatch InnerCGtimer;
+      GridStopWatch PrecChangeTimer;
+
+      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+        //Compute double precision rsd and also new RHS vector.
+        Linop_d.HermOp(sol_d, tmp_d);
+        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+
+        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
+
+        if(norm < OuterLoopNormMult * stop){
+          std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
+          break;
+        }
+        //Double inner_tol until norm*inner_tol^2 >= stop, so the inner target is no tighter than the overall one
+        while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f, src_d);
+        PrecChangeTimer.Stop();
+
+        //Start each inner solve from zero unless the guesser supplies something else
+        zeroit(sol_f);
+
+        //Optionally improve inner solver guess (eg using known eigenvectors)
+        if(guesser != nullptr)
+          (*guesser)(src_f, sol_f);
+
+        //Inner CG
+        CG_f.Tolerance = inner_tol;
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f, sol_f);
+        InnerCGtimer.Stop();
+
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f);
+        PrecChangeTimer.Stop();
+
+        axpy(sol_d, 1.0, tmp_d, sol_d);
+      }
+
+      //Final trial CG
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
+
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d);
+
+      TotalTimer.Stop();
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+    }
+  };
+
+}
+
+#endif
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,

 }

-#include <algorithms/iterative/Householder.h>
-#include <algorithms/iterative/Francis.h>
+#include "Householder.h"
+#include "Francis.h"

 #endif

--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef USE_LAPACK
 #include <lapacke.h>
 #endif
-#include <algorithms/iterative/DenseMatrix.h>
-#include <algorithms/iterative/EigenSort.h>
+#include "DenseMatrix.h"
+#include "EigenSort.h"

 namespace Grid {

--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H

-#include <Grid.h>

 namespace Grid{

@@ -107,6 +106,12 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
+    virtual int iIndex(std::vector<int> &lcoor) // inner index: sum over dimensions of _istride[d]*(lcoor[d]/_rdimensions[d])
+    {
+        int lane=0;
+        for(int mu=0;mu<_ndimension;++mu) lane += (lcoor[mu]/_rdimensions[mu])*_istride[mu];
+        return lane;
+    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@@ -123,12 +128,6 @@ public:
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
-    inline int iIndex(std::vector<int> &lcoor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-        return idx;
-    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
@@ -220,7 +219,7 @@ public:
      }

      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
+      o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
    }

    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -32,17 +32,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

-    static const int CbRed  =0;
-    static const int CbBlack=1;
-    static const int Even   =CbRed;
-    static const int Odd    =CbBlack;
-
-    // Perhaps these are misplaced and 
-    // should be in sparse matrix.
-    // Also should make these a named enum type
-    static const int DaggerNo=0;
-    static const int DaggerYes=1;
-
+  static const int CbRed  =0;
+  static const int CbBlack=1;
+  static const int Even   =CbRed;
+  static const int Odd    =CbBlack;
+    
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
@@ -224,9 +218,21 @@ protected:
 	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
 	}
      }
-        return idx;
+      return idx;
    };
        
+    virtual int iIndex(std::vector<int> &lcoor)
+    {
+        // Inner index of a local coordinate. Along _checker_dim the divisor is
+        // twice _rdimensions[d]; every other dimension divides by
+        // _rdimensions[d] directly.
+        int lane=0;
+        for(int mu=0;mu<_ndimension;mu++) {
+          const int extent = (mu==_checker_dim) ? 2*_rdimensions[mu] : _rdimensions[mu];
+          lane += _istride[mu]*(lcoor[mu]/extent);
+        }
+        return lane;
+    }
 };

 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -127,12 +127,21 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);

+    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+			    void *xmit,
+			    int xmit_to_rank,
+			    void *recv,
+			    int recv_from_rank,
+			    int bytes);
+
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
+
+    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -144,6 +144,28 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
+// Persistent variant: creates (but does not start) a send to `dest` and a receive from `from`, appending both requests to `list`
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+					       void *xmit,
+					       int dest,
+					       void *recv,
+					       int from,
+					       int bytes)
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
+  // Messages are tagged with the sender's rank, so the receive must match source `from` and tag `from`
+  int ierr;
+  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  assert(ierr==0);
+  list.push_back(xrq);
+  list.push_back(rrq);
+}
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{ // Starts every request held in `list` with a single MPI_Startall call
+  MPI_Startall(static_cast<int>(list.size()),list.data()); // data() is valid even for an empty list, unlike &list[0]
+}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -151,17 +173,12 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
+  std::vector<CommsRequest_t> reqs(0);
+  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromBegin(reqs);
+  for(int i=0;i<reqs.size();i++){
+    list.push_back(reqs[i]);
+  }
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -84,6 +84,19 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0); // no implementation in this communicator: fails the assertion if ever called
+}
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  assert(0); // no implementation in this communicator: fails the assertion if ever called
+}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -268,6 +268,10 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  assert(0); // Unimplemented here: the list-only overload fails the assertion if called
+}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -280,6 +284,15 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0); // Unimplemented here: fails the assertion if called
+}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -1,73 +1,74 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/lattice/Lattice_ET.h
+Source file: ./lib/lattice/Lattice_ET.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H

 #include <iostream>
-#include <vector>
 #include <tuple>
 #include <typeinfo>
+#include <vector>

 namespace Grid {

-  ////////////////////////////////////////////////////
-  // Predicated where support
-  ////////////////////////////////////////////////////
-  template<class iobj,class vobj,class robj>
-    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
+////////////////////////////////////////////////////
+// Predicated where support
+////////////////////////////////////////////////////
+template <class iobj, class vobj, class robj>
+inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                            const robj &iffalse) {
+  typename std::remove_const<vobj>::type ret;

-    typename std::remove_const<vobj>::type ret;
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;

-    typedef typename vobj::scalar_object scalar_object;
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::vector_type vector_type;
+  const int Nsimd = vobj::vector_type::Nsimd();
+  const int words = sizeof(vobj) / sizeof(vector_type);

-    const int Nsimd = vobj::vector_type::Nsimd();
-    const int words = sizeof(vobj)/sizeof(vector_type);
+  std::vector<Integer> mask(Nsimd);
+  std::vector<scalar_object> truevals(Nsimd);
+  std::vector<scalar_object> falsevals(Nsimd);

-    std::vector<Integer> mask(Nsimd);
-    std::vector<scalar_object> truevals (Nsimd);
-    std::vector<scalar_object> falsevals(Nsimd);
+  extract(iftrue, truevals);
+  extract(iffalse, falsevals);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);

-    extract(iftrue   ,truevals);
-    extract(iffalse  ,falsevals);
-    extract<vInteger,Integer>(TensorRemove(predicate),mask);
-
-    for(int s=0;s<Nsimd;s++){
-      if (mask[s]) falsevals[s]=truevals[s];
-    }
-
-    merge(ret,falsevals);
-    return ret;
+  for (int s = 0; s < Nsimd; s++) {
+    if (mask[s]) falsevals[s] = truevals[s];
  }

+  merge(ret, falsevals);
+  return ret;
+}
+
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@@ -75,303 +76,351 @@ namespace Grid {
 // from tuple is hideous; C++14 introduces std::make_index_sequence for this
 ////////////////////////////////////////////

+// leaf eval of lattice ; should enable if protect using traits

-//leaf eval of lattice ; should enable if protect using traits
+template <typename T>
+using is_lattice = std::is_base_of<LatticeBase, T>;

-template <typename T> using is_lattice      = std::is_base_of<LatticeBase,T >;
+template <typename T>
+using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;

 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;

+// For a Lattice<T>, getVectorType::type is that lattice's vector_object
+template<typename T>
+struct getVectorType<Lattice<T> >{
+  using type = typename Lattice<T>::vector_object;
+};
+ 
 template<class sobj>
 inline sobj eval(const unsigned int ss, const sobj &arg)
 {
  return arg;
 }
-template<class lobj>
-inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg)
-{
-    return arg._odata[ss];
+template <class lobj>
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
+  return arg._odata[ss];
 }

 // handle nodes in syntax tree
 template <typename Op, typename T1>
-auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second))))
-{
-  return expr.first.func(eval(ss,std::get<0>(expr.second)));
+auto inline eval(
+    const unsigned int ss,
+    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)));
 }

 template <typename Op, typename T1, typename T2>
-auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))))
-{
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)));
+auto inline eval(
+    const unsigned int ss,
+    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)));
 }

 template <typename Op, typename T1, typename T2, typename T3>
-auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second))))
-{
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) );
+auto inline eval(const unsigned int ss,
+                 const LatticeTrinaryExpression<Op, T1, T2, T3>
+                     &expr)  // eval three operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)),
+                                eval(ss, std::get<2>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)),
+                         eval(ss, std::get<2>(expr.second)));
 }

 //////////////////////////////////////////////////////////////////////////
-// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the grid from an expression, ensuring conformable. This must follow a
+// tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
-inline void GridFromExpression(GridBase * &grid,const T1& lat)   // Lattice leaf
-{
-  if ( grid ) {
-    conformable(grid,lat._grid);
-  } 
-  grid=lat._grid;
-}
-template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
-inline void GridFromExpression(GridBase * &grid,const T1& notlat)   // non-lattice leaf
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
 {
+  if (grid) {
+    conformable(grid, lat._grid);
+  }
+  grid = lat._grid;
 }
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid,
+                               const T1 &notlat)  // non-lattice leaf
+{}
 template <typename Op, typename T1>
-inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr)
-{
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse 
+inline void GridFromExpression(GridBase *&grid,
+                               const LatticeUnaryExpression<Op, T1> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
 }

 template <typename Op, typename T1, typename T2>
-inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr) 
-{
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid, std::get<1>(expr.second));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
-{
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
-  GridFromExpression(grid,std::get<2>(expr.second));
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid, std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<2>(expr.second));
 }

-
 //////////////////////////////////////////////////////////////////////////
-// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the CB from an expression, ensuring conformable. This must follow a
+// tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
-inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
-  if ( (cb==Odd) || (cb==Even) ) {
-    assert(cb==lat.checkerboard);
-  } 
-  cb=lat.checkerboard;
+  if ((cb == Odd) || (cb == Even)) {
+    assert(cb == lat.checkerboard);
+  }
+  cb = lat.checkerboard;
  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
-template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
-inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
 {
  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
-inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
-{
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
+inline void CBFromExpression(int &cb,
+                             const LatticeUnaryExpression<Op, T1> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }

 template <typename Op, typename T1, typename T2>
-inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
-{
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
+inline void CBFromExpression(int &cb,
+                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb, std::get<1>(expr.second));
  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
-{
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
-  CBFromExpression(cb,std::get<2>(expr.second));
+inline void CBFromExpression(
+    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb, std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<2>(expr.second));
  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }

 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
-#define GridUnopClass(name,ret)\
-template <class arg> struct name\
-{\
-  static auto inline func(const arg a)-> decltype(ret) { return ret; } \
-};
+#define GridUnopClass(name, ret)                                          \
+  template <class arg>                                                    \
+  struct name {                                                           \
+    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
+  };

-GridUnopClass(UnarySub,-a);
-GridUnopClass(UnaryNot,Not(a));
-GridUnopClass(UnaryAdj,adj(a));
-GridUnopClass(UnaryConj,conjugate(a));
-GridUnopClass(UnaryTrace,trace(a));
-GridUnopClass(UnaryTranspose,transpose(a));
-GridUnopClass(UnaryTa,Ta(a));
-GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
-GridUnopClass(UnaryReal,real(a));
-GridUnopClass(UnaryImag,imag(a));
-GridUnopClass(UnaryToReal,toReal(a));
-GridUnopClass(UnaryToComplex,toComplex(a));
-GridUnopClass(UnaryAbs,abs(a));
-GridUnopClass(UnarySqrt,sqrt(a));
-GridUnopClass(UnaryRsqrt,rsqrt(a));
-GridUnopClass(UnarySin,sin(a));
-GridUnopClass(UnaryCos,cos(a));
-GridUnopClass(UnaryLog,log(a));
-GridUnopClass(UnaryExp,exp(a));
+GridUnopClass(UnarySub, -a);
+GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryConj, conjugate(a));
+GridUnopClass(UnaryTrace, trace(a));
+GridUnopClass(UnaryTranspose, transpose(a));
+GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
+GridUnopClass(UnaryTimesI, timesI(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
+GridUnopClass(UnaryAbs, abs(a));
+GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
+GridUnopClass(UnarySin, sin(a));
+GridUnopClass(UnaryCos, cos(a));
+GridUnopClass(UnaryAsin, asin(a));
+GridUnopClass(UnaryAcos, acos(a));
+GridUnopClass(UnaryLog, log(a));
+GridUnopClass(UnaryExp, exp(a));

 ////////////////////////////////////////////
 // Binary operators
 ////////////////////////////////////////////
-#define GridBinOpClass(name,combination)\
-template <class left,class right>\
-struct name\
-{\
-  static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \
-    {\
-      return combination;\
-    }\
-}
-GridBinOpClass(BinaryAdd,lhs+rhs);
-GridBinOpClass(BinarySub,lhs-rhs);
-GridBinOpClass(BinaryMul,lhs*rhs);
+#define GridBinOpClass(name, combination)                      \
+  template <class left, class right>                           \
+  struct name {                                                \
+    static auto inline func(const left &lhs, const right &rhs) \
+        -> decltype(combination) const {                       \
+      return combination;                                      \
+    }                                                          \
+  }
+GridBinOpClass(BinaryAdd, lhs + rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);

-GridBinOpClass(BinaryAnd   ,lhs&rhs);
-GridBinOpClass(BinaryOr    ,lhs|rhs);
-GridBinOpClass(BinaryAndAnd,lhs&&rhs);
-GridBinOpClass(BinaryOrOr  ,lhs||rhs);
+GridBinOpClass(BinaryAnd, lhs &rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);

 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
-#define GridTrinOpClass(name,combination)\
-template <class predicate,class left, class right>	\
-struct name\
-{\
-  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
-    {\
-      return combination;\
-    }\
-}
+#define GridTrinOpClass(name, combination)                                     \
+  template <class predicate, class left, class right>                          \
+  struct name {                                                                \
+    static auto inline func(const predicate &pred, const left &lhs,            \
+                            const right &rhs) -> decltype(combination) const { \
+      return combination;                                                      \
+    }                                                                          \
+  }

-GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
-			       typename std::remove_reference<left>::type, \
-			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
+GridTrinOpClass(
+    TrinaryWhere,
+    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
+                     typename std::remove_reference<right>::type>(pred, lhs,
+                                                                  rhs)));

 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
- 
-#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
-#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>

-#define GRID_DEF_UNOP(op, name)\
-template <typename T1,\
-  typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \
-  -> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \
-{ return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); }
+#define GRID_UNOP(name) name<decltype(eval(0, arg))>
+#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) \
+  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>

-#define GRID_BINOP_LEFT(op, name)\
-template <typename T1,typename T2,\
-          typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\
-inline auto op(const T1 &lhs,const T2&rhs) \
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-											    std::forward_as_tuple(lhs, rhs)))) \
-{\
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-									  std::forward_as_tuple(lhs, rhs))); \
-}
+#define GRID_DEF_UNOP(op, name)                                             \
+  template <typename T1,                                                    \
+            typename std::enable_if<is_lattice<T1>::value ||                \
+                                        is_lattice_expr<T1>::value,         \
+                                    T1>::type * = nullptr>                  \
+  inline auto op(const T1 &arg)                                             \
+      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
+          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
+    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
+        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
+  }

-#define GRID_BINOP_RIGHT(op, name)\
- template <typename T1,typename T2,\
-           typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\
-           typename std::enable_if< is_lattice<T2>::value ||  is_lattice_expr<T2>::value, T2>::type* = nullptr> \
-inline auto op(const T1 &lhs,const T2&rhs)			\
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-											    std::forward_as_tuple(lhs, rhs)))) \
-{\
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-								          std::forward_as_tuple(lhs, rhs))); \
-}
+#define GRID_BINOP_LEFT(op, name)                                             \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<is_lattice<T1>::value ||                  \
+                                        is_lattice_expr<T1>::value,           \
+                                    T1>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }

-#define GRID_DEF_BINOP(op, name)\
- GRID_BINOP_LEFT(op,name);\
- GRID_BINOP_RIGHT(op,name);
+#define GRID_BINOP_RIGHT(op, name)                                            \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<!is_lattice<T1>::value &&                 \
+                                        !is_lattice_expr<T1>::value,          \
+                                    T1>::type * = nullptr,                    \
+            typename std::enable_if<is_lattice<T2>::value ||                  \
+                                        is_lattice_expr<T2>::value,           \
+                                    T2>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }

+#define GRID_DEF_BINOP(op, name) \
+  GRID_BINOP_LEFT(op, name);     \
+  GRID_BINOP_RIGHT(op, name);

-#define GRID_DEF_TRINOP(op, name)\
-template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \
-  -> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\
-										   std::forward_as_tuple(pred,lhs,rhs)))) \
-{\
-  return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \
-										 std::forward_as_tuple(pred,lhs, rhs))); \
-}
+#define GRID_DEF_TRINOP(op, name)                                              \
+  template <typename T1, typename T2, typename T3>                             \
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
+      ->decltype(                                                              \
+          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
+                                   const T3 &>(std::make_pair(                 \
+              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
+    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
+                                    const T3 &>(std::make_pair(                \
+        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
+  }
 ////////////////////////
-//Operator definitions
+// Operator definitions
 ////////////////////////

-GRID_DEF_UNOP(operator -,UnarySub);
-GRID_DEF_UNOP(Not,UnaryNot);
-GRID_DEF_UNOP(operator !,UnaryNot);
-GRID_DEF_UNOP(adj,UnaryAdj);
-GRID_DEF_UNOP(conjugate,UnaryConj);
-GRID_DEF_UNOP(trace,UnaryTrace);
-GRID_DEF_UNOP(transpose,UnaryTranspose);
-GRID_DEF_UNOP(Ta,UnaryTa);
-GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
-GRID_DEF_UNOP(real,UnaryReal);
-GRID_DEF_UNOP(imag,UnaryImag);
-GRID_DEF_UNOP(toReal,UnaryToReal);
-GRID_DEF_UNOP(toComplex,UnaryToComplex);
-GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
-GRID_DEF_UNOP(sqrt ,UnarySqrt);
-GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
-GRID_DEF_UNOP(sin  ,UnarySin);
-GRID_DEF_UNOP(cos  ,UnaryCos);
-GRID_DEF_UNOP(log  ,UnaryLog);
-GRID_DEF_UNOP(exp  ,UnaryExp);
+GRID_DEF_UNOP(operator-, UnarySub);
+GRID_DEF_UNOP(Not, UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
+GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(trace, UnaryTrace);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
+GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
+GRID_DEF_UNOP(timesI, UnaryTimesI);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
+                               // abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
+GRID_DEF_UNOP(sin, UnarySin);
+GRID_DEF_UNOP(cos, UnaryCos);
+GRID_DEF_UNOP(asin, UnaryAsin);
+GRID_DEF_UNOP(acos, UnaryAcos);
+GRID_DEF_UNOP(log, UnaryLog);
+GRID_DEF_UNOP(exp, UnaryExp);

-GRID_DEF_BINOP(operator+,BinaryAdd);
-GRID_DEF_BINOP(operator-,BinarySub);
-GRID_DEF_BINOP(operator*,BinaryMul);
+GRID_DEF_BINOP(operator+, BinaryAdd);
+GRID_DEF_BINOP(operator-, BinarySub);
+GRID_DEF_BINOP(operator*, BinaryMul);

-GRID_DEF_BINOP(operator&,BinaryAnd);
-GRID_DEF_BINOP(operator|,BinaryOr);
-GRID_DEF_BINOP(operator&&,BinaryAndAnd);
-GRID_DEF_BINOP(operator||,BinaryOrOr);
+GRID_DEF_BINOP(operator&, BinaryAnd);
+GRID_DEF_BINOP(operator|, BinaryOr);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
+GRID_DEF_BINOP(operator||, BinaryOrOr);

-GRID_DEF_TRINOP(where,TrinaryWhere);
+GRID_DEF_TRINOP(where, TrinaryWhere);

 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
-template<class Op,class T1>
-  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
-{
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
+template <class Op, class T1>
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
+      expr);
  return ret;
 }
-template<class Op,class T1, class T2>
-  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second))))>
-{
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				   eval(0,std::get<1>(expr.second))))> ret(expr);
+template <class Op, class T1, class T2>
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second))))> {
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second))))>
+      ret(expr);
  return ret;
 }
-template<class Op,class T1, class T2, class T3>
-  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second)),
-				      eval(0,std::get<2>(expr.second))))>
-{
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				   eval(0,std::get<1>(expr.second)),
-				   eval(0,std::get<2>(expr.second))))> ret(expr);
+template <class Op, class T1, class T2, class T3>
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second)),
+                                        eval(0, std::get<2>(expr.second))))> {
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second)),
+                                   eval(0, std::get<2>(expr.second))))>
+      ret(expr);
  return ret;
 }

@@ -382,12 +431,11 @@ template<class Op,class T1, class T2, class T3>
 #undef GRID_DEF_UNOP
 #undef GRID_DEF_BINOP
 #undef GRID_DEF_TRINOP
-
 }

 #if 0
 using namespace Grid;
- 	      
+        
 int main(int argc,char **argv){
   
   Lattice<double> v1(16);
@@ -397,7 +445,7 @@ using namespace Grid;
   BinaryAdd<double,double> tmp;
   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
     expr(std::make_pair(tmp,
-	  std::forward_as_tuple(v1,v2)));
+    std::forward_as_tuple(v1,v2)));
   tmp.func(eval(0,v1),eval(0,v2));

   auto var = v1+v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/lattice/Lattice_base.h
+Source file: ./lib/lattice/Lattice_base.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H

@@ -101,6 +102,7 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
+    const vobj & operator[](int i) const { return _odata[i]; };

 public:
    typedef typename vobj::scalar_type scalar_type;
@@ -255,6 +257,18 @@ PARALLEL_FOR_LOOP
        checkerboard=0;
    }

+    Lattice(const Lattice& r){ // copy constructor
+    	_grid = r._grid;
+    	checkerboard = r.checkerboard;
+    	_odata.resize(_grid->oSites());// essential: size the storage before the element-wise copy below
+  		PARALLEL_FOR_LOOP
+        for(int ss=0;ss<_grid->oSites();ss++){
+            _odata[ss]=r._odata[ss];
+        }  	
+    }
+
+
+
    virtual ~Lattice(void) = default;
    
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
@@ -267,7 +281,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
+      
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
@@ -324,27 +338,27 @@ PARALLEL_FOR_LOOP



-#include <lattice/Lattice_conformable.h>
+#include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include <lattice/Lattice_ET.h>
+#include "Lattice_ET.h"
 #else 
-#include <lattice/Lattice_overload.h>
+#include "Lattice_overload.h"
 #endif
-#include <lattice/Lattice_arith.h>
-#include <lattice/Lattice_trace.h>
-#include <lattice/Lattice_transpose.h>
-#include <lattice/Lattice_local.h>
-#include <lattice/Lattice_reduction.h>
-#include <lattice/Lattice_peekpoke.h>
-#include <lattice/Lattice_reality.h>
-#include <lattice/Lattice_comparison_utils.h>
-#include <lattice/Lattice_comparison.h>
-#include <lattice/Lattice_coordinate.h>
-#include <lattice/Lattice_where.h>
-#include <lattice/Lattice_rng.h>
-#include <lattice/Lattice_unary.h>
-#include <lattice/Lattice_transfer.h>
+#include "Lattice_arith.h"
+#include "Lattice_trace.h"
+#include "Lattice_transpose.h"
+#include "Lattice_local.h"
+#include "Lattice_reduction.h"
+#include "Lattice_peekpoke.h"
+#include "Lattice_reality.h"
+#include "Lattice_comparison_utils.h"
+#include "Lattice_comparison.h"
+#include "Lattice_coordinate.h"
+#include "Lattice_where.h"
+#include "Lattice_rng.h"
+#include "Lattice_unary.h"
+#include "Lattice_transfer.h"


 #endif
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -40,7 +40,7 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
    ComplexD nrm = innerProduct(arg,arg);
-    return real(nrm); 
+    return std::real(nrm); 
  }

    template<class vobj>
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -386,7 +386,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
  }

  // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -420,15 +420,15 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
  assert(hg->_processors[orthog]==1);

  int dl; dl = 0;
-  for(int d=0;d<nh;d++){
-    if ( d != orthog) {
-      assert(lg->_processors[dl]  == hg->_processors[d]);
-      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
-      dl++;
+    for(int d=0;d<nh;d++){
+      if ( d != orthog) {
+	assert(lg->_processors[dl]  == hg->_processors[d]);
+	assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+	dl++;
    }
  }
  // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -482,6 +482,96 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)

 }

+//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
+template<typename vobj, typename sobj>
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
+  typedef typename vobj::vector_type vtype;
+  
+  GridBase* in_grid = in._grid;
+  out.resize(in_grid->lSites());
+  
+  int ndim = in_grid->Nd();
+  int in_nsimd = vtype::Nsimd();

+  std::vector<std::vector<int> > in_icoor(in_nsimd);
+      
+  for(int lane=0; lane < in_nsimd; lane++){
+    in_icoor[lane].resize(ndim);
+    in_grid->iCoorFromIindex(in_icoor[lane], lane);
+  }
+  
+PARALLEL_FOR_LOOP
+  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+    //Assemble vector of pointers to output elements
+    std::vector<sobj*> out_ptrs(in_nsimd);
+
+    std::vector<int> in_ocoor(ndim);
+    in_grid->oCoorFromOindex(in_ocoor, in_oidx);
+
+    std::vector<int> lcoor(in_grid->Nd());
+      
+    for(int lane=0; lane < in_nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+	lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
+
+      int lex;
+      Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
+      out_ptrs[lane] = &out[lex];
+    }
+    
+    //Unpack into those ptrs
+    const vobj & in_vobj = in._odata[in_oidx];
+    extract1(in_vobj, out_ptrs, 0);
+  }
+}
+
+//Convert a Lattice from one precision to another
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  assert(out._grid->Nd() == in._grid->Nd());
+  out.checkerboard = in.checkerboard;
+  GridBase *in_grid=in._grid;
+  GridBase *out_grid = out._grid;
+
+  typedef typename VobjOut::scalar_object SobjOut;
+  typedef typename VobjIn::scalar_object SobjIn;
+
+  int ndim = out._grid->Nd();
+  int out_nsimd = out_grid->Nsimd();
+    
+  std::vector<std::vector<int> > out_icoor(out_nsimd);
+      
+  for(int lane=0; lane < out_nsimd; lane++){
+    out_icoor[lane].resize(ndim);
+    out_grid->iCoorFromIindex(out_icoor[lane], lane);
+  }
+        
+  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
+  unvectorizeToLexOrdArray(in_slex_conv, in);
+    
+  PARALLEL_FOR_LOOP
+  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+    std::vector<int> out_ocoor(ndim);
+    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
+
+    std::vector<SobjOut*> ptrs(out_nsimd);      
+
+    std::vector<int> lcoor(out_grid->Nd());
+      
+    for(int lane=0; lane < out_nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
+	
+      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
+      ptrs[lane] = &in_slex_conv[llex];
+    }
+    merge(out._odata[out_oidx], ptrs, 0);
+  }
+}
+
+
+  
+
+ 
 }
 #endif
--- a/lib/pugixml/.dirstamp
+++ b/lib/pugixml/.dirstamp
--- a/lib/pugixml/pugixml.h
+++ b/lib/pugixml/pugixml.h
@@ -17,7 +17,7 @@
 #endif

 // Include user configuration file (this can define various configuration macros)
-#include <pugixml/pugiconfig.hpp>
+#include "pugiconfig.hpp"

 #ifndef HEADER_PUGIXML_HPP
 #define HEADER_PUGIXML_HPP
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -60,6 +60,12 @@ namespace QCD {
    static const int SpinIndex   = 1;
    static const int LorentzIndex= 0;

+    // TODO: these should also become a named enum type
+    static const int DaggerNo=0;
+    static const int DaggerYes=1;
+    static const int InverseNo=0;
+    static const int InverseYes=1;
+
    // Useful traits is this a spin index
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

@@ -484,16 +490,16 @@ namespace QCD {
 }   //namespace QCD
 } // Grid

-#include <qcd/utils/SpaceTimeGrid.h>
-#include <qcd/spin/Dirac.h>
-#include <qcd/spin/TwoSpinor.h>
-#include <qcd/utils/LinalgUtils.h>
-#include <qcd/utils/CovariantCshift.h>
-#include <qcd/utils/SUn.h>
-#include <qcd/action/Actions.h>
-#include <qcd/hmc/integrators/Integrator.h>
-#include <qcd/hmc/integrators/Integrator_algorithm.h>
-#include <qcd/hmc/HMC.h>
-
+#include <Grid/qcd/utils/SpaceTimeGrid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/spin/TwoSpinor.h>
+#include <Grid/qcd/utils/LinalgUtils.h>
+#include <Grid/qcd/utils/CovariantCshift.h>
+#include <Grid/qcd/utils/SUn.h>
+#include <Grid/qcd/action/Actions.h>
+#include <Grid/qcd/hmc/integrators/Integrator.h>
+#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
+#include <Grid/qcd/hmc/HMC.h>
+#include <Grid/qcd/smearing/Smearing.h>

 #endif
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@@ -35,6 +35,7 @@ template<class GaugeField>
 class Action { 

 public:
+  bool is_smeared = false;
  // Boundary conditions? // Heatbath?
  virtual void  refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
-#include <qcd/action/ActionBase.h>
-#include <qcd/action/ActionParams.h>
+#include <Grid/qcd/action/ActionBase.h>
+#include <Grid/qcd/action/ActionParams.h>

 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/GaugeImpl.h>
-#include <qcd/utils/WilsonLoops.h>
+#include <Grid/qcd/action/gauge/GaugeImpl.h>
+#include <Grid/qcd/utils/WilsonLoops.h>

-#include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
-#include <qcd/action/fermion/FermionOperatorImpl.h>
-#include <qcd/action/fermion/FermionOperator.h>
-#include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+#include <Grid/qcd/action/fermion/FermionOperator.h>
+#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions

 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/WilsonGaugeAction.h>
-#include <qcd/action/gauge/PlaqPlusRectangleAction.h>
+#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
+#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>

 namespace Grid {
 namespace QCD {
@@ -107,41 +107,50 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-#define FermOpTemplateInstantiate(A) \
+
+#define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

+#define FermOp5dVecTemplateInstantiate(A) \
+  template class A<DomainWallVec5dImplF>;	\
+  template class A<DomainWallVec5dImplD>;	
+
+#define FermOpTemplateInstantiate(A) \
+ FermOp4dVecTemplateInstantiate(A) \
+ FermOp5dVecTemplateInstantiate(A) 
+
 #define GparityFermOpTemplateInstantiate(A) 

 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////

-#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types

-//#include <qcd/action/fermion/CloverFermion.h>
+//#include <Grid/qcd/action/fermion/CloverFermion.h>

-#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
-#include <qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/MobiusFermion.h>
-#include <qcd/action/fermion/ScaledShamirFermion.h>
-#include <qcd/action/fermion/MobiusZolotarevFermion.h>
-#include <qcd/action/fermion/ShamirZolotarevFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
+#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>

-#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
-#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>

-#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
-#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@@ -222,21 +231,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
-#include <qcd/action/fermion/g5HermitianLinop.h>
+#include <Grid/qcd/action/fermion/g5HermitianLinop.h>

 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
-#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>

-#include <qcd/action/pseudofermion/TwoFlavour.h>
-#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>

-#include <qcd/action/pseudofermion/OneFlavourRational.h>
-#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>

 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
 #include <Grid.h>
+
+
 namespace Grid {
 namespace QCD {

@@ -45,486 +48,342 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- {
- }
+ { }

- template<class Impl>
-  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-  {
-    // Assemble Din
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	Din = bs psi[s] + cs[s] psi[s+1}
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	//      Din+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-      }
-    }
+template<class Impl>
+void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
+{ // Unit diagonal, -1 off-diagonals, mass on the end entries; delegates to the 6-argument M5D.
+  const int nslices=this->Ls;
+  std::vector<RealD> ones(nslices,1.0), up(nslices,-1.0), down(nslices,-1.0);
+  up[nslices-1]=mass;   // last upper entry carries the mass
+  down[0]      =mass;   // first lower entry carries the mass
+  M5D(psi,chi,chi,down,ones,up);   // chi is passed as both the phi input and the output
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
+{ // Coefficients: diagonal bs, off-diagonals cs, with the two end entries scaled by -mass.
+  const int last=this->Ls-1;
+  std::vector<RealD> onsite = bs;
+  std::vector<RealD> above  = cs;
+  std::vector<RealD> below  = cs;
+  above[last] = -mass*above[last];
+  below[0]    = -mass*below[0];
+  M5D(psi,psi,Din,below,onsite,above);   // psi fills both the psi and phi slots; Din is the output argument
+}
+template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = beo;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+  for(int i=0;i<Ls;i++) {
+    upper[i]=-ceo[i];
+    lower[i]=-ceo[i];
  }
- template<class Impl>
-  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
-  {
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-      } else {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-      }
-    }
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+  for(int i=0;i<Ls;i++) {
+    upper[i]=-cee[i];
+    lower[i]=-cee[i];
  }
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}

-  // override multiply
- template<class Impl>
-  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);

-    FermionField Din(psi._grid);
-
-    // Assemble Din
-    /*
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	Din = bs psi[s] + cs[s] psi[s+1}
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	//      Din+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-      }
-    }
-    */
-    Meooe5D(psi,Din);
-
-    this->DW(Din,chi,DaggerNo);
-    // ((b D_W + D_w hop terms +1) on s-diag
-    axpby(chi,1.0,1.0,chi,psi); 
-
-    // Call Mooee??
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ){
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
-      } else if ( s==(Ls-1)) {
-	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-      }
-    }
-    return norm2(chi);
-  }
-
- template<class Impl>
-  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
-  {
-    // Under adjoint
-    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
-    //D2- P+     D2+            P-D1-^dag D2+dag
-
-    FermionField Din(psi._grid);
-    // Apply Dw
-    this->DW(psi,Din,DaggerYes); 
-
-    MeooeDag5D(Din,chi);
-
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-
-      // Collect the terms in DW
-      //	Chi = bs Din[s] + cs[s] Din[s+1}
-      //    Chi+= -mass*cs[s] psi[s+1}
-      /*
-      if ( s==0 ) {
-	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-      } else {
-	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-      }
-      */
-
-      // FIXME just call MooeeDag??
-
-      // Collect the terms indept of DW
-      if ( s==0 ){
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
-      } else if ( s==(Ls-1)) {
-	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
-      }
-    }
-    // ((b D_W + D_w hop terms +1) on s-diag
-    axpby (chi,1.0,1.0,chi,psi); 
-    return norm2(chi);
-  }
-
-  // half checkerboard operations
- template<class Impl>
-  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-
-    FermionField tmp(psi._grid);
+  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
-    Meooe5D(psi,tmp); 
-#if 0
-    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
-	//      tmp+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      }
-    }
-    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
-#endif
-
-    // Apply 4d dslash
-    if ( psi.checkerboard == Odd ) {
-      this->DhopEO(tmp,chi,DaggerNo);
+    if ( s==0 ) {
+      upper[s] = -cee[s+1] ;
+      lower[s] = mass*cee[Ls-1];
+    } else if ( s==(Ls-1)) { 
+      upper[s] = mass*cee[0];
+      lower[s] = -cee[s-1];
    } else {
-      this->DhopOE(tmp,chi,DaggerNo);
+      upper[s]=-cee[s+1];
+      lower[s]=-cee[s-1];
    }
  }

-  template<class Impl>
-  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-  {
-    FermionField tmp(psi._grid);
-    // Apply 4d dslash
-    if ( psi.checkerboard == Odd ) {
-      this->DhopEO(psi,tmp,DaggerYes);
-    } else {
-      this->DhopOE(psi,tmp,DaggerYes);
-    }
+  M5Ddag(psi,psi,chi,lower,diag,upper);
+}

-    MeooeDag5D(tmp,chi); 
-#if 0
-    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
-    // Assemble the 5d matrix
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
-	axpby_ssp_pminus(chi,   1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
-	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
-      }
-    }
-    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
-#endif
+template<class Impl>
+void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
+{ // Unit diagonal and -1 off-diagonals with mass-scaled end entries, handed to the 6-argument M5Ddag.
+  const int nslices=this->Ls;
+  std::vector<RealD> ones(nslices,1.0);
+  std::vector<RealD> up  (nslices,-1.0);
+  std::vector<RealD> down(nslices,-1.0);
+  up[nslices-1] = -mass*up[nslices-1];   // -mass * -1.0
+  down[0]       = -mass*down[0];         // -mass * -1.0
+  M5Ddag(psi,chi,chi,down,ones,up);      // chi is passed as both the phi input and the output
+}

+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+{ // Adjoint: slice s couples via its neighbour's coefficient (cs[s+1] / cs[s-1]); -mass on the wrap-around, as MooeeDag does with cee.
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bs, upper(Ls), lower(Ls);
+  for(int s=0;s<Ls;s++){
+    upper[s] = (s==Ls-1) ? -mass*cs[0]    : cs[s+1];   // last slice wraps round to slice 0
+    lower[s] = (s==0)    ? -mass*cs[Ls-1] : cs[s-1];   // first slice wraps round to slice Ls-1
+  }
+  M5Ddag(psi,psi,Din,lower,diag,upper);   // psi fills both the psi and phi slots; Din is the output argument
+}
+
+template<class Impl>
+RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+{
+  // Applies M to psi, leaving the result in chi; the return value is norm2(chi).
+  // Din is a scratch field allocated on psi's grid.
+  FermionField Din(psi._grid);
+
+  // Assemble Din from psi with the bs/cs coefficients
+  Meooe5D(psi,Din);
+
+  this->DW(Din,chi,DaggerNo);
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby(chi,1.0,1.0,chi,psi);
+
+  M5D(psi,chi);   // two-argument overload: chi is both read and overwritten
+  return(norm2(chi));
+}
+
+template<class Impl>
+RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+{
+  // Adjoint counterpart of M: DW runs first (with DaggerYes), then MeooeDag5D on its output.
+  // Under adjoint
+  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
+  //D2- P+     D2+            P-D1-^dag D2+dag
+  FermionField Din(psi._grid);
+  // Apply Dw
+  this->DW(psi,Din,DaggerYes); 
+  // Daggered 5d coefficient step on the DW output; chi is the output argument
+  MeooeDag5D(Din,chi);
+  // Two-argument M5Ddag: chi is both read and overwritten
+  M5Ddag(psi,chi);
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby (chi,1.0,1.0,chi,psi); 
+  return norm2(chi);   // norm2 of the finished result is handed back to the caller
+}
+
+// half checkerboard operations
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  FermionField tmp(psi._grid);
+
+  Meooe5D(psi,tmp); 
+
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(tmp,chi,DaggerNo);
+  } else {
+    this->DhopOE(tmp,chi,DaggerNo);
  }
+}

- template<class Impl>
-  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    for (int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,mass*cee[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(chi,bee[s],psi,mass*cee[s],psi,s,0);
-	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(chi,bee[s],psi,-cee[s],psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
-      }
-    }
+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  FermionField tmp(psi._grid);
+  // Apply 4d dslash
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(psi,tmp,DaggerYes);
+  } else {
+    this->DhopOE(psi,tmp,DaggerYes);
  }
+  MeooeDag5D(tmp,chi); 
+}

- template<class Impl>
-  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-    int Ls=this->Ls;
-    FermionField tmp(psi._grid);
-    // Assemble the 5d matrix
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
-	//      tmp+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      }
-    }
-    // Apply 4d dslash fragment
-    this->DhopDir(tmp,chi,dir,disp);
+template<class Impl>
+void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  FermionField mixed(psi._grid);          // scratch field on psi's grid
+  Meo5D(psi,mixed);                       // 5d coefficient step (beo/ceo) applied to psi
+  // Apply 4d dslash fragment; dir and disp are forwarded unchanged
+  this->DhopDir(mixed,chi,dir,disp);
+}
+// force terms; five routines; default to Dhop on diagonal
+template<class Impl>
+void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
+  
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDeriv(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,Din);
+    this->DhopDeriv(mat,Din,V,dag);
  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    for (int s=0;s<Ls;s++){
-      // Assemble the 5d matrix
-      if ( s==0 ) {
-	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1]  ,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,mass*cee[Ls-1],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus(chi,bee[s],psi,mass*cee[0],psi,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-cee[s-1],psi,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1],psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0   ,chi,-cee[s-1],psi,s,s-1);
-      }
-    }
-  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
-    for (int s=1;s<Ls;s++){
-      axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
-    }	
-    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
-    
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){
-      axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
-    }
-  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    // Apply (U^{\prime})^{-dagger}
-    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
-    for (int s=1;s<Ls;s++){
-      axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
-    }
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
-    }	
-    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
-    
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){
-      axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
-    }
-  }
-
-  // force terms; five routines; default to Dhop on diagonal
-  template<class Impl>
-  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDeriv(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-      Meooe5D(U,Din);
-      this->DhopDeriv(mat,Din,V,dag);
-    }
-  };
- template<class Impl>
-  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDerivOE(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+};
+template<class Impl>
+void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
+  
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDerivOE(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      Meooe5D(U,Din);
      this->DhopDerivOE(mat,Din,V,dag);
-    }
-  };
- template<class Impl>
-  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDerivEO(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-      Meooe5D(U,Din);
-      this->DhopDerivEO(mat,Din,V,dag);
-    }
-  };
+  }
+};
+template<class Impl>
+void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
  
-  // Tanh
- template<class Impl>
-  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
-  {
-    SetCoefficientsZolotarev(1.0,zdata,b,c);
-
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDerivEO(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,Din);
+    this->DhopDerivEO(mat,Din,V,dag);
  }
-  //Zolo
- template<class Impl>
-  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
-  {
-    int Ls=this->Ls;
+};
+  
+// Tanh: forwards to SetCoefficientsZolotarev with zolo_hi fixed to 1.0
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  SetCoefficientsZolotarev(1.0,zdata,b,c);
+}
+//Zolo
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  int Ls=this->Ls;

-    ///////////////////////////////////////////////////////////
-    // The Cayley coeffs (unprec)
-    ///////////////////////////////////////////////////////////
-    omega.resize(Ls);
-    bs.resize(Ls);
-    cs.resize(Ls);
-    as.resize(Ls);
+  ///////////////////////////////////////////////////////////
+  // The Cayley coeffs (unprec)
+  ///////////////////////////////////////////////////////////
+  omega.resize(Ls);
+  bs.resize(Ls);
+  cs.resize(Ls);
+  as.resize(Ls);
+  
+  // 
+  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
+  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
+  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
+  //
+  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
+  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
+  //
+  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
+  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
+  //
+  // So 
+  //
+  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
+  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  //
+  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
+  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  // 
    
-    // 
-    // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
-    //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
-    //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
-    //
-    //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
-    //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
-    //
-    // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
-    // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
-    //
-    // So 
-    //
-    // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
-    //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
-    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-    //
-    // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
-    //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
-    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-    // 
+  double bpc = b+c;
+  double bmc = b-c;
+  for(int i=0; i < Ls; i++){
+    as[i] = 1.0;
+    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    bs[i] = 0.5*(bpc/omega[i] + bmc);
+    cs[i] = 0.5*(bpc/omega[i] - bmc);
+  }
+  
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  bee.resize(Ls);
+  cee.resize(Ls);
+  beo.resize(Ls);
+  ceo.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
+    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
+    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
+    beo[i]=as[i]*bs[i];
+    ceo[i]=-as[i]*cs[i];
+  }
+  
+  aee.resize(Ls);
+  aeo.resize(Ls);
+  for(int i=0;i<Ls;i++){
+    aee[i]=cee[i];
+    aeo[i]=ceo[i];
+  }
+  
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  dee.resize(Ls);
+  lee.resize(Ls);
+  leem.resize(Ls);
+  uee.resize(Ls);
+  ueem.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
    
-    double bpc = b+c;
-    double bmc = b-c;
-    for(int i=0; i < Ls; i++){
-      as[i] = 1.0;
-      omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
-      bs[i] = 0.5*(bpc/omega[i] + bmc);
-      cs[i] = 0.5*(bpc/omega[i] - bmc);
-    }
-
-    ////////////////////////////////////////////////////////
-    // Constants for the preconditioned matrix Cayley form
-    ////////////////////////////////////////////////////////
-    bee.resize(Ls);
-    cee.resize(Ls);
-    beo.resize(Ls);
-    ceo.resize(Ls);
+    dee[i] = bee[i];
    
-    for(int i=0;i<Ls;i++){
-      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
-      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
-      beo[i]=as[i]*bs[i];
-      ceo[i]=-as[i]*cs[i];
-    }
-
-    aee.resize(Ls);
-    aeo.resize(Ls);
-    for(int i=0;i<Ls;i++){
-      aee[i]=cee[i];
-      aeo[i]=ceo[i];
-    }
-
-    //////////////////////////////////////////
-    // LDU decomposition of eeoo
-    //////////////////////////////////////////
-    dee.resize(Ls);
-    lee.resize(Ls);
-    leem.resize(Ls);
-    uee.resize(Ls);
-    ueem.resize(Ls);
-    
-    for(int i=0;i<Ls;i++){
+    if ( i < Ls-1 ) {
      
-      dee[i] = bee[i];
+      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      
-      if ( i < Ls-1 ) {
-	
-	lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-	    
-	leem[i]=mass*cee[Ls-1]/bee[0];
-	for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
-	
-	uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-	
-	ueem[i]=mass;
-	for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
-	ueem[i]*= aee[0]/bee[0];
-	    
-      } else { 
-	lee[i] =0.0;
-	leem[i]=0.0;
-	uee[i] =0.0;
-	ueem[i]=0.0;
-      }
-    }
-	
-    { 
-      double delta_d=mass*cee[Ls-1];
-      for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
-      dee[Ls-1] += delta_d;
+      leem[i]=mass*cee[Ls-1]/bee[0];
+      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      
+      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
+      
+      ueem[i]=mass;
+      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
+      ueem[i]*= aee[0]/bee[0];
+      
+    } else { 
+      lee[i] =0.0;
+      leem[i]=0.0;
+      uee[i] =0.0;
+      ueem[i]=0.0;
    }
  }
+	
+  { 
+    double delta_d=mass*cee[Ls-1];
+    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
+    dee[Ls-1] += delta_d;
+  }  
+}
+
+

  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -51,6 +51,29 @@ namespace Grid {
      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
+
+      virtual void   M5D   (const FermionField &psi, FermionField &chi);
+      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
+
+      /////////////////////////////////////////////////////
+      // Instantiate different versions depending on Impl
+      /////////////////////////////////////////////////////
+      void M5D(const FermionField &psi,
+	       const FermionField &phi, 
+	       FermionField &chi,
+	       std::vector<RealD> &lower,
+	       std::vector<RealD> &diag,
+	       std::vector<RealD> &upper);
+
+      void M5Ddag(const FermionField &psi,
+		  const FermionField &phi, 
+		  FermionField &chi,
+		  std::vector<RealD> &lower,
+		  std::vector<RealD> &diag,
+		  std::vector<RealD> &upper);
+      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
+
      virtual void   Instantiatable(void)=0;

      // force terms; five routines; default to Dhop on diagonal
@@ -94,6 +117,8 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

+
+
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
@@ -101,5 +126,15 @@ namespace Grid {

  }
 }
+#define INSTANTIATE_DPERP(A)\
+template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
+template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
+
+#define CAYLEY_DPERP_CACHE
+#undef  CAYLEY_DPERP_LINALG

 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -0,0 +1,209 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus forwards
+  // Pplus  backwards
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  // Fifth-dimension hopping term, site loop outermost. For each slice s of every
+  // block of Ls consecutive sites:
+  //
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s_fwd]) + lower[s]*spProj5p(psi[s_bwd])
+  //
+  // psi   : field whose forward / backward neighbours in s are projected
+  // phi   : field scaled by diag[s]; must share psi's checkerboard (asserted)
+  // chi   : output; takes psi's checkerboard
+  // lower, diag, upper : per-slice coefficients, indexed by s in [0,Ls)
+  int Ls =this->Ls;
+  GridBase *grid=psi._grid;
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard=psi.checkerboard;
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    // One scratch spinor per block instead of one copy per slice. spProj5m/p is always called
+    // on it before it is read (assumes the projection fully overwrites its destination, as M5Ddag relies on).
+    auto tmp = psi._odata[0];
+    for(int s=0;s<Ls;s++){
+      // Neighbours wrap within the block; the Ls>1 guard gives the s==0 rule priority when Ls==1.
+      const int s_fwd = (s==Ls-1 && Ls>1) ? 0    : s+1;
+      const int s_bwd = (s==0           ) ? Ls-1 : s-1;
+      spProj5m(tmp,psi._odata[ss+s_fwd]);
+      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+      spProj5p(tmp,psi._odata[ss+s_bwd]);
+      chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+    }
+  }
+}
+
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  // Counterpart of M5D with the chiral projectors exchanged. For each slice s of
+  // every block of Ls consecutive sites:
+  //
+  //   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[s_fwd]) + lower[s]*spProj5m(psi[s_bwd])
+  //
+  // psi   : field whose forward / backward neighbours in s are projected
+  // phi   : field scaled by diag[s]; must share psi's checkerboard (asserted)
+  // chi   : output; takes psi's checkerboard
+  // lower, diag, upper : per-slice coefficients, indexed by s in [0,Ls)
+  const int Ls = this->Ls;
+  GridBase *grid = psi._grid;
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard = psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto scratch = psi._odata[0]; // per-block temporary, rewritten by every projection call
+    for(int s=0;s<Ls;s++){
+      // Neighbours wrap within the block; the Ls>1 guard gives the s==0 rule priority when Ls==1.
+      const int s_fwd = (s==Ls-1 && Ls>1) ? 0    : s+1;
+      const int s_bwd = (s==0           ) ? Ls-1 : s-1;
+      const int site  = ss+s;
+
+      spProj5p(scratch,psi._odata[ss+s_fwd]);
+      chi[site]=diag[s]*phi[site]+upper[s]*scratch;
+
+      spProj5m(scratch,psi._odata[ss+s_bwd]);
+      chi[site]=chi[site]+lower[s]*scratch;
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+  // Site-loop-outermost MooeeInv: each block of Ls consecutive sites goes through four sweeps using lee, leem, dee, ueem, uee.
+  chi.checkerboard=psi.checkerboard;
+  // NOTE(review): the sweep labels suggest an LDU-factorised inverse of Mooee -- confirm where lee/leem/dee/ueem/uee are set.
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto tmp = psi._odata[0];
+    // tmp is per-block scratch; every spProj5p/spProj5m call below writes it before it is read.
+    // Apply (L^{\prime})^{-1}: forward recursion chi[s] = psi[s] - lee[s-1]*spProj5p(chi[s-1])
+    chi[ss]=psi[ss]; // chi[0]=psi[0]
+    for(int s=1;s<Ls;s++){
+                            spProj5p(tmp,chi[ss+s-1]);  
+      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
+    }
+    // L_m^{-1}: fold the spProj5m part of slices 0..Ls-2 into the last slice
+    for (int s=0;s<Ls-1;s++){ // chi[Ls-1] -= sum_{s<Ls-1} leem[s]*spProj5m(chi[s])
+                                   spProj5m(tmp,chi[ss+s]);    
+      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
+    }
+    // U_m^{-1} D^{-1}: rescale slices 0..Ls-2 and subtract the (still unscaled) last slice
+    for (int s=0;s<Ls-1;s++){
+      // chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1])*spProj5p(chi[Ls-1])
+                                                spProj5p(tmp,chi[ss+Ls-1]); 
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1]; // last slice is rescaled only after the loop above has read it
+      
+    // Apply U^{-1}: backward recursion chi[s] -= uee[s]*spProj5m(chi[s+1])
+    for (int s=Ls-2;s>=0;s--){
+                            spProj5m(tmp,chi[ss+s+1]);  
+      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  // Daggered counterpart of MooeeInv, site loop outermost: the same four sweeps per block of
+  // Ls sites, with (lee,leem) and (uee,ueem) exchanged and spProj5p <-> spProj5m.
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+
+  // chi inherits psi's checkerboard; there is no second input field to check it against.
+  chi.checkerboard=psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto tmp = psi._odata[0]; // per-block scratch, written by spProj5m/p before each use
+
+    // Apply (U^{\prime})^{-dagger}: forward recursion chi[s] = psi[s] - uee[s-1]*spProj5m(chi[s-1])
+    chi[ss]=psi[ss];
+    for (int s=1;s<Ls;s++){
+      spProj5m(tmp,chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
+    }
+    // U_m^{-\dagger}: chi[Ls-1] -= sum_{s<Ls-1} ueem[s]*spProj5p(chi[s])
+    for (int s=0;s<Ls-1;s++){
+      spProj5p(tmp,chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
+    }
+
+    // L_m^{-\dagger} D^{-dagger}: chi[s] = chi[s]/dee[s] - (leem[s]/dee[Ls-1])*spProj5m(chi[Ls-1])
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
+    }
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1]; // rescaled only after the loop above has read it
+
+    // Apply L^{-dagger}: backward recursion chi[s] -= lee[s]*spProj5p(chi[s+1])
+    for (int s=Ls-2;s>=0;s--){
+      spProj5p(tmp,chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
+    }
+  }
+}
+
+#ifdef CAYLEY_DPERP_CACHE // emit the explicit instantiations from this file only when this implementation is selected
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+#endif // CAYLEY_DPERP_CACHE
+
+}}
--- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
@@ -0,0 +1,133 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Eigen/Dense>
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines
+   */
+
+  /*
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
+}
+  
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
+}
+  */
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0];
+  int vol = psi._grid->oSites()/LLs;
+  // Applies two dense Ls x Ls matrices (one per chirality) along s; inv selects the inverse, dag the adjoint.
+  chi.checkerboard=psi.checkerboard;
+  // This variant requires the reduced s-extent of the grid to equal Ls.
+  assert(Ls==LLs);
+  
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  // Both matrices carry bee[s] on the diagonal.
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  // Pminus: super-diagonal -cee[s].
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  // Pplus: sub-diagonal -cee[s+1].
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];     // wrap-around corner entries carry the mass
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  // Select the matrix itself or its dense inverse.
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  // The adjoint is taken after the optional inversion.
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple: a serial loop over blocks of Ls sites
+  
+  for(auto site=0;site<vol;site++){
+    
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+    // Row s1 of the result: sum over columns s2, skipping matrix entries that are exactly zero.
+    for(int s1=0;s1<Ls;s1++){
+      SiteChi =zero;
+      for(int s2=0;s2<Ls;s2++){
+	int lex2 = s2+Ls*site;
+	
+	if ( PplusMat(s1,s2) != 0.0 ) {
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi,PplusMat (s1,s2)*SitePplus);
+	}
+	
+	if ( PminusMat(s1,s2) != 0.0 ) {
+	  spProj5m(SitePminus,psi[lex2]);
+	  accumRecon5m(SiteChi,PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5; // NOTE(review): 0.5 presumably normalises the spProj5/accumRecon5 pair -- confirm
+    }
+  }
+}
+
+template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); // explicit instantiations: Wilson and Gparity impls, F and D
+template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+}}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@@ -0,0 +1,149 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus forwards
+  // Pplus  backwards
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  // Slice-at-a-time hopping term in s. For every slice s, two helper calls:
+  //   axpby_ssp_pminus : slice s of chi from diag[s]*phi (slice s) and upper[s]*psi (slice s_fwd)
+  //   axpby_ssp_pplus  : adds lower[s]*psi (slice s_bwd) onto slice s of chi
+  // (the projector applied is implied by the helper names -- confirm against their definitions)
+  const int Ls=this->Ls;
+  for(int s=0;s<Ls;s++){
+    // Neighbours wrap at the two ends; the Ls>1 guard gives the s==0 rule priority when Ls==1.
+    const int s_fwd = (s==Ls-1 && Ls>1) ? 0    : s+1;
+    const int s_bwd = (s==0           ) ? Ls-1 : s-1;
+
+    axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s_fwd);
+    axpby_ssp_pplus (chi,1.0    ,chi,lower[s],psi,s,s_bwd);
+  }
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  // Slice-at-a-time counterpart of M5D with the two helpers exchanged. For every slice s:
+  //   axpby_ssp_pplus  : slice s of chi from diag[s]*phi (slice s) and upper[s]*psi (slice s_fwd)
+  //   axpby_ssp_pminus : adds lower[s]*psi (slice s_bwd) onto slice s of chi
+  // (the projector applied is implied by the helper names -- confirm against their definitions)
+  const int Ls=this->Ls;
+  for(int s=0;s<Ls;s++){
+    // Neighbours wrap at the two ends; the Ls>1 guard gives the s==0 rule priority when Ls==1.
+    const int s_fwd = (s==Ls-1 && Ls>1) ? 0    : s+1;
+    const int s_bwd = (s==0           ) ? Ls-1 : s-1;
+
+    axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s_fwd);
+    axpby_ssp_pminus(chi,1.0    ,chi,lower[s],psi,s,s_bwd);
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  // Slice-at-a-time MooeeInv: four sweeps over s built from the axpby_ssp* helpers and lee, leem, dee, ueem, uee.
+  const int Ls   = this->Ls;
+  const int last = Ls-1;
+  chi.checkerboard = psi.checkerboard;
+  // (L')^{-1}: seed slice 0 of chi from psi, then recurse upwards using lee
+  axpby_ssp(chi,1.0,psi,0.0,psi,0,0);
+  for (int prev=0;prev<last;prev++){
+    axpby_ssp_pplus(chi,1.0,psi,-lee[prev],chi,prev+1,prev);
+  }
+  // L_m^{-1}: accumulate every lower slice into the last one using leem
+  for (int s=0;s<last;s++){
+    axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,last,s);
+  }
+  // U_m^{-1} D^{-1}: rescale the slices below the last, then the last slice itself
+  for (int s=0;s<last;s++){
+    axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[last],chi,s,last);
+  }
+  axpby_ssp(chi,1.0/dee[last],chi,0.0,chi,last,last);
+  // U^{-1}: recurse downwards from the last slice using uee
+  for (int next=last;next>0;next--){
+    axpby_ssp_pminus(chi,1.0,chi,-uee[next-1],chi,next-1,next);
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  // Slice-at-a-time MooeeInvDag: the MooeeInv sweeps with (lee,leem) <-> (uee,ueem) and pplus <-> pminus.
+  const int Ls = this->Ls, last = Ls-1;
+  chi.checkerboard = psi.checkerboard;
+  // (U')^{-dagger}: seed slice 0 of chi from psi, then recurse upwards using uee
+  axpby_ssp(chi,1.0,psi,0.0,psi,0,0);
+  for (int prev=0;prev<last;prev++){
+    axpby_ssp_pminus(chi,1.0,psi,-uee[prev],chi,prev+1,prev);
+  }
+  // U_m^{-dagger}: accumulate every lower slice into the last one using ueem
+  for (int s=0;s<last;s++){
+    axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,last,s);
+  }
+  // L_m^{-dagger} D^{-dagger}: rescale the slices below the last, then the last slice itself
+  for (int s=0;s<last;s++){
+    axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[last],chi,s,last);
+  }
+  axpby_ssp(chi,1.0/dee[last],chi,0.0,chi,last,last);
+  // L^{-dagger}: recurse downwards from the last slice using lee
+  for (int next=last;next>0;next--){
+    axpby_ssp_pplus(chi,1.0,chi,-lee[next-1],chi,next-1,next);
+  }
+}
+
+
+#ifdef CAYLEY_DPERP_LINALG // instantiate from this file only when the slice-at-a-time implementation is selected
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+#endif // CAYLEY_DPERP_LINALG
+
+}
+}
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -0,0 +1,305 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Eigen/Dense>
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines: MooeeInv / MooeeInvDag forward to MooeeInternal with inv=InverseYes
+   */
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes); // dag=DaggerYes, inv=InverseYes
+}
+  
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes); // dag=DaggerNo, inv=InverseYes: inverse without the adjoint
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+  // M5D for an s-direction spread over SIMD lanes: LLs vector slices per block, Ls = LLs*nsimd (asserted below).
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+  // u, l, d hold upper, lower, diag repacked into one SIMD word per vector slice.
+  assert(Ls/LLs==nsimd);
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun: view each coefficient vector as a flat array of scalars
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+  // Scalar slot o*nsimd+i receives the coefficient of slice s = o+i*LLs (assumes an iSinglet<Simd> is exactly nsimd scalars).
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs;
+    int ss = o*nsimd+i;
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+    // Per-iteration scratch: half spinors for the projections, full spinors for the reconstructions.
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+      // Cyclic neighbours of vector slice v within the block.
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+      // When the neighbour index wrapped, rotate the SIMD word (presumably moving the data to the adjacent lane -- confirm rotate's direction).
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      // NOTE(review): the 0.5 presumably normalises the spProj5/spRecon5 round trip -- confirm.
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);
+      // chi = d*phi + u*(forward term) + l*(backward term), lane by lane.
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+  }
+}
+
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+  // M5Ddag for an s-direction spread over SIMD lanes: same structure as M5D with the 5p / 5m operations exchanged.
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+  // u, l, d hold upper, lower, diag repacked into one SIMD word per vector slice.
+  assert(Ls/LLs==nsimd);
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun: view each coefficient vector as a flat array of scalars
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+  // Scalar slot o*nsimd+i receives the coefficient of slice s = o+i*LLs (assumes an iSinglet<Simd> is exactly nsimd scalars).
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs;
+    int ss = o*nsimd+i;
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+    // Per-iteration scratch: half spinors for the projections, full spinors for the reconstructions.
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+      // Cyclic neighbours of vector slice v within the block.
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+      // When the neighbour index wrapped, rotate the SIMD word (presumably moving the data to the adjacent lane -- confirm rotate's direction).
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      // NOTE(review): the 0.5 presumably normalises the spProj5/spRecon5 round trip -- confirm.
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+      // chi = d*phi + u*(forward term) + l*(backward term), lane by lane.
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0];
+  int vol = psi._grid->oSites()/LLs;
+  // Applies two dense Ls x Ls matrices (one per chirality) along an s-direction spread over SIMD lanes.
+  chi.checkerboard=psi.checkerboard;
+  // NOTE(review): unlike M5D in this file, Ls == LLs*Nsimd is not asserted here although the lane packing below relies on it.
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  // Both matrices carry bee[s] on the diagonal.
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  // Pminus: super-diagonal -cee[s].
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  // Pplus: sub-diagonal -cee[s+1].
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];     // wrap-around corner entries carry the mass
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  // Select the matrix itself or its dense inverse.
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  // The adjoint is taken after the optional inversion.
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+  // Repack the matrices into SIMD words: lane l of Mat[LLs*s2+s1] holds entry (row l*LLs+s1, column s2).
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd=Simd::Nsimd();
+  Vector<iSinglet<Simd> > Matp(Ls*LLs);
+  Vector<iSinglet<Simd> > Matm(Ls*LLs);
+
+  for(int s2=0;s2<Ls;s2++){
+  for(int s1=0;s1<LLs;s1++){
+    int istride = LLs;
+    int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      scalar_type *sp = (scalar_type *)&Vp;
+      scalar_type *sm = (scalar_type *)&Vm;
+      for(int l=0;l<Nsimd;l++){
+	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
+	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      }
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }
+  }
+  
+  // Per-thread scratch is declared inside the loop body. The stack (alloca) variant is commented out below; Vector is used instead.
+PARALLEL_FOR_LOOP
+  for(auto site=0;site<vol;site++){
+    
+    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
+
+    Vector<SiteHalfSpinor> SitePplus(LLs);
+    Vector<SiteHalfSpinor> SitePminus(LLs);
+    Vector<SiteHalfSpinor> SiteChiP(LLs);
+    Vector<SiteHalfSpinor> SiteChiM(LLs);
+    Vector<SiteSpinor>     SiteChi(LLs);
+
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+    // Project the LLs vector slices of this block once and clear the accumulators.
+    for(int s=0;s<LLs;s++){
+      int lex = s+LLs*site;
+      spProj5p(SitePplus[s] ,psi[lex]);
+      spProj5m(SitePminus[s],psi[lex]);
+      SiteChiP[s]=zero;
+      SiteChiM[s]=zero;
+    }
+    // s counts the (lane l, slice s2) pairs in order, s = l*LLs+s2, and selects the matrix column.
+    int s=0;
+    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
+	vbroadcast(BcastP,SitePplus [s2],l);
+	vbroadcast(BcastM,SitePminus[s2],l);
+	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
+	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
+	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
+	}
+      s++;
+    }}
+    // Rebuild full spinors from the two chiral halves and store them.
+    for(int s=0;s<LLs;s++){
+      int lex = s+LLs*site;
+      spRecon5p(SiteChi[s],SiteChiP[s]);
+      accumRecon5m(SiteChi[s],SiteChiM[s]);
+      chi[lex] = SiteChi[s]*0.5;
+    }
+  }
+}
+
+INSTANTIATE_DPERP(DomainWallVec5dImplD);
+INSTANTIATE_DPERP(DomainWallVec5dImplF);
+// MooeeInternal is not part of INSTANTIATE_DPERP, so it is instantiated explicitly for the same two impls.
+template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+}}
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -75,7 +75,7 @@ namespace Grid {
    //
    //
    // template<class Impl>
-    // class MyOp : pubic<Impl> { 
+    // class MyOp : public<Impl> { 
    // public:
    //
    //    INHERIT_ALL_IMPL_TYPES(Impl);
@@ -99,7 +99,7 @@ namespace Grid {
    typedef typename Impl::SiteSpinor               SiteSpinor;		\
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
    typedef typename Impl::Compressor               Compressor;		\
-    typedef typename Impl::StencilImpl              StencilImpl;	\
+    typedef typename Impl::StencilImpl             StencilImpl;	  \
    typedef typename Impl::ImplParams ImplParams;

 #define INHERIT_IMPL_TYPES(Base) \
@@ -110,9 +110,11 @@ namespace Grid {
    // Single flavour four spinors with colour index
    ///////
    template<class S,int Nrepresentation=Nc>
-    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { 
    public:

+      const bool LsVectorised=false;
+
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -191,8 +193,10 @@ PARALLEL_FOR_LOOP
    // Single flavour four spinors with colour index, 5d redblack
    ///////
    template<class S,int Nrepresentation=Nc>
-    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
+    
+      const bool LsVectorised=true;

      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

@@ -221,7 +225,7 @@ PARALLEL_FOR_LOOP

      ImplParams Params;

-      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      DomainWallVec5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 

      bool overlapCommsCompute(void) { return false; };
    
@@ -287,6 +291,8 @@ PARALLEL_FOR_LOOP
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:

+      const bool LsVectorised=false;
+
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -446,10 +452,10 @@ PARALLEL_FOR_LOOP
 	// DhopDir provides U or Uconj depending on coor/flavour.
 	GaugeLinkField link(mat._grid);
 	// use lorentz for flavour as hack.
-	auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde,A));  
 PARALLEL_FOR_LOOP
-        for(auto ss=tmp.begin();ss<tmp.end();ss++){
-	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ;
+        for(auto ss=link.begin();ss<link.end();ss++){
+	  auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[ss],A[ss]));  
+	  link[ss]() = ttmp(0,0) + conjugate(ttmp(1,1)) ; 
 	}
 	PokeIndex<LorentzIndex>(mat,link,mu);
 	return;
@@ -477,9 +483,9 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

-    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
-    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
-    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double

    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -48,9 +48,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 				       GridRedBlackCartesian &FourDimRedBlackGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
-  _FiveDimGrid(&FiveDimGrid),
+  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid(&FourDimGrid),
+  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
@@ -62,60 +62,83 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 {
-  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._checker_dim==1);
+  if (Impl::LsVectorised) { 

-  // Dimension zero of the five-d is the Ls direction
-  Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
-  assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimGrid._simd_layout[0]        ==1);
+    int nsimd = Simd::Nsimd();
+    
+    // some assertions
+    assert(FiveDimGrid._ndimension==5);
+    assert(FiveDimRedBlackGrid._ndimension==5);
+    assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+    assert(FourDimGrid._ndimension==4);

-  // Other dimensions must match the decomposition of the four-D fields 
-  for(int d=0;d<4;d++){
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    // Dimension zero of the five-d is the Ls direction
+    Ls=FiveDimGrid._fdimensions[0];
+    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FiveDimGrid._simd_layout[0]        ==nsimd);

-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    // Other dimensions must match the four-D grids; dimension 0 (Ls) holds all nsimd lanes, so every 4d simd_layout entry must be 1
+    for(int d=0;d<4;d++){
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+      
+      assert(FourDimGrid._simd_layout[d]==1);         // compare with '==': a bare '=' overwrites the layout and can never fail
+      assert(FourDimRedBlackGrid._simd_layout[d]==1); // likewise a comparison, not an assignment
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    }
+
+  } else {
+
+    // some assertions
+    assert(FiveDimGrid._ndimension==5);
+    assert(FourDimGrid._ndimension==4);
+    assert(FiveDimRedBlackGrid._ndimension==5);
+    assert(FourDimRedBlackGrid._ndimension==4);
+    assert(FiveDimRedBlackGrid._checker_dim==1);
+    
+    // Dimension zero of the five-d is the Ls direction
+    Ls=FiveDimGrid._fdimensions[0];
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
+    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FiveDimGrid._simd_layout[0]        ==1);
+    
+    // Other dimensions must match the decomposition of the four-D fields 
+    for(int d=0;d<4;d++){
+      assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+      
+      assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+      
+      assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+      
+      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    }
  }
-
+    
  // Allocate the required comms buffer
  ImportGauge(_Umu);
-}  
-
+}
+  /*
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 				       GridCartesian         &FiveDimGrid,
 				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 				       GridCartesian         &FourDimGrid,
 				       RealD _M5,const ImplParams &p) :
-  Kernels(p),
-  _FiveDimGrid        (&FiveDimGrid),
-  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid        (&FourDimGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
-  M5(_M5),
-  Umu(_FourDimGrid),
-  UmuEven(_FourDimGrid),
-  UmuOdd (_FourDimGrid),
-  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimGrid)
 {
  int nsimd = Simd::Nsimd();

@@ -148,15 +171,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
  }

  {
-    GaugeField HUmu(_Umu._grid);
-    HUmu = _Umu*(-0.5);
-    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
-    UmuEven=Umu;// Really want a reference.
-    UmuOdd =Umu;
  }
 }  
-
-
+  */
+     
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -376,8 +394,6 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag

 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
-template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
-template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
  
 }}

--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -125,12 +125,14 @@ namespace Grid {
 		      double _M5,const ImplParams &p= ImplParams());

      // Constructors
+      /*
      WilsonFermion5D(int simd, 
 		      GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      double _M5,const ImplParams &p= ImplParams());
+      */

      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -572,7 +572,4 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,

  FermOpTemplateInstantiate(WilsonKernels);

-template class WilsonKernels<DomainWallRedBlack5dImplF>;		
-template class WilsonKernels<DomainWallRedBlack5dImplD>;
-
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -90,7 +90,7 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@@ -110,10 +110,10 @@ template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -867,16 +867,16 @@ template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(Stencil
 									 int ss,int sU,const FermionField &in, FermionField &out);


-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);

--- a/lib/qcd/action/fermion/WilsonTMFermion.h
+++ b/lib/qcd/action/fermion/WilsonTMFermion.h
@@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_TM_FERMION_H
 #define  GRID_QCD_WILSON_TM_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/gauge/GaugeImpl.h
+++ b/lib/qcd/action/gauge/GaugeImpl.h
@@ -1,181 +1,188 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/action/gauge/GaugeImpl.h
+Source file: ./lib/qcd/action/gauge/GaugeImpl.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_GAUGE_IMPL_H
-#define  GRID_QCD_GAUGE_IMPL_H
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_GAUGE_IMPL_H
+#define GRID_QCD_GAUGE_IMPL_H

 namespace Grid {

-  namespace QCD {
+namespace QCD {

-    
-    ////////////////////////////////////////////////////////////////////////
-    // Implementation dependent gauge types
-    ////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Implementation dependent gauge types
+////////////////////////////////////////////////////////////////////////

-template<class Gimpl> class WilsonLoops;
+template <class Gimpl> class WilsonLoops;

-#define INHERIT_GIMPL_TYPES(GImpl) \
-    typedef typename GImpl::Simd                           Simd;\
-    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
-    typedef typename GImpl::GaugeField               GaugeField;\
-    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
-    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;
+#define INHERIT_GIMPL_TYPES(GImpl)                                             \
+  typedef typename GImpl::Simd Simd;                                           \
+  typedef typename GImpl::GaugeLinkField GaugeLinkField;                       \
+  typedef typename GImpl::GaugeField GaugeField;                               \
+  typedef typename GImpl::SiteGaugeField SiteGaugeField;                       \
+  typedef typename GImpl::SiteGaugeLink SiteGaugeLink;

+//
+template <class S, int Nrepresentation = Nc> class GaugeImplTypes {
+public:
+  typedef S Simd;

-    // 
-    template<class S,int Nrepresentation=Nc>
-    class GaugeImplTypes { 
-    public:
-    
-      typedef S Simd;
-    
-      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
-      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd  >;
-    
-      typedef iImplGaugeLink    <Simd>           SiteGaugeLink;
-      typedef iImplGaugeField   <Simd>           SiteGaugeField;
-    
-      typedef Lattice<SiteGaugeLink>                GaugeLinkField; // bit ugly naming; polarised gauge field, lorentz... all ugly
-      typedef Lattice<SiteGaugeField>                   GaugeField;
+  template <typename vtype>
+  using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation>>>;
+  template <typename vtype>
+  using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation>>, Nd>;

-    };
+  typedef iImplGaugeLink<Simd> SiteGaugeLink;
+  typedef iImplGaugeField<Simd> SiteGaugeField;

-    // Composition with smeared link, bc's etc.. probably need multiple inheritance
-    // Variable precision "S" and variable Nc
-    template<class GimplTypes>
-    class PeriodicGaugeImpl : public GimplTypes  { 
-    public:
+  typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised
+                                                 // gauge field, lorentz... all
+                                                 // ugly
+  typedef Lattice<SiteGaugeField> GaugeField;

-    INHERIT_GIMPL_TYPES(GimplTypes);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Support needed for the assembly of loops including all boundary condition effects such as conjugate bcs
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    
-      template<class covariant>  static inline
-      Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
-	return PeriodicBC::CovShiftForward(Link,mu,field);
-      }
-
-      template<class covariant> static inline
-      Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
-	return PeriodicBC::CovShiftBackward(Link,mu,field);
-      }
-      static inline
-      GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-	return Cshift(adj(Link),mu,-1);
-      }
-      static inline
-      GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-	return Link;
-      }
-      static inline
-      GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-	return Cshift(Link,mu,1);
-      }
-
-      static inline bool isPeriodicGaugeField(void) {
-	return true;
-      }
-
-    };
-
-    
-    // Composition with smeared link, bc's etc.. probably need multiple inheritance
-    // Variable precision "S" and variable Nc
-    template<class GimplTypes>
-    class ConjugateGaugeImpl : public GimplTypes { 
-    public:
-
-      INHERIT_GIMPL_TYPES(GimplTypes);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Support needed for the assembly of loops including all boundary condition effects such as Gparity.
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class covariant>  static
-    Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
-      return ConjugateBC::CovShiftForward(Link,mu,field);
+  // Move this elsewhere?
+  static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
+                                  int mu) { // U[mu] += W
+    PARALLEL_FOR_LOOP
+    for (auto ss = 0; ss < U._grid->oSites(); ss++) {
+      U._odata[ss]._internal[mu] =
+          U._odata[ss]._internal[mu] + W._odata[ss]._internal;
    }
-
-    template<class covariant> static
-    Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
-      return ConjugateBC::CovShiftBackward(Link,mu,field);
-    }
-
-    static inline
-    GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-      GridBase *grid = Link._grid;
-      int Lmu = grid->GlobalDimensions()[mu]-1;
-      
-      Lattice<iScalar<vInteger> > coor(grid);    LatticeCoordinate(coor,mu);
-
-      GaugeLinkField tmp (grid);
-      tmp=adj(Link);
-      tmp = where(coor==Lmu,conjugate(tmp),tmp);
-      return Cshift(tmp,mu,-1);// moves towards positive mu
-    }
-    static inline
-    GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-      return Link;
-    }
-
-    static inline
-    GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-      GridBase *grid = Link._grid;
-      int Lmu = grid->GlobalDimensions()[mu]-1;
-      
-      Lattice<iScalar<vInteger> > coor(grid);    LatticeCoordinate(coor,mu);
-
-      GaugeLinkField tmp (grid);
-      tmp=Cshift(Link,mu,1);
-      tmp=where(coor==Lmu,conjugate(tmp),tmp);
-      return tmp;
-    }
-
-    static inline bool isPeriodicGaugeField(void) {
-      return false;
-    }
-    
-    };
-
-    typedef GaugeImplTypes<vComplex,Nc>     GimplTypesR;
-    typedef GaugeImplTypes<vComplexF,Nc>    GimplTypesF;
-    typedef GaugeImplTypes<vComplexD,Nc>    GimplTypesD;
-
-    typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
-    typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
-    typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
-
-    typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever prec
-    typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
-    typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
-
  }
+};
+
+// Composition with smeared link, bc's etc.. probably need multiple inheritance
+// Variable precision "S" and variable Nc
+template <class GimplTypes> class PeriodicGaugeImpl : public GimplTypes { // static helpers only; covariant shifts delegate to PeriodicBC
+public:
+  INHERIT_GIMPL_TYPES(GimplTypes); // re-export Simd and the gauge field/link typedefs from GimplTypes
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Support needed for the assembly of loops including all boundary condition
+  // effects such as conjugate bcs
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  template <class covariant>
+  static inline Lattice<covariant>
+  CovShiftForward(const GaugeLinkField &Link, int mu,
+                  const Lattice<covariant> &field) {
+    return PeriodicBC::CovShiftForward(Link, mu, field); // pure forwarding, no extra boundary handling here
+  }
+
+  template <class covariant>
+  static inline Lattice<covariant>
+  CovShiftBackward(const GaugeLinkField &Link, int mu,
+                   const Lattice<covariant> &field) {
+    return PeriodicBC::CovShiftBackward(Link, mu, field); // pure forwarding, no extra boundary handling here
+  }
+  static inline GaugeLinkField
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
+    return Cshift(adj(Link), mu, -1); // adjoint of the link, Cshift-ed by -1 in direction mu
+  }
+  static inline GaugeLinkField
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
+    return Link; // returned unchanged; mu is not used
+  }
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
+    return Cshift(Link, mu, 1); // Cshift by +1 in direction mu, no boundary modification
+  }
+
+  static inline bool isPeriodicGaugeField(void) { return true; } // this implementation always reports periodic
+};
+
+// Composition with smeared link, bc's etc.. probably need multiple inheritance
+// Variable precision "S" and variable Nc
+template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes { // static helpers only; covariant shifts delegate to ConjugateBC
+public:
+  INHERIT_GIMPL_TYPES(GimplTypes); // re-export Simd and the gauge field/link typedefs from GimplTypes
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Support needed for the assembly of loops including all boundary condition
+  // effects such as Gparity.
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  template <class covariant>
+  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
+                                            const Lattice<covariant> &field) {
+    return ConjugateBC::CovShiftForward(Link, mu, field); // pure forwarding to the conjugate-BC implementation
+  }
+
+  template <class covariant>
+  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
+                                             const Lattice<covariant> &field) {
+    return ConjugateBC::CovShiftBackward(Link, mu, field); // pure forwarding to the conjugate-BC implementation
+  }
+
+  static inline GaugeLinkField
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
+    GridBase *grid = Link._grid;
+    int Lmu = grid->GlobalDimensions()[mu] - 1; // largest global coordinate in direction mu
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu); // presumably fills coor with each site's mu coordinate -- confirm against LatticeCoordinate
+
+    GaugeLinkField tmp(grid);
+    tmp = adj(Link);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp); // conjugate(tmp) where coor == Lmu, tmp elsewhere (assumes where(pred, iftrue, iffalse) -- confirm)
+    return Cshift(tmp, mu, -1); // moves towards positive mu
+  }
+  static inline GaugeLinkField
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
+    return Link; // returned unchanged; mu is not used
+  }
+
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
+    GridBase *grid = Link._grid;
+    int Lmu = grid->GlobalDimensions()[mu] - 1; // largest global coordinate in direction mu
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu); // presumably fills coor with each site's mu coordinate -- confirm against LatticeCoordinate
+
+    GaugeLinkField tmp(grid);
+    tmp = Cshift(Link, mu, 1); // shift first; the conjugation below is applied to the shifted field
+    tmp = where(coor == Lmu, conjugate(tmp), tmp); // conjugate(tmp) where coor == Lmu, tmp elsewhere (assumes where(pred, iftrue, iffalse) -- confirm)
+    return tmp;
+  }
+
+  static inline bool isPeriodicGaugeField(void) { return false; } // this implementation always reports non-periodic
+};
+
+typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
+typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
+typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
+
+typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
+typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
+typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
+
+typedef ConjugateGaugeImpl<GimplTypesR>
+    ConjugateGimplR; // Real.. whichever prec
+typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
+typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
+}
 }

 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -1,212 +1,214 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H

-namespace Grid{
-  namespace QCD{
+namespace Grid {
+namespace QCD {

-    ///////////////////////////////////////
-    // One flavour rational
-    ///////////////////////////////////////
+///////////////////////////////////////
+// One flavour rational
+///////////////////////////////////////

-    // S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+// S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+//
+// Here, M is some operator
+// N and D make up the rational polynomial
+//
+
+template <class Impl>
+class OneFlavourEvenOddRationalPseudoFermionAction
+    : public Action<typename Impl::GaugeField> {
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+
+  typedef OneFlavourRationalParams Params;
+  Params param;
+
+  MultiShiftFunction PowerHalf;
+  MultiShiftFunction PowerNegHalf;
+  MultiShiftFunction PowerQuarter;
+  MultiShiftFunction PowerNegQuarter;
+
+ private:
+  FermionOperator<Impl> &FermOp;  // the basic operator
+
+  // NOT using "Nroots" (IroIro does) -- perhaps later.
+  // Historically this was not good for us,
+  // and Hasenbusch works better.
+
+  FermionField PhiEven;  // the pseudo fermion field for this trajectory
+  FermionField PhiOdd;   // the pseudo fermion field for this trajectory
+
+ public:
+  OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl> &Op,
+                                               Params &p)
+      : FermOp(Op),
+        PhiEven(Op.FermionRedBlackGrid()),
+        PhiOdd(Op.FermionRedBlackGrid()),
+        param(p) {
+    AlgRemez remez(param.lo, param.hi, param.precision);
+
+    // MdagM^(+- 1/2)
+    std::cout << GridLogMessage << "Generating degree " << param.degree
+              << " for x^(1/2)" << std::endl;
+    remez.generateApprox(param.degree, 1, 2);
+    PowerHalf.Init(remez, param.tolerance, false);
+    PowerNegHalf.Init(remez, param.tolerance, true);
+
+    // MdagM^(+- 1/4)
+    std::cout << GridLogMessage << "Generating degree " << param.degree
+              << " for x^(1/4)" << std::endl;
+    remez.generateApprox(param.degree, 1, 4);
+    PowerQuarter.Init(remez, param.tolerance, false);
+    PowerNegQuarter.Init(remez, param.tolerance, true);
+  };
+
+  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
+    //        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
+    // Phi = MpcdagMpc^{1/4} eta
    //
-    // Here, M is some operator 
-    // N and D makeup the rat. poly 
+    // P(eta) = e^{- eta^dag eta}
    //
-  
-    template<class Impl>
-    class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+    // e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+    //
+    // So eta should be of width sig = 1/sqrt(2).

-      typedef OneFlavourRationalParams Params;
-      Params param;
+    RealD scale = std::sqrt(0.5);

-      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
-      MultiShiftFunction PowerQuarter;
-      MultiShiftFunction PowerNegQuarter;
+    FermionField eta(FermOp.FermionGrid());
+    FermionField etaOdd(FermOp.FermionRedBlackGrid());
+    FermionField etaEven(FermOp.FermionRedBlackGrid());

-    private:
-     
-      FermionOperator<Impl> & FermOp;// the basic operator
+    gaussian(pRNG, eta);
+    eta = eta * scale;

-      // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
-      // and hasenbusch works better
+    pickCheckerboard(Even, etaEven, eta);
+    pickCheckerboard(Odd, etaOdd, eta);

-      FermionField PhiEven; // the pseudo fermion field for this trajectory
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
-                        
+    FermOp.ImportGauge(U);

-    public:
+    // multishift CG
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerQuarter);
+    msCG(Mpc, etaOdd, PhiOdd);

-      OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
-						   Params & p ) : FermOp(Op), 
-	PhiEven(Op.FermionRedBlackGrid()), 
-	PhiOdd (Op.FermionRedBlackGrid()), 
-	param(p) 
-      {
-	AlgRemez remez(param.lo,param.hi,param.precision);
+    //////////////////////////////////////////////////////
+    // FIXME : Clover term not yet..
+    //////////////////////////////////////////////////////

-	// MdagM^(+- 1/2)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-	remez.generateApprox(param.degree,1,2);
-	PowerHalf.Init(remez,param.tolerance,false);
-	PowerNegHalf.Init(remez,param.tolerance,true);
+    assert(FermOp.ConstEE() == 1);
+    PhiEven = zero;
+  };

-	// MdagM^(+- 1/4)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
-	remez.generateApprox(param.degree,1,4);
-   	PowerQuarter.Init(remez,param.tolerance,false);
-	PowerNegQuarter.Init(remez,param.tolerance,true);
-      };
-      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+  //////////////////////////////////////////////////////
+  // S = phi^dag (Mdag M)^-1/2 phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+    FermOp.ImportGauge(U);

-	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
-	//        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
-	// Phi = MpcdagMpc^{1/4} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
+    FermionField Y(FermOp.FermionRedBlackGrid());

-	RealD scale = std::sqrt(0.5);
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);

-	FermionField eta    (FermOp.FermionGrid());
-	FermionField etaOdd (FermOp.FermionRedBlackGrid());
-	FermionField etaEven(FermOp.FermionRedBlackGrid());
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,
+                                                   PowerNegQuarter);

-	gaussian(pRNG,eta);	eta=eta*scale;
+    msCG(Mpc, PhiOdd, Y);

-	pickCheckerboard(Even,etaEven,eta);
-	pickCheckerboard(Odd,etaOdd,eta);
+    RealD action = norm2(Y);
+    std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 "
+                                   "solve or -1/2 solve faster??? "
+              << action << std::endl;

-	FermOp.ImportGauge(U);
+    return action;
+  };

-	// mutishift CG
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerQuarter);
-	msCG(Mpc,etaOdd,PhiOdd);
+  //////////////////////////////////////////////////////
+  // Need
+  // dS_f/dU = chi^dag   d[N/D]  chi
+  //
+  // N/D is expressed as partial fraction expansion:
+  //
+  //           a0 + \sum_k ak/(M^dagM + bk)
+  //
+  // d[N/D] is then
+  //
+  //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ]
+  //                     [M^dagM+bk]^{-1}
+  //
+  // Need
+  //       Mf Phi_k = [MdagM+bk]^{-1} Phi
+  //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
+  //
+  // With these building blocks
+  //
+  //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ]
+  //                           Mf Phi_k
+  //        S    = innerprodReal(Phi,Mf Phi);
+  //////////////////////////////////////////////////////
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
+    const int Npole = PowerNegHalf.poles.size();

-	//////////////////////////////////////////////////////
-	// FIXME : Clover term not yet..
-	//////////////////////////////////////////////////////
+    std::vector<FermionField> MPhi_k(Npole, FermOp.FermionRedBlackGrid());

-	assert(FermOp.ConstEE() == 1);
-	PhiEven = zero;
-	
-      };
+    FermionField X(FermOp.FermionRedBlackGrid());
+    FermionField Y(FermOp.FermionRedBlackGrid());

-      //////////////////////////////////////////////////////
-      // S = phi^dag (Mdag M)^-1/2 phi
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
+    GaugeField tmp(FermOp.GaugeGrid());

-	FermOp.ImportGauge(U);
+    FermOp.ImportGauge(U);

-	FermionField Y(FermOp.FermionRedBlackGrid());
-	
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);

-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegQuarter);
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerNegHalf);

-	msCG(Mpc,PhiOdd,Y);
+    msCG(Mpc, PhiOdd, MPhi_k);

-	RealD action = norm2(Y);
-	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
+    dSdU = zero;
+    for (int k = 0; k < Npole; k++) {
+      RealD ak = PowerNegHalf.residues[k];

-	return action;
-      };
+      X = MPhi_k[k];

-      //////////////////////////////////////////////////////
-      // Need
-      // dS_f/dU = chi^dag   d[N/D]  chi
-      //
-      // N/D is expressed as partial fraction expansion:
-      //
-      //           a0 + \sum_k ak/(M^dagM + bk)
-      //
-      // d[N/D] is then
-      //
-      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
-      //
-      // Need
-      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
-      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
-      //
-      // With these building blocks
-      //
-      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
-      //        S    = innerprodReal(Phi,Mf Phi);
-      //////////////////////////////////////////////////////
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+      Mpc.Mpc(X, Y);
+      Mpc.MpcDeriv(tmp, Y, X);
+      dSdU = dSdU + ak * tmp;
+      Mpc.MpcDagDeriv(tmp, X, Y);
+      dSdU = dSdU + ak * tmp;
+    }

-	const int Npole = PowerNegHalf.poles.size();
-
-	std::vector<FermionField> MPhi_k (Npole,FermOp.FermionRedBlackGrid());
-
-	FermionField X(FermOp.FermionRedBlackGrid());
-	FermionField Y(FermOp.FermionRedBlackGrid());
-
-	GaugeField   tmp(FermOp.GaugeGrid());
-
-	FermOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
-
-	msCG(Mpc,PhiOdd,MPhi_k);
-
-	dSdU = zero;
-	for(int k=0;k<Npole;k++){
-
-	  RealD ak = PowerNegHalf.residues[k];
-
-	  X  = MPhi_k[k];
-
-	  Mpc.Mpc(X,Y);
-	  Mpc.MpcDeriv   (tmp , Y, X );  dSdU=dSdU+ak*tmp;
-	  Mpc.MpcDagDeriv(tmp , X, Y );  dSdU=dSdU+ak*tmp;
-
-	}
-
-	dSdU = Ta(dSdU);
-
-      };
-    };
-  }
+    // dSdU = Ta(dSdU);
+  };
+};
+}
 }

-
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -256,7 +256,7 @@ namespace Grid{

 	}

-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };
    };
--- a/lib/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRational.h
@@ -186,7 +186,7 @@ namespace Grid{

 	}

-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };
    };
--- a/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@@ -242,7 +242,7 @@ namespace Grid{

 	}

-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };
    };
--- a/lib/qcd/action/pseudofermion/TwoFlavour.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavour.h
@@ -137,7 +137,7 @@ namespace Grid{
 	FermOp.MDeriv(tmp , Y, X,DaggerNo );  dSdU=tmp;
 	FermOp.MDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 	
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };

--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -100,7 +100,7 @@ namespace Grid{

 	PhiOdd =PhiOdd*scale;
 	PhiEven=PhiEven*scale;
-	
+
      };

      //////////////////////////////////////////////////////
@@ -173,7 +173,7 @@ namespace Grid{
 	FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 	*/
 	
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };

--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -188,8 +188,9 @@ namespace Grid{
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);

-	dSdU = -Ta(dSdU);
-
+	//dSdU = -Ta(dSdU);
+	dSdU = -dSdU;
+	
      };
    };
  }
--- a/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
@@ -155,7 +155,8 @@ namespace Grid{
 	DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
 	DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;

-	dSdU = - Ta(dSdU);
+	dSdU *= -1.0;
+	//dSdU = - Ta(dSdU);

      };
    };
--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@@ -1,33 +1,34 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/hmc/HMC.h
+Source file: ./lib/qcd/hmc/HMC.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 //--------------------------------------------------------------------
 /*! @file HMC.h
 * @brief Classes for Hybrid Monte Carlo update
@@ -41,172 +42,195 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #include <string>

+namespace Grid {
+namespace QCD {

-namespace Grid{
-  namespace QCD{
-    
+struct HMCparameters {
+  Integer StartTrajectory;
+  Integer Trajectories; /* @brief Number of sweeps in this run */
+  bool MetropolisTest;
+  Integer NoMetropolisUntil;

-    struct HMCparameters{
+  HMCparameters() {
+    ////////////////////////////// Default values
+    MetropolisTest = true;
+    NoMetropolisUntil = 10;
+    StartTrajectory = 0;
+    Trajectories = 200;
+    /////////////////////////////////
+  }

-      Integer StartTrajectory;
-      Integer Trajectories; /* @brief Number of sweeps in this run */
-      bool    MetropolisTest;
-      Integer NoMetropolisUntil;
+  void print() const {
+    std::cout << GridLogMessage << "[HMC parameter] Trajectories            : " << Trajectories << "\n";
+    std::cout << GridLogMessage << "[HMC parameter] Start trajectory        : " << StartTrajectory << "\n";
+    std::cout << GridLogMessage << "[HMC parameter] Metropolis test (on/off): " << MetropolisTest << "\n";
+    std::cout << GridLogMessage << "[HMC parameter] Thermalization trajs    : " << NoMetropolisUntil << "\n";
+  }
+  
+};

-      HMCparameters(){
-	////////////////////////////// Default values
-	MetropolisTest      = true;
-	NoMetropolisUntil   = 10;
-	StartTrajectory     = 0;
-	Trajectories        = 200;
-	/////////////////////////////////
-      }
-    };
+template <class GaugeField>
+class HmcObservable {
+ public:
+  virtual void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG) = 0;
+};

-    template<class GaugeField> 
-    class HmcObservable {
-    public:
-      virtual void TrajectoryComplete (int traj, GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG & pRNG )=0;
-    };
+template <class Gimpl>
+class PlaquetteLogger : public HmcObservable<typename Gimpl::GaugeField> {
+ private:
+  std::string Stem;

-    template<class Gimpl> 
-    class PlaquetteLogger : public HmcObservable<typename Gimpl::GaugeField> {
-    private:
-      std::string Stem;
-    public:
-      INHERIT_GIMPL_TYPES(Gimpl);
-      PlaquetteLogger(std::string cf) {
-        Stem  = cf;
-      };
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+  PlaquetteLogger(std::string cf) { Stem = cf; };

-      void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG & pRNG )
-      {
-	  std::string file;   { std::ostringstream os; os << Stem     <<"."<< traj; file = os.str(); }
-	  std::ofstream of(file);
+  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    std::string file;
+    {
+      std::ostringstream os;
+      os << Stem << "." << traj;
+      file = os.str();
+    }
+    std::ofstream of(file);

-	  RealD peri_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
-	  RealD peri_rect = WilsonLoops<PeriodicGimplR>::avgRectangle(U);
+    RealD peri_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+    RealD peri_rect = WilsonLoops<PeriodicGimplR>::avgRectangle(U);

-	  RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
-	  RealD impl_rect = WilsonLoops<Gimpl>::avgRectangle(U);
+    RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    RealD impl_rect = WilsonLoops<Gimpl>::avgRectangle(U);

-	  of << traj<<" "<< impl_plaq << " " << impl_rect << "  "<< peri_plaq<<" "<<peri_rect<<std::endl;
-	  std::cout<< GridLogMessage<< "traj"<<" "<< "plaq " << " " << " rect  " << "  "<< "peri_plaq" <<" "<<"peri_rect"<<std::endl;
-	  std::cout<< GridLogMessage<< traj<<" "<< impl_plaq << " " << impl_rect << "  "<< peri_plaq<<" "<<peri_rect<<std::endl;
-      }
-    };
+    of << traj << " " << impl_plaq << " " << impl_rect << "  " << peri_plaq
+       << " " << peri_rect << std::endl;
+    std::cout << GridLogMessage << "traj"
+              << " "
+              << "plaq "
+              << " "
+              << " rect  "
+              << "  "
+              << "peri_plaq"
+              << " "
+              << "peri_rect" << std::endl;
+    std::cout << GridLogMessage << traj << " " << impl_plaq << " " << impl_rect
+              << "  " << peri_plaq << " " << peri_rect << std::endl;
+  }
+};

-    //    template <class GaugeField, class Integrator, class Smearer, class Boundary> 
-    template <class GaugeField, class IntegratorType>
-    class HybridMonteCarlo {
-    private:
+//    template <class GaugeField, class Integrator,
+//              class Smearer, class Boundary>
+template <class GaugeField, class IntegratorType>
+class HybridMonteCarlo {
+ private:
+  const HMCparameters Params;

-      const HMCparameters Params;
-      
-      GridSerialRNG   &sRNG; // Fixme: need a RNG management strategy.
-      GridParallelRNG &pRNG; // Fixme: need a RNG management strategy.
-      GaugeField      & Ucur;
+  GridSerialRNG &sRNG;    // Fixme: need a RNG management strategy.
+  GridParallelRNG &pRNG;  // Fixme: need a RNG management strategy.
+  GaugeField &Ucur;

-      IntegratorType &TheIntegrator;
-      std::vector<HmcObservable<GaugeField> *> Observables;
+  IntegratorType &TheIntegrator;
+  std::vector<HmcObservable<GaugeField> *> Observables;

-      /////////////////////////////////////////////////////////
-      // Metropolis step
-      /////////////////////////////////////////////////////////
-      bool metropolis_test(const RealD DeltaH){
+  /////////////////////////////////////////////////////////
+  // Metropolis step
+  /////////////////////////////////////////////////////////
+  bool metropolis_test(const RealD DeltaH) {
+    RealD rn_test;

-	RealD rn_test;
+    RealD prob = std::exp(-DeltaH);

-	RealD prob = std::exp(-DeltaH);
+    random(sRNG, rn_test);

-	random(sRNG,rn_test);
-      
-	std::cout<<GridLogMessage<< "--------------------------------------------\n";
-	std::cout<<GridLogMessage<< "dH = "<<DeltaH << "  Random = "<< rn_test <<"\n";
-	std::cout<<GridLogMessage<< "Acc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
-      
-	if((prob >1.0) || (rn_test <= prob)){       // accepted
-	  std::cout<<GridLogMessage <<"-- ACCEPTED\n";
-	  return true;
-	} else {                               // rejected
-	  std::cout<<GridLogMessage <<"-- REJECTED\n";
-	  return false;
-	}
+    std::cout << GridLogMessage
+              << "--------------------------------------------------\n";
+    std::cout << GridLogMessage << "exp(-dH) = " << prob
+              << "  Random = " << rn_test << "\n";
+    std::cout << GridLogMessage
+              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";

+    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
+      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogMessage
+                << "--------------------------------------------------\n";
+      return true;
+    } else {  // rejected
+      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogMessage
+                << "--------------------------------------------------\n";
+      return false;
+    }
+  }
+
+  /////////////////////////////////////////////////////////
+  // Evolution
+  /////////////////////////////////////////////////////////
+  RealD evolve_step(GaugeField &U) {
+    TheIntegrator.refresh(U, pRNG);  // set U and initialize P and phi's
+
+    RealD H0 = TheIntegrator.S(U);  // initial state action
+
+    std::streamsize current_precision = std::cout.precision();
+    std::cout.precision(17);
+    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout.precision(current_precision);
+
+    TheIntegrator.integrate(U);
+
+    RealD H1 = TheIntegrator.S(U);  // updated state action
+
+    std::cout.precision(17);
+    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+              << "  dH = " << H1 - H0 << "\n";
+    std::cout.precision(current_precision);
+
+    return (H1 - H0);
+  }
+
+ public:
+  /////////////////////////////////////////
+  // Constructor
+  /////////////////////////////////////////
+  HybridMonteCarlo(HMCparameters Pams, IntegratorType &_Int,
+                   GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, GaugeField &_U)
+      : Params(Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Ucur(_U) {}
+  ~HybridMonteCarlo(){};
+
+  void AddObservable(HmcObservable<GaugeField> *obs) {
+    Observables.push_back(obs);
+  }
+
+  void evolve(void) {
+    Real DeltaH;
+
+    GaugeField Ucopy(Ucur._grid);
+
+    Params.print();
+
+    // Actual updates (evolve a copy Ucopy then copy back eventually)
+    for (int traj = Params.StartTrajectory;
+         traj < Params.Trajectories + Params.StartTrajectory; ++traj) {
+      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      Ucopy = Ucur;
+
+      DeltaH = evolve_step(Ucopy);
+
+      bool accept = true;
+      if (traj >= Params.NoMetropolisUntil) {
+        accept = metropolis_test(DeltaH);
      }

-      /////////////////////////////////////////////////////////
-      // Evolution
-      /////////////////////////////////////////////////////////
-      RealD evolve_step(GaugeField& U){
-
-	TheIntegrator.refresh(U,pRNG); // set U and initialize P and phi's 
-
-	RealD H0 = TheIntegrator.S(U); // initial state action  
-
-	std::cout<<GridLogMessage<<"Total H before = "<< H0 << "\n";
-
-	TheIntegrator.integrate(U);
-      
-	RealD H1 = TheIntegrator.S(U); // updated state action            
-
-	std::cout<<GridLogMessage<<"Total H after = "<< H1 << "\n";
-
-	return (H1-H0);
-      }
-      
-    public:
-
-      /////////////////////////////////////////
-      // Constructor
-      /////////////////////////////////////////
-      HybridMonteCarlo(HMCparameters Pms,  IntegratorType &_Int, GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, GaugeField &_U ) :
-        Params(Pms), 
-	TheIntegrator(_Int), 
-	sRNG(_sRNG),
-	pRNG(_pRNG),
-	Ucur(_U)
-      {
-      }
-      ~HybridMonteCarlo(){};
-
-      void AddObservable(HmcObservable<GaugeField> *obs) {
-	Observables.push_back(obs);
+      if (accept) {
+        Ucur = Ucopy;
      }

-      void evolve(void){
-
-	Real DeltaH;
-
-	GaugeField Ucopy(Ucur._grid);
-	
-	// Actual updates (evolve a copy Ucopy then copy back eventually)
-	for(int traj=Params.StartTrajectory; traj < Params.Trajectories+Params.StartTrajectory; ++traj){
-
-	  std::cout<<GridLogMessage << "-- # Trajectory = "<< traj <<  "\n";
-	  Ucopy = Ucur;
-
-	  DeltaH = evolve_step(Ucopy);
-
-	  bool accept = true;
-	  if ( traj > Params.NoMetropolisUntil) { 
-	    accept = metropolis_test(DeltaH);
-	  }
-	  
-	  if ( accept ) {
-	    Ucur = Ucopy;
-	  }
-
-	  for(int obs = 0;obs<Observables.size();obs++){
-	    Observables[obs]->TrajectoryComplete (traj+1,Ucur,sRNG,pRNG);
-	  }
-
-	}
+      for (int obs = 0; obs < Observables.size(); obs++) {
+        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-    };
-    
-  }// QCD
-}// Grid
+    }
+  }
+};

+}  // QCD
+}  // Grid

-#endif 
+#endif
--- a/lib/qcd/hmc/HmcRunner.h
+++ b/lib/qcd/hmc/HmcRunner.h
@@ -47,7 +47,7 @@ public:
  GridRedBlackCartesian * UrbGrid ;
  GridRedBlackCartesian * FrbGrid ;

-  virtual void BuildTheAction (int argc, char **argv) = 0;
+  virtual void BuildTheAction (int argc, char **argv) = 0; // necessary?

  
  void Run (int argc, char  **argv){
@@ -81,55 +81,78 @@ public:
      NumTraj = ivec[0];
    }

-    // Create integrator
-    typedef MinimumNorm2<GaugeField>  IntegratorType;// change here to change the algorithm
-    IntegratorParameters MDpar(20);
-    IntegratorType MDynamics(UGrid,MDpar, TheAction);
+    int NumThermalizations = 10;
+    if( GridCmdOptionExists(argv,argv+argc,"--Thermalizations") ){
+      arg= GridCmdOptionPayload(argv,argv+argc,"--Thermalizations");
+      std::vector<int> ivec(0);
+      GridCmdOptionIntVector(arg,ivec);
+      NumThermalizations = ivec[0];
+    }

+
+    GridSerialRNG    sRNG;
+    GridParallelRNG  pRNG(UGrid);
+    LatticeGaugeField  U(UGrid); // change this to an extended field (smearing class)
+
+    std::vector<int> SerSeed({1,2,3,4,5});
+    std::vector<int> ParSeed({6,7,8,9,10});
+
+    
+    // Create integrator, including the smearing policy
+    // Smearing policy
+    std::cout << GridLogDebug << " Creating the Stout class\n";
+    double rho = 0.1; // smearing parameter, now hardcoded
+    int Nsmear = 1;   // number of smearing levels
+    Smear_Stout<Gimpl> Stout(rho);
+    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
+    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
+    std::cout << GridLogDebug << " done\n";
+    //////////////
+    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
+    IntegratorParameters MDpar(20);
+    IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
+
+    
    // Checkpoint strategy
    NerscHmcCheckpointer<Gimpl> Checkpoint(std::string("ckpoint_lat"),std::string("ckpoint_rng"),1);
    PlaquetteLogger<Gimpl>      PlaqLog(std::string("plaq"));

    HMCparameters HMCpar;
-    HMCpar.StartTrajectory = StartTraj;
-    HMCpar.Trajectories    = NumTraj;
+    HMCpar.StartTrajectory   = StartTraj;
+    HMCpar.Trajectories      = NumTraj;
+    HMCpar.NoMetropolisUntil = NumThermalizations;
    
-    GridSerialRNG    sRNG;
-    GridParallelRNG  pRNG(UGrid);
-    LatticeGaugeField  U(UGrid);
-
-    std::vector<int> SerSeed({1,2,3,4,5});
-    std::vector<int> ParSeed({6,7,8,9,10});

    if ( StartType == HotStart ) {
      // Hot start
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::HotConfiguration(pRNG, U);
    } else if ( StartType == ColdStart ) { 
      // Cold start
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::ColdConfiguration(pRNG, U);
    } else if ( StartType == TepidStart ) {       
      // Tepid start
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::TepidConfiguration(pRNG, U);
    } else if ( StartType == CheckpointStart ) { 
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      // CheckpointRestart
      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
    }

-    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U);
+    // Attach the gauge field to the smearing policy, then create and fill the smeared set.
+    // Note that the unit configuration is singular in this procedure.
+    std::cout << GridLogMessage << "Filling the smeared set\n"; 
+    SmearingPolicy.set_GaugeField(U);
+    
+    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U); 
    HMC.AddObservable(&Checkpoint);
    HMC.AddObservable(&PlaqLog);
    
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -44,40 +44,40 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #include <memory>

-namespace Grid{
-  namespace QCD{
+ namespace Grid{
+ 	namespace QCD{

-    struct IntegratorParameters{
+ 		struct IntegratorParameters{

-      int Nexp;
+ 			int Nexp;
      int MDsteps;  // number of outer steps
      RealD trajL;  // trajectory length 
      RealD stepsize;

      IntegratorParameters(int MDsteps_, 
-			   RealD trajL_=1.0,
-			   int Nexp_=12):
-        Nexp(Nexp_),
-	MDsteps(MDsteps_),
-	trajL(trajL_),
-	stepsize(trajL/MDsteps)
-        {
+      	RealD trajL_=1.0,
+      	int Nexp_=12):
+      Nexp(Nexp_),
+      MDsteps(MDsteps_),
+      trajL(trajL_),
+      stepsize(trajL/MDsteps)
+      {
 	  // empty body constructor
-	};
+      };

-    };
+  };

    /*! @brief Class for Molecular Dynamics management */   
-    template<class GaugeField>
-    class Integrator {
+    template<class GaugeField, class SmearingPolicy>
+  class Integrator {

-    protected:
+  protected:

-      typedef IntegratorParameters ParameterType;
+  	typedef IntegratorParameters ParameterType;

-      IntegratorParameters Params;
+  	IntegratorParameters Params;

-      const ActionSet<GaugeField> as;
+  	const ActionSet<GaugeField> as;

      int levels;              //
      double t_U;              // Track time passing on each level and for U and for P
@@ -85,17 +85,19 @@ namespace Grid{

      GaugeField P;

+      SmearingPolicy &Smearer;
+      
      // Should match any legal (SU(n)) gauge field
      // Need to use this template to match Ncol to pass to SU<N> class
      template<int Ncol,class vec> void generate_momenta(Lattice< iVector< iScalar< iMatrix<vec,Ncol> >, Nd> > & P,GridParallelRNG& pRNG){
-	typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
-	GaugeLinkField Pmu(P._grid);
-	Pmu = zero;
-	for(int mu=0;mu<Nd;mu++){
-	  SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
-	  PokeIndex<LorentzIndex>(P, Pmu, mu);
-	}
+      typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
+      GaugeLinkField Pmu(P._grid);
+      Pmu = zero;
+      for(int mu=0;mu<Nd;mu++){
+      	SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
+      	PokeIndex<LorentzIndex>(P, Pmu, mu);
      }
+  }


      //ObserverList observers; // not yet
@@ -103,110 +105,128 @@ namespace Grid{
      //      void register_observers();
      //      void notify_observers();

-      void update_P(GaugeField&U, int level,double ep){
-	t_P[level]+=ep;
-	update_P(P,U,level,ep);
+  void update_P(GaugeField&U, int level, double ep){
+  	t_P[level]+=ep;
+  	update_P(P,U,level,ep);

-	std::cout<<GridLogIntegrator<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
-      }
+  	std::cout<<GridLogIntegrator<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
+  }

-      void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
-	for(int a=0; a<as[level].actions.size(); ++a){
-	  GaugeField force(U._grid);
-	  as[level].actions.at(a)->deriv(U,force);
-	  Mom = Mom - force*ep;
+  void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
+  	// U is only used for its grid here; the field passed to deriv() comes from the Smearer
+  	for(int a=0; a<as[level].actions.size(); ++a){
+  		GaugeField force(U._grid);
+  		GaugeField& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
+  		as[level].actions.at(a)->deriv(Us,force); // deriv should NOT include Ta
+
+	  	std::cout<< GridLogIntegrator << "Smearing (on/off): "<<as[level].actions.at(a)->is_smeared <<std::endl;
+	  	if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
+	  	force = Ta(force);
+	  	std::cout<< GridLogIntegrator << "Force average: "<< norm2(force)/(U._grid->gSites()) <<std::endl;
+	  	Mom -= force*ep;
+	  }
 	}
-      }

-      void update_U(GaugeField&U, double ep){
-	update_U(P,U,ep);
+	void update_U(GaugeField&U, double ep){
+		update_U(P,U,ep);

-	t_U+=ep;
-	int fl = levels-1;
-	std::cout<<GridLogIntegrator<<"   "<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;
+		t_U+=ep;
+		int fl = levels-1;
+		std::cout<< GridLogIntegrator <<"   "<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;

-      }
-      void update_U(GaugeField &Mom, GaugeField&U, double ep){
+	}
+	void update_U(GaugeField &Mom, GaugeField&U, double ep){
 	//rewrite exponential to deal automatically  with the lorentz index?
 	//	GaugeLinkField Umu(U._grid);
 	//	GaugeLinkField Pmu(U._grid);
-	for (int mu = 0; mu < Nd; mu++){
-	  auto Umu=PeekIndex<LorentzIndex>(U, mu);
-	  auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
-	  Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
-	  ProjectOnGroup(Umu);
-	  PokeIndex<LorentzIndex>(U, Umu, mu);
+		for (int mu = 0; mu < Nd; mu++){
+			auto Umu=PeekIndex<LorentzIndex>(U, mu);
+			auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
+			Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
+			ProjectOnGroup(Umu);
+			PokeIndex<LorentzIndex>(U, Umu, mu);
+		}
+	// Update the smeared fields, can be implemented as observer
+		Smearer.set_GaugeField(U);
 	}
-      }
-      
-      virtual void step (GaugeField& U,int level, int first,int last)=0;

-    public:
+	virtual void step (GaugeField& U,int level, int first,int last)=0;

-      Integrator(GridBase* grid, 
-		 IntegratorParameters Par,
-		 ActionSet<GaugeField> & Aset):
-          Params(Par),
-    	  as(Aset),
-	  P(grid),
-	  levels(Aset.size())
-      {
-	t_P.resize(levels,0.0);
-	t_U=0.0;
-      };
-      
-      virtual ~Integrator(){}
+public:
+
+	Integrator(GridBase* grid, 
+		IntegratorParameters Par,
+		ActionSet<GaugeField> & Aset,
+		SmearingPolicy &Sm):
+	Params(Par),
+	as(Aset),
+	P(grid),
+	levels(Aset.size()),
+	Smearer(Sm)
+	{
+		t_P.resize(levels,0.0);
+		t_U=0.0;
+	// initialization of smearer delegated outside of Integrator
+	};
+
+	virtual ~Integrator(){}

      //Initialization of momenta and actions
-      void refresh(GaugeField& U,GridParallelRNG &pRNG){
-	std::cout<<GridLogIntegrator<< "Integrator refresh\n";
-	generate_momenta(P,pRNG);
-	for(int level=0; level< as.size(); ++level){
-	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    as[level].actions.at(actionID)->refresh(U, pRNG);
-	  }
+	void refresh(GaugeField& U,GridParallelRNG &pRNG){
+		std::cout<<GridLogIntegrator<< "Integrator refresh\n";
+		generate_momenta(P,pRNG);
+		for(int level=0; level< as.size(); ++level){
+			for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+	    // get gauge field from the SmearingPolicy and
+	    // based on the boolean is_smeared in actionID
+				GaugeField& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+				as[level].actions.at(actionID)->refresh(Us, pRNG);
+			}
+		}
 	}
-      }

      // Calculate action
-      RealD S(GaugeField& U){
+	RealD S(GaugeField& U){// here also U not used

-	LatticeComplex Hloc(U._grid);	Hloc = zero;
+		LatticeComplex Hloc(U._grid);	Hloc = zero;
 	// Momenta
-	for (int mu=0; mu <Nd; mu++){
-	  auto Pmu = PeekIndex<LorentzIndex>(P, mu);
-	  Hloc -= trace(Pmu*Pmu);
-	}
-	Complex Hsum = sum(Hloc);
-	
-	RealD H = Hsum.real();
-	RealD Hterm;
-	std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";
+		for (int mu=0; mu <Nd; mu++){
+			auto Pmu = PeekIndex<LorentzIndex>(P, mu);
+			Hloc -= trace(Pmu*Pmu);
+		}
+		Complex Hsum = sum(Hloc);
+
+		RealD H = Hsum.real();
+		RealD Hterm;
+		std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";

 	// Actions
-	for(int level=0; level<as.size(); ++level){
-	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    Hterm = as[level].actions.at(actionID)->S(U);
-	    std::cout<<GridLogMessage << "Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
-	    H += Hterm;
-	  }
-	}
-	
-	return H;
-      }
+		for(int level=0; level<as.size(); ++level){
+			for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+	    // get gauge field from the SmearingPolicy and
+	    // based on the boolean is_smeared in actionID
+				GaugeField& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+				Hterm = as[level].actions.at(actionID)->S(Us);
+				std::cout<<GridLogMessage << "S Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
+				H += Hterm;
+			}
+		}

-      void integrate(GaugeField& U){
+		return H;
+	}
+
+	void integrate(GaugeField& U){

 	// reset the clocks
-	t_U=0;
-	for(int level=0; level<as.size(); ++level){
-	  t_P[level]=0;
-	}	
+		t_U=0;
+		for(int level=0; level<as.size(); ++level){
+			t_P[level]=0;
+		}	

 	for(int step=0; step< Params.MDsteps; ++step){   // MD step
-	  int first_step = (step==0);
-	  int  last_step = (step==Params.MDsteps-1);
-	  this->step(U,0,first_step,last_step);
+		int first_step = (step==0);
+		int  last_step = (step==Params.MDsteps-1);
+		this->step(U,0,first_step,last_step);
 	}

 	// Check the clocks all match on all levels
@@ -219,9 +239,9 @@ namespace Grid{
 	assert(fabs(t_U-Params.trajL) < 1.0e-6);


-      }
-    };
-    
-  }
+}
+};
+
+}
 }
 #endif//INTEGRATOR_INCLUDED
--- a/lib/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h
@@ -91,14 +91,17 @@ namespace Grid{
    *  P 1/2                            P 1/2
    */    

-    template<class GaugeField> class LeapFrog : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class LeapFrog :
+      public Integrator<GaugeField, SmearingPolicy> {
    public:

-      typedef LeapFrog<GaugeField> Algorithm;
+      typedef LeapFrog<GaugeField, SmearingPolicy> Algorithm;

      LeapFrog(GridBase* grid, 
 	       IntegratorParameters Par,
-	       ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+	       ActionSet<GaugeField> & Aset,
+	       SmearingPolicy & Sm):
+	Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset,Sm) {};


      void step (GaugeField& U, int level,int _first, int _last){
@@ -135,7 +138,8 @@ namespace Grid{
      }
    };

-    template<class GaugeField> class MinimumNorm2 : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class MinimumNorm2 :
+      public Integrator<GaugeField, SmearingPolicy> {
    private:
      const RealD lambda = 0.1931833275037836;

@@ -143,7 +147,9 @@ namespace Grid{

      MinimumNorm2(GridBase* grid, 
 		   IntegratorParameters Par,
-		   ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+		   ActionSet<GaugeField> & Aset,
+		   SmearingPolicy& Sm):
+	Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset,Sm) {};

      void step (GaugeField& U, int level, int _first,int _last){

@@ -191,7 +197,8 @@ namespace Grid{
    };


-    template<class GaugeField> class ForceGradient : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class ForceGradient :
+      public Integrator<GaugeField, SmearingPolicy> {
    private:
      const RealD lambda = 1.0/6.0;;
      const RealD chi    = 1.0/72.0;
@@ -202,7 +209,9 @@ namespace Grid{
      // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
    ForceGradient(GridBase* grid, 
 		  IntegratorParameters Par,
-		  ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+		  ActionSet<GaugeField> & Aset,
+		  SmearingPolicy &Sm):
+      Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset, Sm) {};


      void FG_update_P(GaugeField&U, int level,double fg_dt,double ep){
--- a/lib/qcd/smearing/APEsmearing.h
+++ b/lib/qcd/smearing/APEsmearing.h
@@ -0,0 +1,130 @@
+/*!
+  @brief Declaration of Smear_APE class for APE smearing
+*/
+
+#ifndef APE_SMEAR_
+#define APE_SMEAR_
+
+  namespace Grid {
+  	namespace QCD {
+
+
+    /*!  @brief APE type smearing of link variables. */
+    template <class Gimpl> 
+  		class Smear_APE: public Smear<Gimpl>{
+  		private:
+      	const std::vector<double> rho;/*!< Array of weights */
+
+//This member must be private - we do not want to control from outside 
+  			std::vector<double> set_rho(const double common_rho) const {
+  				// Build the Nd*Nd weight table with every entry equal to common_rho ...
+  				std::vector<double> weights(Nd*Nd, common_rho);
+  				// ... then clear the diagonal entries (index mu + mu*Nd)
+  				for(int dir=0; dir<Nd; ++dir) weights[dir + dir*Nd] = 0.0;
+  				return weights;
+  			}
+
+  			public:
+      // Defines the gauge field types
+  				INHERIT_GIMPL_TYPES(Gimpl)
+
+
+      // Constructors and destructors
+  				Smear_APE(const std::vector<double>& rho_):rho(rho_){} // check vector size
+  				Smear_APE(double rho_val):rho(set_rho(rho_val)){}
+  				Smear_APE():rho(set_rho(1.0)){}
+  				~Smear_APE(){}
+
+      ///////////////////////////////////////////////////////////////////////////////
+  				void smear(GaugeField& u_smr, const GaugeField& U)const{
+  					GridBase *grid = U._grid;
+  					GaugeLinkField Cup(grid), tmp_stpl(grid);
+  					WilsonLoops<Gimpl> WL;
+  					u_smr = zero; 
+
+  					for(int mu=0; mu<Nd; ++mu){
+  						Cup = zero;
+  						for(int nu=0; nu<Nd; ++nu){
+  							if (nu != mu) {
+  								// get the staple in direction mu, nu
+	      						WL.Staple(tmp_stpl, U, mu, nu);  //nb staple conventions of IroIro and Grid differ by a dagger
+	      						Cup += tmp_stpl*rho[mu + Nd * nu];
+	      					}
+	      				}
+	  					// save the Cup link-field on the u_smr gauge-field
+	  					pokeLorentz(u_smr, adj(Cup), mu); // u_smr[mu] = Cup^dag   see conventions for Staple
+	  				}
+	  			}
+
+////////////////////////////////////////////////////////////////////////////////
+	  			void derivative(GaugeField& SigmaTerm,
+	  				const GaugeField& iLambda,
+	  				const GaugeField& U)const{
+
+	// Reference 
+	// Morningstar, Peardon, Phys.Rev.D69,054501(2004)
+	// Equation 75
+    // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
+    // Output SigmaTerm
+
+	  				GridBase *grid = U._grid;
+
+	  				WilsonLoops<Gimpl> WL;
+	  				GaugeLinkField staple(grid), u_tmp(grid);
+	  				GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
+	  				GaugeLinkField U_mu(grid), U_nu(grid);
+	  				GaugeLinkField sh_field(grid), temp_Sigma(grid);
+	  				Real rho_munu, rho_numu;
+
+	  				for(int mu = 0; mu < Nd; ++mu){
+	  					U_mu       = peekLorentz(      U, mu);
+	  					iLambda_mu = peekLorentz(iLambda, mu);
+
+	  					for(int nu = 0; nu < Nd; ++nu){
+	  						if(nu==mu) continue;
+	  						U_nu       = peekLorentz(      U, nu);
+	  						iLambda_nu = peekLorentz(iLambda, nu);
+
+	  						rho_munu = rho[mu + Nd * nu];
+	  						rho_numu = rho[nu + Nd * mu];
+
+	  						WL.StapleUpper(staple, U, mu, nu);
+
+	  						temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok
+	        				//-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
+	  						Gimpl::AddGaugeLink(SigmaTerm, temp_Sigma, mu);
+
+	    					sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity?
+
+	    					temp_Sigma = rho_numu*sh_field*staple; //ok
+	    					//r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
+	    					Gimpl::AddGaugeLink(SigmaTerm, temp_Sigma, mu);
+
+	    					sh_field = Cshift(iLambda_mu, nu, 1);
+
+	    					temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok
+	    					//-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
+	    					Gimpl::AddGaugeLink(SigmaTerm, temp_Sigma, mu);
+
+	    					staple = zero;
+	    					sh_field = Cshift(U_nu, mu, 1);
+
+	    					temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
+	    					temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
+
+	    					u_tmp = adj(U_nu)*iLambda_nu;
+	    					sh_field = Cshift(u_tmp, mu, 1);
+	    					temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
+	    					sh_field = Cshift(temp_Sigma, nu, -1);
+	    					Gimpl::AddGaugeLink(SigmaTerm, sh_field, mu);
+
+	    				}
+	    			}
+	    		}
+	    	};
+
+
+
+  }// namespace QCD
+}//namespace Grid
+#endif  
--- a/lib/qcd/smearing/BaseSmearing.h
+++ b/lib/qcd/smearing/BaseSmearing.h
@@ -0,0 +1,17 @@
+/*!
+  @brief Declares base smearing class Smear
+ */
+#ifndef BASE_SMEAR_
+#define BASE_SMEAR_
+
+template <class Gimpl> 
+class Smear{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl) // inherits the types for the gauge fields
+
+  virtual ~Smear(){}
+  virtual void smear     (GaugeField&,const GaugeField&)const = 0;
+  virtual void derivative(GaugeField&,
+			  const GaugeField&,const GaugeField&) const = 0;
+};
+#endif
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/lib/qcd/smearing/GaugeConfiguration.h
@@ -0,0 +1,262 @@
+/*!
+  @file GaugeConfiguration.h
+
+  @brief Declares the GaugeConfiguration class
+*/
+#ifndef GAUGE_CONFIG_
+#define GAUGE_CONFIG_
+
+namespace Grid {
+
+namespace QCD {
+
+/*!
+  @brief Smeared configuration container
+
+  It will behave like a configuration from the point of view of
+  the HMC update and integrators.
+  An "advanced configuration" object that can provide not only the
+  data to store the gauge configuration but also operations to manipulate
+  it, like smearing.
+
+  It stores a list of smeared configurations.
+*/
+template <class Gimpl>
+class SmearedConfiguration {
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+ private:
+  const unsigned int smearingLevels;
+  Smear_Stout<Gimpl> StoutSmearing;
+  std::vector<GaugeField> SmearedSet;
+
+  // Member functions
+  //====================================================================
+  void fill_smearedSet(GaugeField& U) {
+    ThinLinks = &U;  // attach the smearing routine to the field U
+
+    // check the pointer is not null
+    if (ThinLinks == NULL)
+      std::cout << GridLogError
+                << "[SmearedConfiguration] Error in ThinLinks pointer\n";
+
+    if (smearingLevels > 0) {
+      std::cout << GridLogDebug
+                << "[SmearedConfiguration] Filling SmearedSet\n";
+      GaugeField previous_u(ThinLinks->_grid);
+
+      previous_u = *ThinLinks;
+      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
+        StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
+        previous_u = SmearedSet[smearLvl];
+
+        // For debug purposes
+        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
+        std::cout << GridLogDebug
+                  << "[SmearedConfiguration] Plaq: " << impl_plaq << std::endl;
+      }
+    }
+  }
+  //====================================================================
+  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
+                                  const GaugeField& GaugeK) const {
+    GridBase* grid = GaugeK._grid;
+    GaugeField C(grid), SigmaK(grid), iLambda(grid);
+    GaugeLinkField iLambda_mu(grid);
+    GaugeLinkField iQ(grid), e_iQ(grid);
+    GaugeLinkField SigmaKPrime_mu(grid);
+    GaugeLinkField GaugeKmu(grid), Cmu(grid);
+
+    StoutSmearing.BaseSmear(C, GaugeK);
+    SigmaK = zero;
+    iLambda = zero;
+
+    for (int mu = 0; mu < Nd; mu++) {
+      Cmu = peekLorentz(C, mu);
+      GaugeKmu = peekLorentz(GaugeK, mu);
+      SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
+      iQ = Ta(Cmu * adj(GaugeKmu));
+      set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
+      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
+      pokeLorentz(iLambda, iLambda_mu, mu);
+    }
+    StoutSmearing.derivative(SigmaK, iLambda,
+                             GaugeK);  // derivative of SmearBase
+    return SigmaK;
+  }
+
+  /*! @brief Returns smeared configuration at level 'Level' */
+  const GaugeField& get_smeared_conf(int Level) const {
+    return SmearedSet[Level];
+  }
+
+  //====================================================================
+  void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
+                   const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
+                   const GaugeLinkField& GaugeK) const {
+    GridBase* grid = iQ._grid;
+    GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
+    GaugeLinkField unity(grid);
+    unity = 1.0;
+
+    LatticeComplex u(grid), w(grid);
+    LatticeComplex f0(grid), f1(grid), f2(grid);
+    LatticeComplex xi0(grid), xi1(grid), tmp(grid);
+    LatticeComplex u2(grid), w2(grid), cosw(grid);
+    LatticeComplex emiu(grid), e2iu(grid), qt(grid), fden(grid);
+    LatticeComplex r01(grid), r11(grid), r21(grid), r02(grid), r12(grid);
+    LatticeComplex r22(grid), tr1(grid), tr2(grid);
+    LatticeComplex b10(grid), b11(grid), b12(grid), b20(grid), b21(grid),
+        b22(grid);
+    LatticeComplex LatticeUnitComplex(grid);
+
+    LatticeUnitComplex = 1.0;
+
+    // Exponential
+    iQ2 = iQ * iQ;
+    iQ3 = iQ * iQ2;
+    StoutSmearing.set_uw(u, w, iQ2, iQ3);
+    StoutSmearing.set_fj(f0, f1, f2, u, w);
+    e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
+
+    // Getting B1, B2, Gamma and Lambda
+    // TODO: simplify this part -- redundant calculations, several terms are already computed in set_fj
+    xi0 = StoutSmearing.func_xi0(w);
+    xi1 = StoutSmearing.func_xi1(w);
+    u2 = u * u;
+    w2 = w * w;
+    cosw = cos(w);
+
+    emiu = cos(u) - timesI(sin(u));
+    e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
+
+    r01 = (2.0 * u + timesI(2.0 * (u2 - w2))) * e2iu +
+          emiu * ((16.0 * u * cosw + 2.0 * u * (3.0 * u2 + w2) * xi0) +
+                  timesI(-8.0 * u2 * cosw + 2.0 * (9.0 * u2 + w2) * xi0));
+
+    r11 = (2.0 * LatticeUnitComplex + timesI(4.0 * u)) * e2iu +
+          emiu * ((-2.0 * cosw + (3.0 * u2 - w2) * xi0) +
+                  timesI((2.0 * u * cosw + 6.0 * u * xi0)));
+
+    r21 =
+        2.0 * timesI(e2iu) + emiu * (-3.0 * u * xi0 + timesI(cosw - 3.0 * xi0));
+
+    r02 = -2.0 * e2iu +
+          emiu * (-8.0 * u2 * xi0 +
+                  timesI(2.0 * u * (cosw + xi0 + 3.0 * u2 * xi1)));
+
+    r12 = emiu * (2.0 * u * xi0 + timesI(-cosw - xi0 + 3.0 * u2 * xi1));
+
+    r22 = emiu * (xi0 - timesI(3.0 * u * xi1));
+
+    fden = LatticeUnitComplex / (2.0 * (9.0 * u2 - w2) * (9.0 * u2 - w2));
+
+    b10 = 2.0 * u * r01 + (3.0 * u2 - w2) * r02 - (30.0 * u2 + 2.0 * w2) * f0;
+    b11 = 2.0 * u * r11 + (3.0 * u2 - w2) * r12 - (30.0 * u2 + 2.0 * w2) * f1;
+    b12 = 2.0 * u * r21 + (3.0 * u2 - w2) * r22 - (30.0 * u2 + 2.0 * w2) * f2;
+
+    b20 = r01 - (3.0 * u) * r02 - (24.0 * u) * f0;
+    b21 = r11 - (3.0 * u) * r12 - (24.0 * u) * f1;
+    b22 = r21 - (3.0 * u) * r22 - (24.0 * u) * f2;
+
+    b10 *= fden;
+    b11 *= fden;
+    b12 *= fden;
+    b20 *= fden;
+    b21 *= fden;
+    b22 *= fden;
+
+    B1 = b10 * unity + timesMinusI(b11) * iQ - b12 * iQ2;
+    B2 = b20 * unity + timesMinusI(b21) * iQ - b22 * iQ2;
+    USigmap = GaugeK * Sigmap;
+
+    tr1 = trace(USigmap * B1);
+    tr2 = trace(USigmap * B2);
+
+    GaugeLinkField QUS = iQ * USigmap;
+    GaugeLinkField USQ = USigmap * iQ;
+
+    GaugeLinkField iGamma = tr1 * iQ - timesI(tr2) * iQ2 +
+                            timesI(f1) * USigmap + f2 * QUS + f2 * USQ;
+
+    iLambda = Ta(iGamma);
+  }
+
+  //====================================================================
+ public:
+  GaugeField*
+      ThinLinks; /*!< @brief Pointer to the thin
+                                                         links configuration */
+
+  /*! @brief Standard constructor: creates Nsmear smeared fields on UGrid; thin links are attached later by set_GaugeField */
+  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
+                       Smear_Stout<Gimpl>& Stout)
+      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
+    for (unsigned int i = 0; i < smearingLevels; ++i)
+      SmearedSet.push_back(GaugeField(UGrid));  // temporary, not *(new ...): the heap copy was never freed
+  }
+
+  /*! For just thin links */
+  SmearedConfiguration()
+      : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
+
+  // attach the smeared routines to the thin links U and fill the smeared set
+  void set_GaugeField(GaugeField& U) { fill_smearedSet(U); }
+
+  //====================================================================
+  void smeared_force(GaugeField& SigmaTilde) const {
+    if (smearingLevels > 0) {
+      GaugeField force = SigmaTilde; // actually = U*SigmaTilde
+      GaugeLinkField tmp_mu(SigmaTilde._grid);
+
+      for (int mu = 0; mu < Nd; mu++) {
+        // to get just SigmaTilde
+        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
+                 peekLorentz(force, mu);
+        pokeLorentz(force, tmp_mu, mu);
+      }
+
+      for (int ismr = smearingLevels - 1; ismr > 0; --ismr)
+        force = AnalyticSmearedForce(force, get_smeared_conf(ismr - 1));
+
+      force = AnalyticSmearedForce(force, *ThinLinks);
+
+      for (int mu = 0; mu < Nd; mu++) {
+        tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
+        pokeLorentz(SigmaTilde, tmp_mu, mu);
+      }
+    }  // if smearingLevels = 0 do nothing
+  }
+  //====================================================================
+
+  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
+
+  GaugeField& get_U(bool smeared = false) {
+    // get the config, thin links by default
+    if (smeared) {
+      if (smearingLevels) {
+        RealD impl_plaq =
+            WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
+        std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
+                  << std::endl;
+        return get_SmearedU();
+
+      } else {
+        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
+        std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
+                  << std::endl;
+        return *ThinLinks;
+      }
+    } else {
+      RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
+      std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
+                << std::endl;
+      return *ThinLinks;
+    }
+  }
+};
+}
+}
+
+#endif
--- a/lib/qcd/smearing/Smearing.h
+++ b/lib/qcd/smearing/Smearing.h
@@ -0,0 +1,9 @@
+#ifndef GRID_QCD_SMEARING_H
+#define GRID_QCD_SMEARING_H
+
+#include <Grid/qcd/smearing/BaseSmearing.h>
+#include <Grid/qcd/smearing/APEsmearing.h>
+#include <Grid/qcd/smearing/StoutSmearing.h>
+#include <Grid/qcd/smearing/GaugeConfiguration.h>
+
+#endif
--- a/lib/qcd/smearing/StoutSmearing.h
+++ b/lib/qcd/smearing/StoutSmearing.h
@@ -0,0 +1,160 @@
+/*!
+  @file StoutSmearing.h
+  @brief Declares Stout smearing class
+*/
+#ifndef STOUT_SMEAR_
+#define STOUT_SMEAR_
+
+namespace Grid {
+namespace QCD {
+
+/*!  @brief Stout smearing of link variable. */
+template <class Gimpl>
+class Smear_Stout : public Smear<Gimpl> {
+ private:
+  const Smear<Gimpl>* SmearBase;
+
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl)
+
+  Smear_Stout(Smear<Gimpl>* base) : SmearBase(base) {
+    static_assert(Nc == 3,
+                  "Stout smearing currently implemented only for Nc==3");
+  }
+
+  /*! Default constructor */
+  Smear_Stout(double rho = 1.0) : SmearBase(new Smear_APE<Gimpl>(rho)) {
+    static_assert(Nc == 3,
+                  "Stout smearing currently implemented only for Nc==3");
+  }
+
+  ~Smear_Stout() {}  // delete SmearBase...
+
+  void smear(GaugeField& u_smr, const GaugeField& U) const {
+    GaugeField C(U._grid);
+    GaugeLinkField tmp(U._grid), iq_mu(U._grid), Umu(U._grid);
+
+    std::cout << GridLogDebug << "Stout smearing started\n";
+
+    // Smear the configurations
+    SmearBase->smear(C, U);
+
+    for (int mu = 0; mu < Nd; mu++) {
+      tmp = peekLorentz(C, mu);
+      Umu = peekLorentz(U, mu);
+      iq_mu = Ta(
+          tmp *
+          adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
+      exponentiate_iQ(tmp, iq_mu);
+      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
+    }
+    std::cout << GridLogDebug << "Stout smearing completed\n";
+  };
+
+  void derivative(GaugeField& SigmaTerm, const GaugeField& iLambda,
+                  const GaugeField& Gauge) const {
+    SmearBase->derivative(SigmaTerm, iLambda, Gauge);
+  };
+
+  void BaseSmear(GaugeField& C, const GaugeField& U) const {
+    SmearBase->smear(C, U);
+  };
+
+  void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const {
+    // Put this outside
+    // only valid for SU(3) matrices
+
+    // only one Lorentz direction at a time
+
+    // notice that it actually computes
+    // exp ( input matrix )
+    // the i sign is coming from outside
+    // input matrix is anti-hermitian NOT hermitian
+
+    GridBase* grid = iQ._grid;
+    GaugeLinkField unity(grid);
+    unity = 1.0;
+
+    GaugeLinkField iQ2(grid), iQ3(grid);
+    LatticeComplex u(grid), w(grid);
+    LatticeComplex f0(grid), f1(grid), f2(grid);
+
+    iQ2 = iQ * iQ;
+    iQ3 = iQ * iQ2;
+
+    set_uw(u, w, iQ2, iQ3);
+    set_fj(f0, f1, f2, u, w);
+
+    e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
+  };
+
+  void set_uw(LatticeComplex& u, LatticeComplex& w, GaugeLinkField& iQ2,
+              GaugeLinkField& iQ3) const {
+    Complex one_over_three = 1.0 / 3.0;
+    Complex one_over_two = 1.0 / 2.0;
+
+    GridBase* grid = u._grid;
+    LatticeComplex c0(grid), c1(grid), tmp(grid), c0max(grid), theta(grid);
+
+    // sign in c0 from the conventions on the Ta
+    c0 = -imag(trace(iQ3)) * one_over_three;  
+    c1 = -real(trace(iQ2)) * one_over_two;
+
+    // Cayley Hamilton checks to machine precision, tested
+    tmp = c1 * one_over_three;
+    c0max = 2.0 * pow(tmp, 1.5);
+
+    theta = acos(c0 / c0max) *
+            one_over_three;  // divide by three here, now leave as it is
+    u = sqrt(tmp) * cos(theta);
+    w = sqrt(c1) * sin(theta);
+  }
+
+  void set_fj(LatticeComplex& f0, LatticeComplex& f1, LatticeComplex& f2,
+              const LatticeComplex& u, const LatticeComplex& w) const {
+    GridBase* grid = u._grid;
+    LatticeComplex xi0(grid), u2(grid), w2(grid), cosw(grid);
+    LatticeComplex fden(grid);
+    LatticeComplex h0(grid), h1(grid), h2(grid);
+    LatticeComplex e2iu(grid), emiu(grid), ixi0(grid), qt(grid);
+    LatticeComplex unity(grid);
+    unity = 1.0;
+
+    xi0 = func_xi0(w);
+    u2 = u * u;
+    w2 = w * w;
+    cosw = cos(w);
+
+    ixi0 = timesI(xi0);
+    emiu = cos(u) - timesI(sin(u));
+    e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
+
+    h0 = e2iu * (u2 - w2) +
+         emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
+    h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
+    h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
+
+    fden = unity / (9.0 * u2 - w2);  // reals
+    f0 = h0 * fden;
+    f1 = h1 * fden;
+    f2 = h2 * fden;
+  }
+
+  LatticeComplex func_xi0(const LatticeComplex& w) const {
+    // Define a function to do the check
+    // if( w < 1e-4 ) std::cout << GridLogWarning<< "[Smear_stout] w too small:
+    // "<< w <<"\n";
+    return sin(w) / w;
+  }
+
+  LatticeComplex func_xi1(const LatticeComplex& w) const {
+    // Define a function to do the check
+    // if( w < 1e-4 ) std::cout << GridLogWarning << "[Smear_stout] w too small:
+    // "<< w <<"\n";
+    return cos(w) / (w * w) - sin(w) / (w * w * w);
+  }
+};
+}
+}
+
+#endif
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -43,7 +43,7 @@ public:

  template<typename vtype> using iSUnMatrix              = iScalar<iScalar<iMatrix<vtype, ncolour> > > ;
  template<typename vtype> using iSU2Matrix              = iScalar<iScalar<iMatrix<vtype, 2> > > ;
-
+  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix, SU<2>::LatticeMatrix etc...
  //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -552,15 +552,24 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
  }

  // reunitarise??
-  static void LieRandomize(GridParallelRNG     &pRNG,LatticeMatrix &out,double scale=1.0){
+  template<typename LatticeMatrixType>
+  static void LieRandomize(GridParallelRNG     &pRNG,LatticeMatrixType &out,double scale=1.0){
    GridBase *grid = out._grid;
-
-    LatticeComplex ca (grid);
-    LatticeMatrix  lie(grid);
-    LatticeMatrix  la (grid);
-    Complex ci(0.0,scale);
-    Complex cone(1.0,0.0);
-    Matrix ta;
+    
+    typedef typename LatticeMatrixType::vector_type vector_type;
+    typedef typename LatticeMatrixType::scalar_type scalar_type;
+    
+    typedef iSinglet<vector_type> vTComplexType;
+    
+    typedef Lattice<vTComplexType> LatticeComplexType;
+    typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
+    
+    LatticeComplexType ca (grid);
+    LatticeMatrixType  lie(grid);
+    LatticeMatrixType  la (grid);
+    ComplexD ci(0.0,scale);
+    ComplexD cone(1.0,0.0);
+    MatrixType ta;

    lie=zero;
    for(int a=0;a<generators();a++){
@@ -596,9 +605,13 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g

  }

-
-  static void HotConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
-    LatticeMatrix Umu(out._grid);
+  template<typename GaugeField>
+  static void HotConfiguration(GridParallelRNG &pRNG,GaugeField &out){
+    typedef typename GaugeField::vector_type vector_type;
+    typedef iSUnMatrix<vector_type> vMatrixType;
+    typedef Lattice<vMatrixType> LatticeMatrixType;
+    
+    LatticeMatrixType Umu(out._grid);
    for(int mu=0;mu<Nd;mu++){
      LieRandomize(pRNG,Umu,1.0);
      PokeIndex<LorentzIndex>(out,Umu,mu);
@@ -622,13 +635,15 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
  static void taProj( const LatticeMatrix &in,  LatticeMatrix &out){
    out = Ta(in);
  }
-  static void taExp( const LatticeMatrix &x,  LatticeMatrix &ex){ 
-
-    LatticeMatrix xn(x._grid);
+  template<typename LatticeMatrixType>
+  static void taExp( const LatticeMatrixType &x,  LatticeMatrixType &ex){ 
+    typedef typename LatticeMatrixType::scalar_type ComplexType;    
+    
+    LatticeMatrixType xn(x._grid);
    RealD nfac = 1.0;

    xn = x;
-    ex =xn+Complex(1.0); // 1+x
+    ex =xn+ComplexType(1.0); // 1+x

    // Do a 12th order exponentiation
    for(int i=2; i <= 12; ++i)
--- a/lib/qcd/utils/SpaceTimeGrid.cc
+++ b/lib/qcd/utils/SpaceTimeGrid.cc
@@ -84,7 +84,7 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC

 GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
 {
-  int N4=FourDimGrid->_ndimension;
+  int N4    = FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();

  std::vector<int> latt5(1,Ls);
@@ -103,11 +103,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
 {
  int N4=FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();
-  int cbd=0;
+  int cbd=1;
  std::vector<int> latt5(1,Ls);
  std::vector<int> simd5(1,nsimd);
  std::vector<int>  mpi5(1,1);
-  std::vector<int>   cb5(1,1);
+  std::vector<int>   cb5(1,0);
    
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -1,6 +1,6 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/utils/WilsonLoops.h

@@ -25,391 +25,501 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+    See the full license in the file "LICENSE" in the top level
+    distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef QCD_UTILS_WILSON_LOOPS_H
 #define QCD_UTILS_WILSON_LOOPS_H
 namespace Grid {
 namespace QCD {

 // Common wilson loop observables
-template<class Gimpl>
-class WilsonLoops : public Gimpl {
+template <class Gimpl> class WilsonLoops : public Gimpl {
 public:
-  
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
-  typedef typename Gimpl::GaugeField     GaugeLorentz;
+  typedef typename Gimpl::GaugeField GaugeLorentz;

  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
-  static void dirPlaquette(GaugeMat &plaq,const std::vector<GaugeMat> &U, const int mu, const int nu)
-  {
-    // Annoyingly, must use either scope resolution to find dependent base class, 
-    // or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
-    // resolution throughout the usage in this file, and rather defeats the purpose of deriving
+  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
+                           const int mu, const int nu) {
+    // Annoyingly, a member of a dependent base class must be found either
+    // by explicit scope resolution or through this-> ; there is no "this"
+    // in a static method.
+    //
+    // This forces explicit Gimpl:: scope resolution throughout the usage in
+    // this file, and rather defeats the purpose of deriving
    // from Gimpl.
-    plaq= Gimpl::CovShiftBackward(U[mu],mu,
-	  Gimpl::CovShiftBackward(U[nu],nu,
-          Gimpl::CovShiftForward (U[mu],mu,U[nu])));
+    plaq = Gimpl::CovShiftBackward(
+        U[mu], mu, Gimpl::CovShiftBackward(
+                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
  }
  //////////////////////////////////////////////////
  // trace of directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
-  static void traceDirPlaquette(LatticeComplex &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu)
-  {
+  static void traceDirPlaquette(LatticeComplex &plaq,
+                                const std::vector<GaugeMat> &U, const int mu,
+                                const int nu) {
    GaugeMat sp(U[0]._grid);
-    dirPlaquette(sp,U,mu,nu);
-    plaq=trace(sp);
+    dirPlaquette(sp, U, mu, nu);
+    plaq = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of plaquette
  //////////////////////////////////////////////////
-  static void sitePlaquette(LatticeComplex &Plaq,const std::vector<GaugeMat> &U)
-  {
+  static void sitePlaquette(LatticeComplex &Plaq,
+                            const std::vector<GaugeMat> &U) {
    LatticeComplex sitePlaq(U[0]._grid);
-    Plaq=zero;
-    for(int mu=1;mu<Nd;mu++){
-      for(int nu=0;nu<mu;nu++){
-	traceDirPlaquette(sitePlaq,U,mu,nu);
-	Plaq = Plaq + sitePlaq;
+    Plaq = zero;
+    for (int mu = 1; mu < Nd; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceDirPlaquette(sitePlaq, U, mu, nu);
+        Plaq = Plaq + sitePlaq;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
-  static RealD sumPlaquette(const GaugeLorentz &Umu){
-    std::vector<GaugeMat> U(Nd,Umu._grid);
+  static RealD sumPlaquette(const GaugeLorentz &Umu) {
+    std::vector<GaugeMat> U(Nd, Umu._grid); // Nd entries: filled for mu < Nd

-    for(int mu=0;mu<Nd;mu++){
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

    LatticeComplex Plaq(Umu._grid);
-    
-    sitePlaquette(Plaq,U);
-    
+
+    sitePlaquette(Plaq, U);
+
    TComplex Tp = sum(Plaq);
-    Complex p  = TensorRemove(Tp);
+    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
-  static RealD avgPlaquette(const GaugeLorentz &Umu){
-
+  static RealD avgPlaquette(const GaugeLorentz &Umu) {
    RealD sumplaq = sumPlaquette(Umu);
-    
    double vol = Umu._grid->gSites();
-    
-    double faces = (1.0*Nd*(Nd-1))/2.0;
-    
-    return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
+    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
+    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
-  static RealD linkTrace(const GaugeLorentz &Umu){
-    std::vector<GaugeMat> U(Nd,Umu._grid);

-    LatticeComplex Tr(Umu._grid); Tr=zero;
-    for(int mu=0;mu<Nd;mu++){
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-      Tr = Tr+trace(U[mu]);
+  //////////////////////////////////////////////////
+  // average over traced single links
+  //////////////////////////////////////////////////
+  static RealD linkTrace(const GaugeLorentz &Umu) {
+    // Sized by Nd (not a literal 4) to match the mu < Nd loop below.
+    std::vector<GaugeMat> U(Nd, Umu._grid);
+    LatticeComplex Tr(Umu._grid);
+    Tr = zero;
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+      Tr = Tr + trace(U[mu]);
    }
-    
+
    TComplex Tp = sum(Tr);
-    Complex p  = TensorRemove(Tp);
+    Complex p = TensorRemove(Tp);

    double vol = Umu._grid->gSites();

-    return p.real()/vol/((double)(Nd*(Nd-1)));
+    return p.real() / vol / Nd / Nc; // Nd links per site, Nc as in avgPlaquette
  };
+
  //////////////////////////////////////////////////
-  // the sum over all staples on each site
+  // the sum over all staples on each site in direction mu,nu
  //////////////////////////////////////////////////
-  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
+  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
+                     int nu) {

    GridBase *grid = Umu._grid;

-    std::vector<GaugeMat> U(Nd,grid);
-    for(int d=0;d<Nd;d++){
-      U[d] = PeekIndex<LorentzIndex>(Umu,d);
+    std::vector<GaugeMat> U(Nd, grid); // Nd entries: filled for d < Nd below
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    staple = zero;
-    GaugeMat tmp(grid);

-    
-    for(int nu=0;nu<Nd;nu++){
-
-      if(nu != mu) {
+    if (nu != mu) {

      // mu
      // ^
      // |__>  nu

-      //    __ 
+      //    __
      //      |
      //    __|
      //

-	staple+=Gimpl::ShiftStaple(
-	        Gimpl::CovShiftForward (U[nu],nu, 
-		Gimpl::CovShiftBackward(U[mu],mu,
-		Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
+      staple += Gimpl::ShiftStaple(
+          Gimpl::CovShiftForward(
+              U[nu], nu,
+              Gimpl::CovShiftBackward(
+                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+          mu);

-      //  __ 
-      // |   
-      // |__ 
+      //  __
+      // |
+      // |__
      //
      //
-	staple+=Gimpl::ShiftStaple(  
-                Gimpl::CovShiftBackward(U[nu],nu,		  		  
-		Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
+      staple += Gimpl::ShiftStaple(
+          Gimpl::CovShiftBackward(U[nu], nu,
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+          mu);
+    }
+  }
+
+  //////////////////////////////////////////////////
+  // the sum over all staples on each site: for the link in direction mu,
+  // both staples drawn below are accumulated for every nu != mu
+  //////////////////////////////////////////////////
+  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
+    GridBase *grid = Umu._grid;
+
+    // Split the Lorentz-indexed field into one link field per direction.
+    std::vector<GaugeMat> U(Nd, grid);
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+    }
+    staple = zero;
+
+    for (int nu = 0; nu < Nd; nu++) {
+
+      if (nu != mu) {
+
+        // mu
+        // ^
+        // |__>  nu
+
+        //    __
+        //      |
+        //    __|
+        //
+
+        staple += Gimpl::ShiftStaple(
+            Gimpl::CovShiftForward(
+                U[nu], nu,
+                Gimpl::CovShiftBackward(
+                    U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+            mu);
+
+        //  __
+        // |
+        // |__
+        //
+        //
+        staple += Gimpl::ShiftStaple(
+            Gimpl::CovShiftBackward(U[nu], nu,
+                                    Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+            mu);
      }
    }
  }

+  //////////////////////////////////////////////////
+  // the sum over all staples on each site in direction mu,nu, upper part
+  //////////////////////////////////////////////////
+  static void StapleUpper(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
+                          int nu) {
+    // For nu == mu nothing is added and the result stays zero.
+    staple = zero;
+
+    if (nu != mu) {
+      GridBase *grid = Umu._grid;
+      // Sized by Nd (not a literal 4) to match the d < Nd fill loop.
+      std::vector<GaugeMat> U(Nd, grid);
+      for (int d = 0; d < Nd; d++) {
+        U[d] = PeekIndex<LorentzIndex>(Umu, d);
+      }
+
+      // mu
+      // ^
+      // |__>  nu
+
+      //    __
+      //      |
+      //    __|
+      //
+
+      staple += Gimpl::ShiftStaple(
+          Gimpl::CovShiftForward(
+              U[nu], nu,
+              Gimpl::CovShiftBackward(
+                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+          mu);
+    }
+  }
+
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
-  static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
-  {
-    rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
-	adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
-    rect = rect + 
-          Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
-      adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
+  static void dirRectangle(GaugeMat &rect, const std::vector<GaugeMat> &U,
+                           const int mu, const int nu) {
+    rect = Gimpl::CovShiftForward(
+               U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->|
+           adj(Gimpl::CovShiftForward(
+               U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu])));
+    rect = rect +
+           Gimpl::CovShiftForward(
+               U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->||
+               adj(Gimpl::CovShiftForward(
+                   U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
  }
-  static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
-  {
+  static void traceDirRectangle(LatticeComplex &rect,
+                                const std::vector<GaugeMat> &U, const int mu,
+                                const int nu) {
    GaugeMat sp(U[0]._grid);
-    dirRectangle(sp,U,mu,nu);
-    rect=trace(sp);
+    dirRectangle(sp, U, mu, nu);
+    rect = trace(sp);
  }
-  static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
-  {
+  static void siteRectangle(LatticeComplex &Rect,
+                            const std::vector<GaugeMat> &U) {
    LatticeComplex siteRect(U[0]._grid);
-    Rect=zero;
-    for(int mu=1;mu<Nd;mu++){
-      for(int nu=0;nu<mu;nu++){
-	traceDirRectangle(siteRect,U,mu,nu);
-	Rect = Rect + siteRect;
+    Rect = zero;
+    for (int mu = 1; mu < Nd; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceDirRectangle(siteRect, U, mu, nu);
+        Rect = Rect + siteRect;
      }
    }
  }
+
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
-  static RealD sumRectangle(const GaugeLorentz &Umu){
-    std::vector<GaugeMat> U(Nd,Umu._grid);
+  static RealD sumRectangle(const GaugeLorentz &Umu) {
+    std::vector<GaugeMat> U(Nd, Umu._grid);

-    for(int mu=0;mu<Nd;mu++){
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

    LatticeComplex Rect(Umu._grid);
-    
-    siteRectangle(Rect,U);
-    
+
+    siteRectangle(Rect, U);
+
    TComplex Tp = sum(Rect);
-    Complex p  = TensorRemove(Tp);
+    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
-  static RealD avgRectangle(const GaugeLorentz &Umu){
+  static RealD avgRectangle(const GaugeLorentz &Umu) {

    RealD sumrect = sumRectangle(Umu);
-    
+
    double vol = Umu._grid->gSites();
-    
-    double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
-    
-    return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
+
+    double faces = (1.0 * Nd * (Nd - 1)); // 2 distinct orientations summed
+
+    return sumrect / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }

  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
-  static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
-    U2 = U * Cshift(U,mu,1);
+  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
+    U2 = U * Cshift(U, mu, 1);
  }

  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
+  // Hop by two optimisation strategy does not work nicely with Gparity.
+  // (could do,
  // but need to track two deep where cross boundary and apply a conjugation).
-  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
+  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField
+  // to do so.
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
+  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
+                                  std::vector<GaugeMat> &U, int mu) {

    Stap = zero;

    GridBase *grid = U[0]._grid;

-    GaugeMat Staple2x1 (grid);
-    GaugeMat tmp (grid);
+    GaugeMat Staple2x1(grid);
+    GaugeMat tmp(grid);

-    for(int nu=0;nu<Nd;nu++){
-      if ( nu!=mu) {
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {

-	// Up staple    ___ ___ 
-	//             |       |
-	tmp = Cshift(adj(U[nu]),nu,-1); 
-	tmp = adj(U2[mu])*tmp;
-	tmp = Cshift(tmp,mu,-2);
+        // Up staple    ___ ___
+        //             |       |
+        tmp = Cshift(adj(U[nu]), nu, -1);
+        tmp = adj(U2[mu]) * tmp;
+        tmp = Cshift(tmp, mu, -2);

-	Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
+        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);

+        // Down staple
+        //             |___ ___|
+        //
+        tmp = adj(U2[mu]) * U[nu];
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));

-	// Down staple
-	//             |___ ___|
-	//
-	tmp = adj(U2[mu])*U[nu];
-	Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
+        //              ___ ___
+        //             |    ___|
+        //             |___ ___|
+        //

+        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);

-	//              ___ ___
-	//             |    ___|
-	//             |___ ___|
-	//
+        //              ___ ___
+        //             |___    |
+        //             |___ ___|
+        //

-	Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
+        // Disabled alternative form of this update (presumably equivalent):
+        //   tmp= Staple2x1* Cshift(U[mu],mu,-2);
+        //   Stap+= Cshift(tmp,mu,1) ;
+        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);

-	//              ___ ___
-	//             |___    |
-	//             |___ ___|
-	//
+        //       --
+        //      |  |
+        //
+        //      |  |

-	//	tmp= Staple2x1* Cshift(U[mu],mu,-2);
-	//	Stap+= Cshift(tmp,mu,1) ;
-	Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
+        tmp = Cshift(adj(U2[nu]), nu, -2);
+        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
+        tmp = U2[nu] * Cshift(tmp, nu, 2);
+        Stap += Cshift(tmp, mu, 1);

-	//       --    
-	//      |  |              
-	//          
-	//      |  | 
-	
-	tmp = Cshift(adj(U2[nu]),nu,-2);
-	tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
-	tmp = U2[nu]*Cshift(tmp,nu,2);
-	Stap+= Cshift(tmp, mu, 1);
+        //      |  |
+        //
+        //      |  |
+        //       --

-	//      |  |              
-	//          
-	//      |  | 
-	//       -- 
-	
-	tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
-	tmp = adj(U2[nu])*tmp;
-	tmp = Cshift(tmp,nu,-2);
-	Stap+=Cshift(tmp, mu, 1);
-    }}
-
-
-  }
-
-  static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
-  {
-    RectStapleUnoptimised(Stap,Umu,mu);
-  }
-  static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
-			 std::vector<GaugeMat> &U2,
-			 std::vector<GaugeMat> &U, int mu)
-  {
-    if ( Gimpl::isPeriodicGaugeField() ){ 
-      RectStapleOptimised(Stap,U2,U,mu);
-    } else {
-      RectStapleUnoptimised(Stap,Umu,mu);
+        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
+        tmp = adj(U2[nu]) * tmp;
+        tmp = Cshift(tmp, nu, -2);
+        Stap += Cshift(tmp, mu, 1);
+      }
    }
  }

-  static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
+  // Rectangle staple for direction mu, always via the unoptimised path.
+  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
+    RectStapleUnoptimised(Stap, Umu, mu);
+  }
+  // As above, but takes the hop-by-two path when the gauge field is periodic.
+  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
+                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
+                         int mu) {
+    if (!Gimpl::isPeriodicGaugeField())
+      return RectStapleUnoptimised(Stap, Umu, mu);
+    RectStapleOptimised(Stap, U2, U, mu);
+  }
+
+  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
+                                    int mu) {
    GridBase *grid = Umu._grid;

-    std::vector<GaugeMat> U(Nd,grid);
-    for(int d=0;d<Nd;d++){
-      U[d] = PeekIndex<LorentzIndex>(Umu,d);
+    std::vector<GaugeMat> U(Nd, grid);
+    for (int d = 0; d < Nd; d++) {
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }

-    Stap=zero;
+    Stap = zero;

-    for(int nu=0;nu<Nd;nu++){
-      if ( nu!=mu) {
-    //           __ ___ 
-    //          |    __ |
-    //
-    Stap+= Gimpl::ShiftStaple(
-		  Gimpl::CovShiftForward (U[mu],mu,
-		  Gimpl::CovShiftForward (U[nu],nu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
-                  Gimpl::CovShiftBackward(U[mu],mu,
-		  Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {
+        //           __ ___
+        //          |    __ |
+        //
+        Stap += Gimpl::ShiftStaple(
+            Gimpl::CovShiftForward(
+                U[mu], mu,
+                Gimpl::CovShiftForward(
+                    U[nu], nu,
+                    Gimpl::CovShiftBackward(
+                        U[mu], mu,
+                        Gimpl::CovShiftBackward(
+                            U[mu], mu,
+                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+            mu);

-    //              __ 
-    //          |__ __ |
+        //              __
+        //          |__ __ |

-    Stap+= Gimpl::ShiftStaple(
-                  Gimpl::CovShiftForward (U[mu],mu,
-		  Gimpl::CovShiftBackward(U[nu],nu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
-                  Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
+        Stap += Gimpl::ShiftStaple(
+            Gimpl::CovShiftForward(
+                U[mu], mu,
+                Gimpl::CovShiftBackward(
+                    U[nu], nu,
+                    Gimpl::CovShiftBackward(
+                        U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
+            mu);

-    //           __ 
-    //          |__ __ |
+        //           __
+        //          |__ __ |

-    Stap+= Gimpl::ShiftStaple(
-		  Gimpl::CovShiftBackward(U[nu],nu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
-		  Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
+        Stap += Gimpl::ShiftStaple(
+            Gimpl::CovShiftBackward(
+                U[nu], nu,
+                Gimpl::CovShiftBackward(
+                    U[mu], mu,
+                    Gimpl::CovShiftBackward(
+                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
+            mu);

-    //           __ ___ 
-    //          |__    |
+        //           __ ___
+        //          |__    |

-    Stap+= Gimpl::ShiftStaple(
-		   Gimpl::CovShiftForward (U[nu],nu,
-	           Gimpl::CovShiftBackward(U[mu],mu,
-                   Gimpl::CovShiftBackward(U[mu],mu,
-                   Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
+        Stap += Gimpl::ShiftStaple(
+            Gimpl::CovShiftForward(
+                U[nu], nu,
+                Gimpl::CovShiftBackward(
+                    U[mu], mu,
+                    Gimpl::CovShiftBackward(
+                        U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
+            mu);

-     //       --    
-     //      |  |              
-     //          
-     //      |  | 
-     
-    Stap+= Gimpl::ShiftStaple(
-		   Gimpl::CovShiftForward(U[nu],nu,
-		   Gimpl::CovShiftForward(U[nu],nu,
-                   Gimpl::CovShiftBackward(U[mu],mu,
-                   Gimpl::CovShiftBackward(U[nu],nu,
-		   Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
+        //       --
+        //      |  |
+        //
+        //      |  |

+        Stap += Gimpl::ShiftStaple(
+            Gimpl::CovShiftForward(
+                U[nu], nu,
+                Gimpl::CovShiftForward(
+                    U[nu], nu,
+                    Gimpl::CovShiftBackward(
+                        U[mu], mu,
+                        Gimpl::CovShiftBackward(
+                            U[nu], nu,
+                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+            mu);

-     //      |  |              
-     //          
-     //      |  | 
-     //       -- 
-     
-    Stap+= Gimpl::ShiftStaple(
-		   Gimpl::CovShiftBackward(U[nu],nu,
-		   Gimpl::CovShiftBackward(U[nu],nu,
-                   Gimpl::CovShiftBackward(U[mu],mu,
-                   Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
-    }}
+        //      |  |
+        //
+        //      |  |
+        //       --
+
+        Stap += Gimpl::ShiftStaple(
+            Gimpl::CovShiftBackward(
+                U[nu], nu,
+                Gimpl::CovShiftBackward(
+                    U[nu], nu,
+                    Gimpl::CovShiftBackward(
+                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
+            mu);
+      }
+    }
  }
-
-
 };

+using ColourWilsonLoops = WilsonLoops<PeriodicGimplR>; // all four: same type
+using U1WilsonLoops = WilsonLoops<PeriodicGimplR>;
+using SU2WilsonLoops = WilsonLoops<PeriodicGimplR>;
+using SU3WilsonLoops = WilsonLoops<PeriodicGimplR>;
+}
+}

- typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
- typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
- typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
- typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
-
-}}
-
-#endif
+#endif
--- a/lib/serialisation/.dirstamp
+++ b/lib/serialisation/.dirstamp
--- a/lib/serialisation/Serialisation.h
+++ b/lib/serialisation/Serialisation.h
@@ -29,18 +29,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_SERIALISATION_READER_H
 #define GRID_SERIALISATION_READER_H

-#include <serialisation/MacroMagic.h>
-#include <serialisation/BaseIO.h>
 #include <stdint.h>

+#include "MacroMagic.h"
+#include "BaseIO.h"
+#include "BinaryIO.h"
+#include "TextIO.h"
+#include "XmlIO.h"
 //////////////////////////////////////////
 // Todo:
 //////////////////////////////////////////
-#include <serialisation/BinaryIO.h>
-#include <serialisation/TextIO.h>
-//#include <serialisation/JsonIO.h>
-//#include <serialisation/YamlIO.h>
-#include <serialisation/XmlIO.h>
+//#include "JsonIO.h"
+//#include "YamlIO.h"

 //////////////////////////////////////////
 // Select the default serialiser use ifdef's
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -38,7 +38,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <vector>
 #include <cassert>

-#include "pugixml/pugixml.h"
+#include <Grid/pugixml/pugixml.h>

 namespace Grid
 {
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -1,227 +1,234 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/simd/Grid_vector_unops.h
+Source file: ./lib/simd/Grid_vector_unops.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_VECTOR_UNOPS
 #define GRID_VECTOR_UNOPS

 #include <cmath>

-namespace Grid { 
+namespace Grid {

-  template<class scalar> struct SqrtRealFunctor {
-    scalar operator()(const scalar &a) const {
-      return sqrt(real(a));
-    }
-  };
+template <class scalar>
+struct SqrtRealFunctor {
+  scalar operator()(const scalar &a) const { return sqrt(real(a)); }
+};

-  template<class scalar> struct RSqrtRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return scalar(1.0/sqrt(real(a)));
-    }
-  };
+template <class scalar>
+struct RSqrtRealFunctor {  // unary functor: reciprocal square root of real(a)
+  scalar operator()(const scalar &a) const {
+    return scalar(1.0 / sqrt(real(a)));  // only real(a) is read; result is converted back to scalar
+  }
+};

-  template<class scalar> struct CosRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return cos(real(a));
-    }
-  };
+template <class scalar>
+struct CosRealFunctor {
+  scalar operator()(const scalar &a) const { return cos(real(a)); }
+};

-  template<class scalar> struct SinRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return sin(real(a));
-    }
-  };
+template <class scalar>
+struct SinRealFunctor {
+  scalar operator()(const scalar &a) const { return sin(real(a)); }
+};

-  template<class scalar> struct LogRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return log(real(a));
-    }
-  };
+template <class scalar>
+struct AcosRealFunctor {
+  scalar operator()(const scalar &a) const { return acos(real(a)); }
+};

-  template<class scalar> struct ExpRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return exp(real(a));
-    }
-  };
-  template<class scalar> struct NotFunctor {
-    scalar operator()(const scalar &a)  const {
-      return (!a);
-    }
-  };
-  template<class scalar> struct AbsRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return std::abs(real(a));
-    }
-  };
+template <class scalar>
+struct AsinRealFunctor {
+  scalar operator()(const scalar &a) const { return asin(real(a)); }
+};

-  template<class scalar> struct PowRealFunctor {
-    double y;
-  PowRealFunctor(double _y) : y(_y) {};
-    scalar operator()(const scalar &a)  const {
-      return pow(real(a),y);
-    }
-  };
+template <class scalar>
+struct LogRealFunctor {
+  scalar operator()(const scalar &a) const { return log(real(a)); }
+};

-  template<class scalar> struct ModIntFunctor {
-    Integer y;
-  ModIntFunctor(Integer _y) : y(_y) {};
-    scalar operator()(const scalar &a)  const {
-      return Integer(a)%y;
-    }
-  };
+template <class scalar>
+struct ExpRealFunctor {
+  scalar operator()(const scalar &a) const { return exp(real(a)); }
+};
+template <class scalar>
+struct NotFunctor {
+  scalar operator()(const scalar &a) const { return (!a); }
+};
+template <class scalar>
+struct AbsRealFunctor {
+  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
+};

-  template<class scalar> struct DivIntFunctor {
-    Integer y;
-  DivIntFunctor(Integer _y) : y(_y) {};
-    scalar operator()(const scalar &a)  const {
-      return Integer(a)/y;
-    }
-  };
+template <class scalar>
+struct PowRealFunctor {  // unary functor: raises real(a) to the stored power y
+  double y;  // exponent, set once by the constructor
+  PowRealFunctor(double _y) : y(_y){};
+  scalar operator()(const scalar &a) const { return pow(real(a), y); }
+};

-  template<class scalar> struct RealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return real(a);
-    }
-  };
-  template<class scalar> struct ImagFunctor {
-    scalar operator()(const scalar &a)  const {
-      return imag(a);
-    }
-  };
-  template < class S, class V > 
-  inline Grid_simd<S,V> real(const Grid_simd<S,V> &r) {
-    return SimdApply(RealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> imag(const Grid_simd<S,V> &r) {
-    return SimdApply(ImagFunctor<S>(),r);
-  }
+template <class scalar>
+struct ModIntFunctor {  // unary functor: Integer(a) modulo the stored y
+  Integer y;  // modulus, set once by the constructor; y == 0 is not checked here
+  ModIntFunctor(Integer _y) : y(_y){};
+  scalar operator()(const scalar &a) const { return Integer(a) % y; }  // a is converted to Integer first
+};

-  template < class S, class V > 
-  inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
-    return SimdApply(SqrtRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> rsqrt(const Grid_simd<S,V> &r) {
-    return SimdApply(RSqrtRealFunctor<S>(),r);
-  }
-  template < class Scalar > 
-  inline Scalar rsqrt(const Scalar &r) {
-    return (RSqrtRealFunctor<Scalar>(),r);
-  }
+template <class scalar>
+struct DivIntFunctor {  // unary functor: Integer(a) divided by the stored y
+  Integer y;  // divisor, set once by the constructor; y == 0 is not checked here
+  DivIntFunctor(Integer _y) : y(_y){};
+  scalar operator()(const scalar &a) const { return Integer(a) / y; }  // a is converted to Integer first
+};

-  template < class S, class V > 
-  inline Grid_simd<S,V> cos(const Grid_simd<S,V> &r) {
-    return SimdApply(CosRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> sin(const Grid_simd<S,V> &r) {
-    return SimdApply(SinRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
-    return SimdApply(LogRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> abs(const Grid_simd<S,V> &r) {
-    return SimdApply(AbsRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> exp(const Grid_simd<S,V> &r) {
-    return SimdApply(ExpRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> Not(const Grid_simd<S,V> &r) {
-    return SimdApply(NotFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
-    return SimdApply(PowRealFunctor<S>(y),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
-    return SimdApply(ModIntFunctor<S>(y),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) {
-    return SimdApply(DivIntFunctor<S>(y),r);
-  }
-  ////////////////////////////////////////////////////////////////////////////
-  // Allows us to assign into **conformable** real vectors from complex
-  ////////////////////////////////////////////////////////////////////////////
-  //  template < class S, class V > 
-  //  inline auto ComplexRemove(const Grid_simd<S,V> &c) -> Grid_simd<Grid_simd<S,V>::Real,V> {
-  //    Grid_simd<Grid_simd<S,V>::Real,V> ret;
-  //    ret.v = c.v;
-  //    return ret;
-  //  }
-  template<class scalar> struct AndFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x & y;
-    }
-  };
-  template<class scalar> struct OrFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x | y;
-    }
-  };
-  template<class scalar> struct AndAndFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x && y;
-    }
-  };
-  template<class scalar> struct OrOrFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x || y;
-    }
-  };
+template <class scalar>
+struct RealFunctor {  // unary functor wrapping std::real
+  scalar operator()(const scalar &a) const { return std::real(a); }  // result is converted back to scalar
+};
+template <class scalar>
+struct ImagFunctor {  // unary functor wrapping std::imag
+  scalar operator()(const scalar &a) const { return std::imag(a); }  // result is converted back to scalar
+};
+template <class S, class V>
+inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
+  return SimdApply(RealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> imag(const Grid_simd<S, V> &r) {
+  return SimdApply(ImagFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
+  return SimdApply(SqrtRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
+  return SimdApply(RSqrtRealFunctor<S>(), r);
+}
+template <class Scalar>  // non-SIMD overload: returns Scalar(1.0 / sqrt(real(r))) via RSqrtRealFunctor
+inline Scalar rsqrt(const Scalar &r) {
+  return RSqrtRealFunctor<Scalar>()(r);  // invoke the functor; "(functor, r)" was a comma expression that returned r unchanged
+}

-  ////////////////////////////////
-  // Calls to simd binop functors
-  ////////////////////////////////
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator &(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(AndFunctor<S>(),x,y);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator &&(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(AndAndFunctor<S>(),x,y);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator |(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(OrFunctor<S>(),x,y);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator ||(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(OrOrFunctor<S>(),x,y);
-  }
+template <class S, class V>
+inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
+  return SimdApply(CosRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> sin(const Grid_simd<S, V> &r) {
+  return SimdApply(SinRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> acos(const Grid_simd<S, V> &r) {
+  return SimdApply(AcosRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> asin(const Grid_simd<S, V> &r) {
+  return SimdApply(AsinRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> log(const Grid_simd<S, V> &r) {
+  return SimdApply(LogRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
+  return SimdApply(AbsRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
+  return SimdApply(ExpRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
+  return SimdApply(NotFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> pow(const Grid_simd<S, V> &r, double y) {
+  return SimdApply(PowRealFunctor<S>(y), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> mod(const Grid_simd<S, V> &r, Integer y) {
+  return SimdApply(ModIntFunctor<S>(y), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
+  return SimdApply(DivIntFunctor<S>(y), r);
+}
+////////////////////////////////////////////////////////////////////////////
+// Allows us to assign into **conformable** real vectors from complex
+////////////////////////////////////////////////////////////////////////////
+//  template < class S, class V >
+//  inline auto ComplexRemove(const Grid_simd<S,V> &c) ->
+//  Grid_simd<Grid_simd<S,V>::Real,V> {
+//    Grid_simd<Grid_simd<S,V>::Real,V> ret;
+//    ret.v = c.v;
+//    return ret;
+//  }
+template <class scalar>
+struct AndFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
+};
+template <class scalar>
+struct OrFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x | y; }
+};
+template <class scalar>
+struct AndAndFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x && y; }
+};
+template <class scalar>
+struct OrOrFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x || y; }
+};

+////////////////////////////////
+// Calls to simd binop functors
+////////////////////////////////
+template <class S, class V>
+inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
+                                 const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(AndFunctor<S>(), x, y);
+}
+template <class S, class V>
+inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
+                                  const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(AndAndFunctor<S>(), x, y);
+}
+template <class S, class V>
+inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
+                                 const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(OrFunctor<S>(), x, y);
+}
+template <class S, class V>
+inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
+                                  const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(OrOrFunctor<S>(), x, y);
+}
 }
 #endif
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -90,8 +90,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include "Intel512common.h"
+#include "Intel512avx.h"

 //////////////////////////////////////////////////////////////////
 // Macros used to build wilson kernel -- can rationalise and simplify
--- a/lib/tensors/Tensor_arith.h
+++ b/lib/tensors/Tensor_arith.h
@@ -28,11 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_MATH_ARITH_H
 #define GRID_MATH_ARITH_H

-#include <tensors/Tensor_arith_add.h>
-#include <tensors/Tensor_arith_sub.h>
-#include <tensors/Tensor_arith_mac.h>
-#include <tensors/Tensor_arith_mul.h>
-#include <tensors/Tensor_arith_scalar.h>
+#include "Tensor_arith_add.h"
+#include "Tensor_arith_sub.h"
+#include "Tensor_arith_mac.h"
+#include "Tensor_arith_mul.h"
+#include "Tensor_arith_scalar.h"

 #endif

--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -1,31 +1,32 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/tensors/Tensor_class.h
+Source file: ./lib/tensors/Tensor_class.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_MATH_TENSORS_H
 #define GRID_MATH_TENSORS_H

@@ -38,17 +39,18 @@ namespace Grid {

 // It is useful to NOT have any constructors
 // so that these classes assert "is_pod<class> == true"
-// because then the standard C++ valarray container eliminates fill overhead on new allocation and 
+// because then the standard C++ valarray container eliminates fill overhead on
+// new allocation and
 // non-move copying.
 //
-// However note that doing this eliminates some syntactical sugar such as 
+// However note that doing this eliminates some syntactical sugar such as
 // calling the constructor explicitly or implicitly
 //
 class GridTensorBase {};

-template<class vtype> class iScalar 
-{
-public:
+template <class vtype>
+class iScalar {
+ public:
  vtype _internal;

  typedef vtype element;
@@ -60,13 +62,14 @@ public:
  typedef iScalar<recurse_scalar_object> scalar_object;

  // substitutes a real or complex version with same tensor structure
-  typedef iScalar<typename GridTypeMapper<vtype>::Complexified > Complexified;
-  typedef iScalar<typename GridTypeMapper<vtype>::Realified >    Realified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified;

-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };

  // Scalar no action
-  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
+  //  template<int Level> using tensor_reduce_level = typename
+  //  iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
  iScalar() = default;
  /*
  iScalar(const iScalar<vtype> &copyme)=default;
@@ -74,83 +77,112 @@ public:
  iScalar<vtype> & operator= (const iScalar<vtype> &copyme) = default;
  iScalar<vtype> & operator= (iScalar<vtype> &&copyme) = default;
  */
-  iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
-  iScalar(const Zero &z){ *this = zero; };
+  iScalar(scalar_type s)
+      : _internal(s){};  // recurse down and hit the constructor for vector_type
+  iScalar(const Zero &z) { *this = zero; };

-  iScalar<vtype> & operator= (const Zero &hero){
+  iScalar<vtype> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
-    vstream(out._internal,in._internal);
+  friend strong_inline void vstream(iScalar<vtype> &out,
+                                    const iScalar<vtype> &in) {
+    vstream(out._internal, in._internal);
+  }
+  friend strong_inline void vbroadcast(iScalar<vtype> &out,const iScalar<vtype> &in,int lane){
+    vbroadcast(out._internal,in._internal,lane);
  }
  friend strong_inline void zeroit(iScalar<vtype> &that){
    zeroit(that._internal);
  }
-  friend strong_inline void prefetch(iScalar<vtype> &that){
+  friend strong_inline void prefetch(iScalar<vtype> &that) {
    prefetch(that._internal);
  }
-  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
-    permute(out._internal,in._internal,permutetype);
+  friend strong_inline void permute(iScalar<vtype> &out,
+                                    const iScalar<vtype> &in, int permutetype) {
+    permute(out._internal, in._internal, permutetype);
+  }
+  friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
+    rotate(out._internal,in._internal,rot);
  }

  // Unary negation
-  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
+  friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
    iScalar<vtype> ret;
-    ret._internal= -r._internal;
+    ret._internal = -r._internal;
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
-    *this = (*this)*r;
+  strong_inline iScalar<vtype> &operator*=(const iScalar<vtype> &r) {
+    *this = (*this) * r;
    return *this;
  }
-  strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
-    *this = (*this)-r;
+  strong_inline iScalar<vtype> &operator-=(const iScalar<vtype> &r) {
+    *this = (*this) - r;
    return *this;
  }
-  strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
-    *this = (*this)+r;
+  strong_inline iScalar<vtype> &operator+=(const iScalar<vtype> &r) {
+    *this = (*this) + r;
    return *this;
  }
-  strong_inline vtype & operator ()(void) {
-    return _internal;
-  }
-  strong_inline const vtype & operator ()(void) const {
-    return _internal;
-  }
+  strong_inline vtype &operator()(void) { return _internal; }
+  strong_inline const vtype &operator()(void) const { return _internal; }

  // Type casts meta programmed, must be pure scalar to match TensorRemove
-  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0> operator ComplexF () const { return(TensorRemove(_internal)); };
-  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0> operator ComplexD () const { return(TensorRemove(_internal)); };
-  //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0> operator RealD    () const { return(real(TensorRemove(_internal))); }
-  template<class U=vtype,class V=scalar_type,IfReal<V>    = 0,IfNotSimd<U> = 0> operator RealD    () const { return TensorRemove(_internal); }
-  template<class U=vtype,class V=scalar_type,IfInteger<V> = 0,IfNotSimd<U> = 0> operator Integer  () const { return Integer(TensorRemove(_internal)); }
-  
-  // convert from a something to a scalar via constructor of something arg
-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline iScalar<vtype> operator = (T arg)
-    { 
-      _internal = arg;
-      return *this;
-    }
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
+            IfNotSimd<U> = 0>
+  operator ComplexF() const {
+    return (TensorRemove(_internal));
+  };
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
+            IfNotSimd<U> = 0>
+  operator ComplexD() const {
+    return (TensorRemove(_internal));
+  };
+  //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> =
+  //  0> operator RealD    () const { return(real(TensorRemove(_internal))); }
+  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,
+            IfNotSimd<U> = 0>
+  operator RealD() const {
+    return TensorRemove(_internal);
+  }
+  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0,
+            IfNotSimd<U> = 0>
+  operator Integer() const {
+    return Integer(TensorRemove(_internal));
+  }

-    friend std::ostream& operator<< (std::ostream& stream, const iScalar<vtype> &o){
-      stream<< "S {"<<o._internal<<"}";
-      return stream;
-    };
+  // convert from a something to a scalar via constructor of something arg
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
+                         * = nullptr>
+  strong_inline iScalar<vtype> operator=(T arg) {
+    _internal = arg;
+    return *this;
+  }
+
+  friend std::ostream &operator<<(std::ostream &stream,
+                                  const iScalar<vtype> &o) {
+    stream << "S {" << o._internal << "}";
+    return stream;
+  };
 };
 ///////////////////////////////////////////////////////////
 // Allows to turn scalar<scalar<scalar<double>>>> back to double.
 ///////////////////////////////////////////////////////////
-template<class T>     strong_inline typename std::enable_if<!isGridTensor<T>::value, T>::type TensorRemove(T arg) { return arg;}
-template<class vtype> strong_inline auto TensorRemove(iScalar<vtype> arg) -> decltype(TensorRemove(arg._internal))
-{
+template <class T>
+strong_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
+TensorRemove(T arg) {
+  return arg;
+}
+template <class vtype>
+strong_inline auto TensorRemove(iScalar<vtype> arg)
+    -> decltype(TensorRemove(arg._internal)) {
  return TensorRemove(arg._internal);
 }
-    
-template<class vtype,int N> class iVector 
-{
-public:
+
+template <class vtype, int N>
+class iVector {
+ public:
  vtype _internal[N];

  typedef vtype element;
@@ -159,23 +191,23 @@ public:
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iVector<recurse_scalar_object,N> scalar_object;
+  typedef iVector<recurse_scalar_object, N> scalar_object;

  // substitutes a real or complex version with same tensor structure
-  typedef iVector<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
-  typedef iVector<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+  typedef iVector<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
+  typedef iVector<typename GridTypeMapper<vtype>::Realified, N> Realified;

-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iVector<vtype,N>
-    { 
-      zeroit(*this);
-      for(int i=0;i<N;i++)
-	_internal[i] = arg;
-      return *this;
-    }
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
+                         * = nullptr>
+  strong_inline auto operator=(T arg) -> iVector<vtype, N> {
+    zeroit(*this);
+    for (int i = 0; i < N; i++) _internal[i] = arg;
+    return *this;
+  }

-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
-  iVector(const Zero &z){ *this = zero; };
-  iVector() =default;
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
+  iVector(const Zero &z) { *this = zero; };
+  iVector() = default;
  /*
  iVector(const iVector<vtype,N> &copyme)=default;
  iVector(iVector<vtype,N> &&copyme)=default;
@@ -183,21 +215,27 @@ public:
  iVector<vtype,N> & operator= (iVector<vtype,N> &&copyme) = default;
  */

-  iVector<vtype,N> & operator= (const Zero &hero){
+  iVector<vtype, N> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  friend strong_inline void zeroit(iVector<vtype,N> &that){
-    for(int i=0;i<N;i++){
+  friend strong_inline void zeroit(iVector<vtype, N> &that) {
+    for (int i = 0; i < N; i++) {
      zeroit(that._internal[i]);
    }
  }
-  friend strong_inline void prefetch(iVector<vtype,N> &that){
-    for(int i=0;i<N;i++) prefetch(that._internal[i]);
+  friend strong_inline void prefetch(iVector<vtype, N> &that) {
+    for (int i = 0; i < N; i++) prefetch(that._internal[i]);
  }
-  friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
+  friend strong_inline void vstream(iVector<vtype, N> &out,
+                                    const iVector<vtype, N> &in) {
+    for (int i = 0; i < N; i++) {
+      vstream(out._internal[i], in._internal[i]);
+    }
+  }
+  friend strong_inline void vbroadcast(iVector<vtype,N> &out,const iVector<vtype,N> &in,int lane){
    for(int i=0;i<N;i++){
-      vstream(out._internal[i],in._internal[i]);
+      vbroadcast(out._internal[i],in._internal[i],lane);
    }
  }
  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
@@ -205,49 +243,51 @@ public:
      permute(out._internal[i],in._internal[i],permutetype);
    }
  }
+  friend strong_inline void rotate(iVector<vtype,N> &out,const iVector<vtype,N> &in,int rot){
+    for(int i=0;i<N;i++){
+      rotate(out._internal[i],in._internal[i],rot);
+    }
+  }

  // Unary negation
-  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
-    iVector<vtype,N> ret;
-    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+  friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
+    iVector<vtype, N> ret;
+    for (int i = 0; i < N; i++) ret._internal[i] = -r._internal[i];
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
-    *this = (*this)*r;
+  strong_inline iVector<vtype, N> &operator*=(const iScalar<vtype> &r) {
+    *this = (*this) * r;
    return *this;
  }
-  strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
-    *this = (*this)-r;
+  strong_inline iVector<vtype, N> &operator-=(const iVector<vtype, N> &r) {
+    *this = (*this) - r;
    return *this;
  }
-  strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
-    *this = (*this)+r;
+  strong_inline iVector<vtype, N> &operator+=(const iVector<vtype, N> &r) {
+    *this = (*this) + r;
    return *this;
  }
-  strong_inline vtype & operator ()(int i) {
-    return _internal[i];
-  }
-  strong_inline const vtype & operator ()(int i) const {
-    return _internal[i];
-  }
-  friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
-    stream<< "V<"<<N<<">{";
-    for(int i=0;i<N;i++) {
-      stream<<o._internal[i];
-      if (i<N-1)	stream<<",";
+  strong_inline vtype &operator()(int i) { return _internal[i]; }
+  strong_inline const vtype &operator()(int i) const { return _internal[i]; }
+  friend std::ostream &operator<<(std::ostream &stream,
+                                  const iVector<vtype, N> &o) {
+    stream << "V<" << N << ">{";
+    for (int i = 0; i < N; i++) {
+      stream << o._internal[i];
+      if (i < N - 1) stream << ",";
    }
-    stream<<"}";
+    stream << "}";
    return stream;
  };
  //    strong_inline vtype && operator ()(int i) {
  //      return _internal[i];
  //    }
 };
-    
-template<class vtype,int N> class iMatrix 
-{
-public:
+
+template <class vtype, int N>
+class iMatrix {
+ public:
  vtype _internal[N][N];

  typedef vtype element;
@@ -257,29 +297,27 @@ public:
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;

  // substitutes a real or complex version with same tensor structure
-  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
-  typedef iMatrix<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Realified, N> Realified;

  // Tensure removal
  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iMatrix<recurse_scalar_object,N> scalar_object;
+  typedef iMatrix<recurse_scalar_object, N> scalar_object;

-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };

+  iMatrix(const Zero &z) { *this = zero; };
+  iMatrix() = default;

-  iMatrix(const Zero &z){ *this = zero; };
-  iMatrix() =default;
-  
-  iMatrix& operator=(const iMatrix& rhs){
-    for(int i=0;i<N;i++)
-      for(int j=0;j<N;j++)
-	vstream(_internal[i][j],rhs._internal[i][j]);
+  iMatrix &operator=(const iMatrix &rhs) {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) vstream(_internal[i][j], rhs._internal[i][j]);
    return *this;
-  }; 
-  
- 
+  };

-  iMatrix(scalar_type s)  { (*this) = s ;};// recurse down and hit the constructor for vector_type
+  iMatrix(scalar_type s) {
+    (*this) = s;
+  };  // recurse down and hit the constructor for vector_type

  /*
  iMatrix(const iMatrix<vtype,N> &copyme)=default;
@@ -288,19 +326,17 @@ public:
  iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
  */

-
-
-  iMatrix<vtype,N> & operator= (const Zero &hero){
+  iMatrix<vtype, N> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iMatrix<vtype,N>
-    { 
-      zeroit(*this);
-      for(int i=0;i<N;i++)
-	_internal[i][i] = arg;
-      return *this;
-    }
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
+                         * = nullptr>
+  strong_inline auto operator=(T arg) -> iMatrix<vtype, N> {
+    zeroit(*this);
+    for (int i = 0; i < N; i++) _internal[i][i] = arg;
+    return *this;
+  }

  friend strong_inline void zeroit(iMatrix<vtype,N> &that){
    for(int i=0;i<N;i++){
@@ -318,7 +354,13 @@ public:
      for(int j=0;j<N;j++){
 	vstream(out._internal[i][j],in._internal[i][j]);
      }}
-    }
+  }
+  // Element-wise vbroadcast: hands each (row,col) entry of out and in to the element-level vbroadcast with the same lane.
+  friend strong_inline void vbroadcast(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int lane){
+    for(int row=0;row<N;row++)
+      for(int col=0;col<N;col++)
+        vbroadcast(out._internal[row][col],in._internal[row][col],lane);
+  }

  friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
@@ -326,80 +368,83 @@ public:
 	permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
  }
-
-
-  // Unary negation
-  friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
-    iMatrix<vtype,N> ret;
+  friend strong_inline void rotate(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int rot){
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
-	ret._internal[i][j]= -r._internal[i][j];
+	rotate(out._internal[i][j],in._internal[i][j],rot);
    }}
+  }
+
+  // Unary negation
+  // Returns a new matrix whose (i,j) element is -r(i,j); r itself is not modified.
+  friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
+    iMatrix<vtype, N> ret;
+    // Visit all N*N entries, row by row.
+    for (int row = 0; row < N; row++)
+      for (int col = 0; col < N; col++)
+        ret._internal[row][col] = -r._internal[row][col];
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  template<class T>
-  strong_inline iMatrix<vtype,N> &operator *=(const T &r) {
-    *this = (*this)*r;
+  template <class T>
+  strong_inline iMatrix<vtype, N> &operator*=(const T &r) {
+    *this = (*this) * r;
    return *this;
  }
-  template<class T>
-  strong_inline iMatrix<vtype,N> &operator -=(const T &r) {
-    *this = (*this)-r;
+  template <class T>
+  strong_inline iMatrix<vtype, N> &operator-=(const T &r) {
+    *this = (*this) - r;
    return *this;
  }
-  template<class T>
-  strong_inline iMatrix<vtype,N> &operator +=(const T &r) {
-    *this = (*this)+r;
+  template <class T>
+  strong_inline iMatrix<vtype, N> &operator+=(const T &r) {
+    *this = (*this) + r;
    return *this;
  }

  // returns an lvalue reference
-  strong_inline vtype & operator ()(int i,int j) {
+  strong_inline vtype &operator()(int i, int j) { return _internal[i][j]; }
+  strong_inline const vtype &operator()(int i, int j) const {
    return _internal[i][j];
  }
-  strong_inline const vtype & operator ()(int i,int j) const {
-    return _internal[i][j];
-  }
-  friend std::ostream& operator<< (std::ostream& stream, const iMatrix<vtype,N> &o){
-    stream<< "M<"<<N<<">{";
-    for(int i=0;i<N;i++) {
-      stream<< "{";
-      for(int j=0;j<N;j++) {
-	stream<<o._internal[i][j];
-	if (i<N-1)	stream<<",";
+  friend std::ostream &operator<<(std::ostream &stream,
+                                  const iMatrix<vtype, N> &o) {
+    stream << "M<" << N << ">{";  // header; rows follow, each wrapped in its own braces
+    for (int i = 0; i < N; i++) {
+      stream << "{";
+      for (int j = 0; j < N; j++) {
+        stream << o._internal[i][j];
+        if (j < N - 1) stream << ",";  // comma between columns only: keyed on j, not the row index i
      }
-      stream<<"}";
-      if(i!=N-1) stream<<"\n\t\t";
+      stream << "}";
+      if (i != N - 1) stream << "\n\t\t";  // rows are separated by a newline and two tabs
    }
-    stream<<"}";
+    stream << "}";
    return stream;
  };

  //  strong_inline vtype && operator ()(int i,int j) {
  //    return _internal[i][j];
  //  }
-
 };

-template<class v> void vprefetch(const iScalar<v> &vv)
-{
+template <class v>
+void vprefetch(const iScalar<v> &vv) {
  vprefetch(vv._internal);
 }
-template<class v,int N> void vprefetch(const iVector<v,N> &vv)
-{
-  for(int i=0;i<N;i++){
+template <class v, int N>
+void vprefetch(const iVector<v, N> &vv) {
+  for (int i = 0; i < N; i++) {
    vprefetch(vv._internal[i]);
  }
 }
-template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
-{
-  for(int i=0;i<N;i++){
-  for(int j=0;j<N;j++){
-    vprefetch(vv._internal[i][j]);
-  }}
+// Calls vprefetch on each of the N*N elements of vv, row by row.
+template <class v, int N>
+void vprefetch(const iMatrix<v, N> &vv) {
+  for (int row = 0; row < N; row++)
+    for (int col = 0; col < N; col++)
+      // Delegates to the vprefetch overload for the element type.
+      vprefetch(vv._internal[row][col]);
 }
-
-
 }
 #endif
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -10,6 +10,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@phys.columbia.edu>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -167,6 +168,33 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
  }
 }

+////////////////////////////////////////////////////////////////////////
+// Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
+////////////////////////////////////////////////////////////////////////
+template<class vobj, class sobj> inline 
+void extract1(const vobj &vec,std::vector<sobj*> &extracted, int offset)
+{
+  // Scatter the SIMD lanes of vec into extracted[i][offset], converting each word from vobj's scalar type to sobj's.
+  typedef typename vobj::scalar_type vobj_scalar_type ;
+  typedef typename vobj::vector_type vobj_vector_type ;
+  typedef typename sobj::scalar_type sobj_scalar_type ;
+  static const int words=sizeof(vobj)/sizeof(vobj_vector_type); // number of vector_type words making up one vobj
+  static const int Nsimd=vobj_vector_type::Nsimd();             // scalars held by one vector_type word
+
+  int Nextr=extracted.size();
+  if ( Nextr==0 ) return; // no destinations: nothing to write, and Nsimd/Nextr below would divide by zero
+  int s = Nsimd/Nextr;    // lane stride between consecutive destinations
+  vobj_scalar_type * vp = (vobj_scalar_type *)&vec; // view vec as a flat array of its scalars
+
+  for(int w=0;w<words;w++){
+    for(int i=0;i<Nextr;i++){
+      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
+      pointer[w] = vp[i*s+w*Nsimd]; // word w, lane i*s; the assignment performs the scalar type conversion
+    }
+  }
+}
+
+  
 ////////////////////////////////////////////////////////////////////////
 // Merge a contiguous array of scalar objects
 ////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_index.h
+++ b/lib/tensors/Tensor_index.h
@@ -146,14 +146,14 @@ class TensorIndexRecursion {
    }

  template<class vtype,int N> inline static 
-    void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal,0)),N> &arg, int i)
+    void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0],0)),N> &arg, int i)
    {
      for(int ii=0;ii<N;ii++){
 	TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii],arg._internal[ii],i);
      }
    }
  template<class vtype,int N> inline static 
-    void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal,0)),N> &arg, int i,int j)
+    void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0],0)),N> &arg, int i,int j)
    {
      for(int ii=0;ii<N;ii++){
 	TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii],arg._internal[ii],i,j);
@@ -161,7 +161,7 @@ class TensorIndexRecursion {
    }

  template<class vtype,int N> inline static 
-    void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal,0)),N> &arg, int i)
+    void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0][0],0)),N> &arg, int i)
    {
      for(int ii=0;ii<N;ii++){
      for(int jj=0;jj<N;jj++){
@@ -169,7 +169,7 @@ class TensorIndexRecursion {
      }}
    }
  template<class vtype,int N> inline static 
-    void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal,0)),N> &arg, int i,int j)
+    void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0][0],0)),N> &arg, int i,int j)
    {
      for(int ii=0;ii<N;ii++){
      for(int jj=0;jj<N;jj++){
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -8,6 +8,7 @@

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@phys.columbia.edu>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -230,6 +231,35 @@ namespace Grid {
    static const bool value = true;
  };

+  //Get the SIMD vector type from a Grid tensor or Lattice<Tensor>. This primary template is the identity map (type == T).
+  template<typename T>
+  struct getVectorType{
+    typedef T type; // NOTE(review): unwrapping Lattice<Tensor> presumably relies on a specialisation defined elsewhere - confirm
+  };
+  
+  //Query if a tensor or Lattice<Tensor> is SIMD vector or scalar: value is 1 when GridTypeMapper's scalar_type and vector_type differ, else 0
+  template<typename T>
+  class isSIMDvectorized{
+    template<typename U> // viable (and returning char) only when scalar_type and vector_type are different types
+    static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
+
+    template<typename U> // fallback (returning double), picked when enable_if removes the overload above
+    static double test(...);
+  
+  public:
+    enum {value = sizeof(test<T>(0)) == sizeof(char) }; // sizeof of the return type reveals which overload was chosen; test is only declared, never called
+  };
+  
+  //Get the precision of a Lattice, tensor or scalar type in units of sizeof(float)
+  template<typename T>
+  class getPrecision{
+    typedef typename getVectorType<T>::type vector_obj; //get the vector_obj (i.e. a grid Tensor) if it's a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
+  
+    typedef typename GridTypeMapper<vector_obj>::scalar_type scalar_type; //get the associated scalar type. Works on fundamental and tensor types
+    typedef typename GridTypeMapper<scalar_type>::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type
+  public:
+    enum { value = sizeof(real_scalar_type)/sizeof(float) }; // integer ratio: 1 when the real scalar has the size of float, 2 when it is twice that size
+  };
 }

 #endif
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -86,6 +86,8 @@ UNARY(sqrt);
 UNARY(rsqrt);
 UNARY(sin);
 UNARY(cos);
+UNARY(asin);
+UNARY(acos);
 UNARY(log);
 UNARY(exp);
 UNARY(abs);