Merge branch 'develop' into feature/hirep

2025-11-08 15:49:32 +00:00 · 2016-07-07 14:20:10 +01:00
parent ffedeb1c58 3c49ddfaa4
commit fbf96b1bbb
59 changed files with 12880 additions and 3046 deletions
--- a/benchmarks/Benchmark_dwf_ntpf
+++ b/benchmarks/Benchmark_dwf_ntpf
--- a/benchmarks/Benchmark_zmm
+++ b/benchmarks/Benchmark_zmm
--- a/8277
+++ b/8277
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -44,6 +44,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <algorithms/iterative/SchurRedBlack.h>
 #include <algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <algorithms/iterative/ConjugateGradientMixedPrec.h>
 // Lanczos support
 #include <algorithms/iterative/MatrixUtils.h>
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -193,7 +193,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
-    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
+    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    exit(EXIT_SUCCESS);
  }
@@ -234,24 +234,33 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }
  std::string COL_RED    = GridLogColours.colour["RED"];
  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
  std::cout <<std::endl;
-  std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout << "__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|  |  |  "<<             "|  |  | "<<COL_PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
-  std::cout << "__|_                                    _|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__         "<<             "        "<<COL_PURPLE<<"        "<<                "          _|__"<<std::endl; 
-  std::cout << "__|_    GGGG    RRRR    III    DDDD     _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_PURPLE<<"    _|__"<<std::endl;
-  std::cout << "__|_   G        R   R    I     D   D    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_PURPLE<<"    _|__"<<std::endl;
-  std::cout << "__|_   G        R   R    I     D    D   _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_PURPLE<<"    _|__"<<std::endl;
-  std::cout << "__|_   G  GG    RRRR     I     D    D   _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G  GG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_GREEN <<"    _|__"<<std::endl;
-  std::cout << "__|_   G   G    R  R     I     D   D    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G   G   "<<COL_RED<<" R  R   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_GREEN <<"    _|__"<<std::endl;
-  std::cout << "__|_    GGGG    R   R   III    DDDD     _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" R   R  "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_GREEN <<"    _|__"<<std::endl;
-  std::cout << "__|_                                    _|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__         "<<             "        "<<COL_GREEN <<"        "<<                "          _|__"<<std::endl; 
-  std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout << "  |  |  |  |  |  |  |  |  |  |  |  |  |  |  "<<std::endl; 
+  std::cout <<COL_BLUE << "  |  |  |  |  "<<             "|  |  | "<<COL_GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
  std::cout << std::endl;
  std::cout << std::endl;
  std::cout << std::endl;
  std::cout <<COL_YELLOW<< std::endl;
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
@@ -263,6 +272,7 @@ void Grid_init(int *argc,char ***argv)
  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
  std::cout << "GNU General Public License for more details."<<std::endl;
  std::cout << COL_BACKGROUND <<std::endl;
  std::cout << std::endl;
 }
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -1,126 +1,92 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/Log.cc
+Source file: ./lib/Log.cc
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 GridStopWatch Logger::StopWatch;
-std::ostream  Logger::devnull(0);
+std::ostream Logger::devnull(0);
 std::string Logger::BLACK("\033[30m");
 std::string Logger::RED("\033[31m");
 std::string Logger::GREEN("\033[32m");
 std::string Logger::YELLOW("\033[33m");
 std::string Logger::BLUE("\033[34m");
 std::string Logger::PURPLE("\033[35m");
 std::string Logger::CYAN("\033[36m");
 std::string Logger::WHITE("\033[37m");
 std::string Logger::NORMAL("\033[0;39m");
 std::string EMPTY("");
-#if 0  
+Colours GridLogColours(0);
-  GridLogger GridLogError      (1,"Error",Logger::RED);
+GridLogger GridLogError(1, "Error", GridLogColours, "RED");
-  GridLogger GridLogWarning    (1,"Warning",Logger::YELLOW);
+GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
-  GridLogger GridLogMessage    (1,"Message",Logger::BLACK);
+GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
-  GridLogger GridLogDebug      (1,"Debug",Logger::PURPLE);
+GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
-  GridLogger GridLogPerformance(1,"Performance",Logger::GREEN);
+GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
-  GridLogger GridLogIterative  (1,"Iterative",Logger::BLUE);
+GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
-  GridLogger GridLogIntegrator (1,"Integrator",Logger::BLUE);
+GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
 #else
  GridLogger GridLogError      (1,"Error",EMPTY);
  GridLogger GridLogWarning    (1,"Warning",EMPTY);
  GridLogger GridLogMessage    (1,"Message",EMPTY);
  GridLogger GridLogDebug      (1,"Debug",EMPTY);
  GridLogger GridLogPerformance(1,"Performance",EMPTY);
  GridLogger GridLogIterative  (1,"Iterative",EMPTY);
  GridLogger GridLogIntegrator (1,"Integrator",EMPTY);
 #endif
-void GridLogConfigure(std::vector<std::string> &logstreams)
+void GridLogConfigure(std::vector<std::string> &logstreams) {
 {
  GridLogError.Active(0);
  GridLogWarning.Active(0);
-  GridLogMessage.Active(0);
+  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(0);
  GridLogColours.Active(0);
-  int blackAndWhite = 1;
+  for (int i = 0; i < logstreams.size(); i++) {
-  if(blackAndWhite){
+    if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
-    Logger::BLACK = std::string("");
+    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
-    Logger::RED    =Logger::BLACK;
+    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
-    Logger::GREEN  =Logger::BLACK;
+    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
-    Logger::YELLOW =Logger::BLACK;
+    if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
-    Logger::BLUE   =Logger::BLACK;
+    if (logstreams[i] == std::string("Performance"))
-    Logger::PURPLE =Logger::BLACK;
+      GridLogPerformance.Active(1);
-    Logger::CYAN   =Logger::BLACK;
+    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
-    Logger::WHITE  =Logger::BLACK;
+    if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
    Logger::NORMAL =Logger::BLACK;
  }
  for(int i=0;i<logstreams.size();i++){
    if ( logstreams[i]== std::string("Error")       ) GridLogError.Active(1);
    if ( logstreams[i]== std::string("Warning")     ) GridLogWarning.Active(1);
    if ( logstreams[i]== std::string("Message")     ) GridLogMessage.Active(1);
    if ( logstreams[i]== std::string("Iterative")   ) GridLogIterative.Active(1);
    if ( logstreams[i]== std::string("Debug")       ) GridLogDebug.Active(1);
    if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
    if ( logstreams[i]== std::string("Integrator" ) ) GridLogIntegrator.Active(1);
  }
 }
 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
-void Grid_quiesce_nodes(void)
+void Grid_quiesce_nodes(void) {
-{
+  int me = 0;
  int me=0;
 #ifdef GRID_COMMS_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&me);
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
  me = shmem_my_pe();
 #endif
-  if ( me ) { 
+  if (me) {
    std::cout.setstate(std::ios::badbit);
  }
 }
-void Grid_unquiesce_nodes(void)
+void Grid_unquiesce_nodes(void) {
 {
 #ifdef GRID_COMMS_MPI
-    std::cout.clear();
+  std::cout.clear();
 #endif
 }
 }
--- a/lib/Log.h
+++ b/lib/Log.h
@@ -6,9 +6,9 @@
    Copyright (C) 2015
-Author: Antonin Portelli <antonin.portelli@me.com>
+    Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -27,6 +27,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <map>
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
@@ -34,56 +37,99 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <execinfo.h>
 #endif
-namespace Grid {
+    namespace Grid {
 // Dress the output; use std::chrono for time stamping via the StopWatch class
 int Rank(void); // used for early stage debug before library init
 // Palette of terminal colour codes, looked up by name through the public
 // `colour` map. While the palette is inactive every known name maps to the
 // empty string, so streaming a colour becomes a no-op.
 class Colours{
 protected:
  bool is_active;
 public:
  std::map<std::string, std::string> colour;

  // Builds the palette in the requested state (inactive unless told otherwise).
  Colours(bool activate=false){
    Active(activate);
  }

  // Switches the palette on or off and rewrites the nine known entries:
  // escape sequences when active, empty strings when not.
  // Any other key already present in `colour` is left untouched.
  void Active(bool activate){
    // Colour name paired with the escape sequence used when active.
    static const char *const palette[][2] = {
      {"BLACK",  "\033[30m"},
      {"RED",    "\033[31m"},
      {"GREEN",  "\033[32m"},
      {"YELLOW", "\033[33m"},
      {"BLUE",   "\033[34m"},
      {"PURPLE", "\033[35m"},
      {"CYAN",   "\033[36m"},
      {"WHITE",  "\033[37m"},
      {"NORMAL", "\033[0;39m"},
    };

    is_active = activate;
    for (const auto &entry : palette) {
      colour[entry[0]] = is_active ? entry[1] : "";
    }
  }
 };
 class Logger {
 protected:
-    int active;
+  Colours &Painter;
-    std::string name, topName, COLOUR;
+  int active;
-public:
+  std::string name, topName;
-    static GridStopWatch StopWatch;
+  std::string COLOUR;
    static std::ostream devnull;
-    static std::string BLACK;
+public:
-    static std::string RED  ;
+  static GridStopWatch StopWatch;
-    static std::string GREEN;
+  static std::ostream devnull;
-    static std::string YELLOW;
+
-    static std::string BLUE  ;
+  std::string background() {return Painter.colour["NORMAL"];}
-    static std::string PURPLE;
+  std::string evidence() {return Painter.colour["YELLOW"];}
-    static std::string CYAN  ;
+  std::string colour() {return Painter.colour[COLOUR];}
-    static std::string WHITE ;
+
-    static std::string NORMAL;
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
-    
+  : active(on),
- Logger(std::string topNm, int on, std::string nm,std::string col)
+  name(nm),
-   : active(on), name(nm), topName(topNm), COLOUR(col) {};
+  topName(topNm),
-    
+  Painter(col_class),
-    void Active(int on) {active = on;};
+  COLOUR(col){} ;
-    int  isActive(void) {return active;};
+  
-    
+  void Active(int on) {active = on;};
-    friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
+  int  isActive(void) {return active;};
-        if ( log.active ) {
+  
-            StopWatch.Stop();
+  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
-            GridTime now = StopWatch.Elapsed();
+
-            StopWatch.Start();
+    if ( log.active ) {
-            stream << BLACK <<std::setw(8) << std::left << log.topName << BLACK<< " : ";
+      StopWatch.Stop();
-            stream << log.COLOUR <<std::setw(11)  << log.name << BLACK << " : ";
+      GridTime now = StopWatch.Elapsed();
-            stream << YELLOW <<std::setw(6) << now <<BLACK << " : " ;
+      StopWatch.Start();
-            stream << log.COLOUR;
+      stream << log.background()<< log.topName << log.background()<< " : ";
-            return stream;
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
-        } else { 
+      stream << log.evidence()<< now << log.background() << " : " << log.colour();
-            return devnull;
+      return stream;
-        }
+    } else { 
      return devnull;
    }
-    
+  }
 };
-    
+
 class GridLogger: public Logger {
 public:
- GridLogger(int on, std::string nm, std::string col = Logger::BLACK): Logger("Grid", on, nm, col){};
+  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
  Logger("Grid", on, nm, col_class, col_key){};
 };
 void GridLogConfigure(std::vector<std::string> &logstreams);
@@ -95,38 +141,40 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern Colours    GridLogColours;
 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
 #define BACKTRACEFILE() {\
-    char string[20];					\
+char string[20];					\
-    std::sprintf(string,"backtrace.%d",Rank());				\
+std::sprintf(string,"backtrace.%d",Rank());				\
-    std::FILE * fp = std::fopen(string,"w");				\
+std::FILE * fp = std::fopen(string,"w");				\
-    BACKTRACEFP(fp)\
+BACKTRACEFP(fp)\
-    std::fclose(fp);	    \
+std::fclose(fp);	    \
 }
 #ifdef HAVE_EXECINFO_H
 #define BACKTRACEFP(fp) { \
-  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
+int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
-  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
-  for (int i = 0; i < symbols; i++){\
+for (int i = 0; i < symbols; i++){\
-    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
-  }\
+}\
 }
 #else 
 #define BACKTRACEFP(fp) { \
-    std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 }
 #endif
 #define BACKTRACE() BACKTRACEFP(stdout) 
 }
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/Simd.h
+Source file: ./lib/Simd.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H
@@ -118,6 +119,14 @@ namespace Grid {
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
  // define projections to real and imaginary parts
  // projReal: builds a value from std::real(r) with 0.0 as the second constructor argument.
  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
  // projImag: passes std::imag(r) as the FIRST constructor argument and 0.0 as the second,
  // i.e. the imaginary component comes back in the same slot projReal uses.
  // NOTE(review): presumably that is the real slot (assuming ComplexF/ComplexD are
  // std::complex-like) - confirm this is the intended convention.
  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -40,9 +40,10 @@ namespace Grid {
  template<class Field> 
    class ConjugateGradient : public OperatorFunction<Field> {
 public:                                                
    bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
    RealD   Tolerance;
    Integer MaxIterations;
-    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+  ConjugateGradient(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){ 
    };
@@ -137,13 +138,15 @@ public:
 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 	  std::cout<<std::endl;
-	  assert(true_residual/Tolerance < 1000.0);
+	  if(ErrorOnNoConverge)
 	    assert(true_residual/Tolerance < 1000.0);
 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
-      assert(0);
+      if(ErrorOnNoConverge)	
 	assert(0);
    }
  };
 }
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -0,0 +1,142 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 namespace Grid {
  // Mixed precision restarted defect correction CG.
  //
  // Each outer iteration applies the double-precision operator to the current
  // solution to form a new right-hand side (the residual), converts it to single
  // precision, runs an inner single-precision CG on it, converts the result back
  // and adds it to the double-precision solution. Once the residual norm drops
  // below OuterLoopNormMult * stop (or the outer iteration budget is exhausted)
  // a final double-precision CG is run on the original source.
  //
  // FieldD must report precision 2 and FieldF precision 1 via getPrecision<>.
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:                                                
    RealD   Tolerance;
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    //Not owned by this object; null means "no guesser".
    LinearFunction<FieldF> *guesser;

    // tol        : target tolerance; the stopping value is norm2(src) * tol * tol
    // maxinnerit : iteration cap for each inner single-precision CG and for the final double-precision CG
    // maxouterit : cap on the number of outer (restart) iterations
    // _sp_grid   : grid on which the single-precision work fields are allocated
    // _Linop_f / _Linop_d : single- and double-precision operators (held by reference)
    //
    // Initialisers are listed in member declaration order, which is the order the
    // compiler uses regardless of how they are written here.
    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.),
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      guesser(nullptr){ };

    // Registers a guesser for the inner solves. Only the address of g is stored,
    // so g must outlive every subsequent call to operator().
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g; // was `guesser = g;` - a reference cannot be assigned to the pointer member
    }
  
    // Solves for sol_d given the source src_d_in. sol_d is read as the starting
    // guess (the operator is applied to it in the first outer iteration) and is
    // overwritten with the result.
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
      GridStopWatch TotalTimer;
      TotalTimer.Start();
    
      int cb = src_d_in.checkerboard;
      sol_d.checkerboard = cb;
    
      RealD src_norm = norm2(src_d_in);
      RealD stop = src_norm * Tolerance*Tolerance;

      GridBase* DoublePrecGrid = src_d_in._grid;
      FieldD tmp_d(DoublePrecGrid);
      tmp_d.checkerboard = cb;
    
      FieldD tmp2_d(DoublePrecGrid);
      tmp2_d.checkerboard = cb;
    
      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
    
      RealD inner_tol = Tolerance;
    
      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;
    
      FieldF sol_f(SinglePrecGrid);
      sol_f.checkerboard = cb;
    
      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
      CG_f.ErrorOnNoConverge = false; // an unconverged inner solve is tolerated; the outer loop restarts it

      GridStopWatch InnerCGtimer;

      GridStopWatch PrecChangeTimer;
    
      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d, tmp_d);
        // presumably src_d = src_d_in - tmp_d with `norm` its squared norm - confirm against axpy_norm
        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
      
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

        if(norm < OuterLoopNormMult * stop){
          std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
          break;
        }
        // Loosen the inner tolerance (doubling) until norm * inner_tol^2 is no longer below stop.
        while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

        PrecChangeTimer.Start();
        precisionChange(src_f, src_d);
        PrecChangeTimer.Stop();
      
        zeroit(sol_f);

        //Optionally improve inner solver guess (eg using known eigenvectors)
        if(guesser != nullptr)
          (*guesser)(src_f, sol_f);

        //Inner CG
        CG_f.Tolerance = inner_tol;
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f, sol_f);
        InnerCGtimer.Stop();
      
        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f);
        PrecChangeTimer.Stop();
      
        axpy(sol_d, 1.0, tmp_d, sol_d);
      }
    
      //Final trial CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
    
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
      CG_d(Linop_d, src_d_in, sol_d);

      TotalTimer.Stop();
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
 }
 #endif
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -1,73 +1,74 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/lattice/Lattice_ET.h
+Source file: ./lib/lattice/Lattice_ET.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H
 #include <iostream>
 #include <vector>
 #include <tuple>
 #include <typeinfo>
 #include <vector>
 namespace Grid {
-  ////////////////////////////////////////////////////
+////////////////////////////////////////////////////
-  // Predicated where support
+// Predicated where support
-  ////////////////////////////////////////////////////
+////////////////////////////////////////////////////
-  template<class iobj,class vobj,class robj>
+template <class iobj, class vobj, class robj>
-    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
+inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
                            const robj &iffalse) {
  typename std::remove_const<vobj>::type ret;
-    typename std::remove_const<vobj>::type ret;
+  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
-    typedef typename vobj::scalar_object scalar_object;
+  const int Nsimd = vobj::vector_type::Nsimd();
-    typedef typename vobj::scalar_type scalar_type;
+  const int words = sizeof(vobj) / sizeof(vector_type);
    typedef typename vobj::vector_type vector_type;
-    const int Nsimd = vobj::vector_type::Nsimd();
+  std::vector<Integer> mask(Nsimd);
-    const int words = sizeof(vobj)/sizeof(vector_type);
+  std::vector<scalar_object> truevals(Nsimd);
  std::vector<scalar_object> falsevals(Nsimd);
-    std::vector<Integer> mask(Nsimd);
+  extract(iftrue, truevals);
-    std::vector<scalar_object> truevals (Nsimd);
+  extract(iffalse, falsevals);
-    std::vector<scalar_object> falsevals(Nsimd);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);
-    extract(iftrue   ,truevals);
+  for (int s = 0; s < Nsimd; s++) {
-    extract(iffalse  ,falsevals);
+    if (mask[s]) falsevals[s] = truevals[s];
    extract<vInteger,Integer>(TensorRemove(predicate),mask);
    for(int s=0;s<Nsimd;s++){
      if (mask[s]) falsevals[s]=truevals[s];
    }
    merge(ret,falsevals);
    return ret;
  }
  merge(ret, falsevals);
  return ret;
 }
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@@ -75,303 +76,351 @@ namespace Grid {
 // from tuple is hideous; C++14 introduces std::make_index_sequence for this
 ////////////////////////////////////////////
 // leaf eval of lattice ; should enable if protect using traits
-//leaf eval of lattice ; should enable if protect using traits
+template <typename T>
 using is_lattice = std::is_base_of<LatticeBase, T>;
-template <typename T> using is_lattice      = std::is_base_of<LatticeBase,T >;
+template <typename T>
 using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 // Specialization of getVectorType for lattices: for a Lattice<T> the nested
 // `type` is Lattice<T>::vector_object.
 template<typename T>
 struct getVectorType<Lattice<T> >{
  typedef typename Lattice<T>::vector_object type;
 };
 // Leaf case of eval() for an operand that is not a Lattice: the site index
 // `ss` is ignored and the argument itself is returned (by value).
 template<class sobj>
 inline sobj eval(const unsigned int ss, const sobj &arg)
 {
  return arg;
 }
-template<class lobj>
+template <class lobj>
-inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg)
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
-{
+  return arg._odata[ss];
    return arg._odata[ss];
 }
 // handle nodes in syntax tree
 template <typename Op, typename T1>
-auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand
+auto inline eval(
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second))))
+    const unsigned int ss,
-{
+    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
-  return expr.first.func(eval(ss,std::get<0>(expr.second)));
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)));
 }
 template <typename Op, typename T1, typename T2>
-auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands
+auto inline eval(
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))))
+    const unsigned int ss,
-{
+    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)));
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
                                eval(ss, std::get<1>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands
+auto inline eval(const unsigned int ss,
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second))))
+                 const LatticeTrinaryExpression<Op, T1, T2, T3>
-{
+                     &expr)  // eval three operands
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) );
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
                                eval(ss, std::get<1>(expr.second)),
                                eval(ss, std::get<2>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)),
                         eval(ss, std::get<2>(expr.second)));
 }
 //////////////////////////////////////////////////////////////////////////
-// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the grid from an expression, ensuring conformable. This must follow a
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
+template <class T1,
-inline void GridFromExpression(GridBase * &grid,const T1& lat)   // Lattice leaf
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
-{
+inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
  if ( grid ) {
    conformable(grid,lat._grid);
  } 
  grid=lat._grid;
 }
 template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 inline void GridFromExpression(GridBase * &grid,const T1& notlat)   // non-lattice leaf
 {
  if (grid) {
    conformable(grid, lat._grid);
  }
  grid = lat._grid;
 }
 // Overload selected (via enable_if) when T1 is not derived from LatticeBase.
 // The body is empty, so `grid` is left exactly as the caller passed it and
 // `notlat` is never inspected.
 template <class T1,
          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void GridFromExpression(GridBase *&grid,
                               const T1 &notlat)  // non-lattice leaf
 {}
 template <typename Op, typename T1>
-inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr)
+inline void GridFromExpression(GridBase *&grid,
-{
+                               const LatticeUnaryExpression<Op, T1> &expr) {
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse 
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
 }
 template <typename Op, typename T1, typename T2>
-inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr) 
+inline void GridFromExpression(
-{
+    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<1>(expr.second));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
+inline void GridFromExpression(
-{
+    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<1>(expr.second));
-  GridFromExpression(grid,std::get<2>(expr.second));
+  GridFromExpression(grid, std::get<2>(expr.second));
 }
 //////////////////////////////////////////////////////////////////////////
-// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the CB from an expression, ensuring conformable. This must follow a
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
+template <class T1,
-inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
-  if ( (cb==Odd) || (cb==Even) ) {
+  if ((cb == Odd) || (cb == Even)) {
-    assert(cb==lat.checkerboard);
+    assert(cb == lat.checkerboard);
-  } 
+  }
-  cb=lat.checkerboard;
+  cb = lat.checkerboard;
  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
-template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
+template <class T1,
-inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
 {
  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
-inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
+inline void CBFromExpression(int &cb,
-{
+                             const LatticeUnaryExpression<Op, T1> &expr) {
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2>
-inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
+inline void CBFromExpression(int &cb,
-{
+                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<1>(expr.second));
  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
+inline void CBFromExpression(
-{
+    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<1>(expr.second));
-  CBFromExpression(cb,std::get<2>(expr.second));
+  CBFromExpression(cb, std::get<2>(expr.second));
  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
-#define GridUnopClass(name,ret)\
+#define GridUnopClass(name, ret)                                          \
-template <class arg> struct name\
+  template <class arg>                                                    \
-{\
+  struct name {                                                           \
-  static auto inline func(const arg a)-> decltype(ret) { return ret; } \
+    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
-};
+  };
-GridUnopClass(UnarySub,-a);
+GridUnopClass(UnarySub, -a);
-GridUnopClass(UnaryNot,Not(a));
+GridUnopClass(UnaryNot, Not(a));
-GridUnopClass(UnaryAdj,adj(a));
+GridUnopClass(UnaryAdj, adj(a));
-GridUnopClass(UnaryConj,conjugate(a));
+GridUnopClass(UnaryConj, conjugate(a));
-GridUnopClass(UnaryTrace,trace(a));
+GridUnopClass(UnaryTrace, trace(a));
-GridUnopClass(UnaryTranspose,transpose(a));
+GridUnopClass(UnaryTranspose, transpose(a));
-GridUnopClass(UnaryTa,Ta(a));
+GridUnopClass(UnaryTa, Ta(a));
-GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
-GridUnopClass(UnaryReal,real(a));
+GridUnopClass(UnaryReal, real(a));
-GridUnopClass(UnaryImag,imag(a));
+GridUnopClass(UnaryImag, imag(a));
-GridUnopClass(UnaryToReal,toReal(a));
+GridUnopClass(UnaryToReal, toReal(a));
-GridUnopClass(UnaryToComplex,toComplex(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
-GridUnopClass(UnaryAbs,abs(a));
+GridUnopClass(UnaryTimesI, timesI(a));
-GridUnopClass(UnarySqrt,sqrt(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
-GridUnopClass(UnaryRsqrt,rsqrt(a));
+GridUnopClass(UnaryAbs, abs(a));
-GridUnopClass(UnarySin,sin(a));
+GridUnopClass(UnarySqrt, sqrt(a));
-GridUnopClass(UnaryCos,cos(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
-GridUnopClass(UnaryLog,log(a));
+GridUnopClass(UnarySin, sin(a));
-GridUnopClass(UnaryExp,exp(a));
+GridUnopClass(UnaryCos, cos(a));
 GridUnopClass(UnaryAsin, asin(a));
 GridUnopClass(UnaryAcos, acos(a));
 GridUnopClass(UnaryLog, log(a));
 GridUnopClass(UnaryExp, exp(a));
 ////////////////////////////////////////////
 // Binary operators
 ////////////////////////////////////////////
-#define GridBinOpClass(name,combination)\
+#define GridBinOpClass(name, combination)                      \
-template <class left,class right>\
+  template <class left, class right>                           \
-struct name\
+  struct name {                                                \
-{\
+    static auto inline func(const left &lhs, const right &rhs) \
-  static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \
+        -> decltype(combination) const {                       \
-    {\
+      return combination;                                      \
-      return combination;\
+    }                                                          \
-    }\
+  }
-}
+GridBinOpClass(BinaryAdd, lhs + rhs);
-GridBinOpClass(BinaryAdd,lhs+rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
-GridBinOpClass(BinarySub,lhs-rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);
 GridBinOpClass(BinaryMul,lhs*rhs);
-GridBinOpClass(BinaryAnd   ,lhs&rhs);
+GridBinOpClass(BinaryAnd, lhs &rhs);
-GridBinOpClass(BinaryOr    ,lhs|rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
-GridBinOpClass(BinaryAndAnd,lhs&&rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
-GridBinOpClass(BinaryOrOr  ,lhs||rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);
 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
-#define GridTrinOpClass(name,combination)\
+#define GridTrinOpClass(name, combination)                                     \
-template <class predicate,class left, class right>	\
+  template <class predicate, class left, class right>                          \
-struct name\
+  struct name {                                                                \
-{\
+    static auto inline func(const predicate &pred, const left &lhs,            \
-  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
+                            const right &rhs) -> decltype(combination) const { \
-    {\
+      return combination;                                                      \
-      return combination;\
+    }                                                                          \
-    }\
+  }
 }
-GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
+GridTrinOpClass(
-			       typename std::remove_reference<left>::type, \
+    TrinaryWhere,
-			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
+    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
                     typename std::remove_reference<right>::type>(pred, lhs,
                                                                  rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
 // Helper macros that instantiate the operator functor template `name` with
 // the types produced by eval(0, ...) on the operands. They rely on variables
 // called `arg` (unary), `lhs`/`rhs` (binary) and `pred`/`lhs`/`rhs` (trinary)
 // being in scope at the point of expansion.
 #define GRID_UNOP(name)   name<decltype(eval(0, arg))>
 #define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_DEF_UNOP(op, name)\
+#define GRID_UNOP(name) name<decltype(eval(0, arg))>
-template <typename T1,\
+#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-  typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \
+#define GRID_TRINOP(name) \
-  -> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \
+  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 { return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); }
-#define GRID_BINOP_LEFT(op, name)\
+#define GRID_DEF_UNOP(op, name)                                             \
-template <typename T1,typename T2,\
+  template <typename T1,                                                    \
-          typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\
+            typename std::enable_if<is_lattice<T1>::value ||                \
-inline auto op(const T1 &lhs,const T2&rhs) \
+                                        is_lattice_expr<T1>::value,         \
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+                                    T1>::type * = nullptr>                  \
-											    std::forward_as_tuple(lhs, rhs)))) \
+  inline auto op(const T1 &arg)                                             \
-{\
+      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
-									  std::forward_as_tuple(lhs, rhs))); \
+    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
-}
+        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
  }
-#define GRID_BINOP_RIGHT(op, name)\
+#define GRID_BINOP_LEFT(op, name)                                             \
- template <typename T1,typename T2,\
+  template <typename T1, typename T2,                                         \
-           typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\
+            typename std::enable_if<is_lattice<T1>::value ||                  \
-           typename std::enable_if< is_lattice<T2>::value ||  is_lattice_expr<T2>::value, T2>::type* = nullptr> \
+                                        is_lattice_expr<T1>::value,           \
-inline auto op(const T1 &lhs,const T2&rhs)			\
+                                    T1>::type * = nullptr>                    \
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
-											    std::forward_as_tuple(lhs, rhs)))) \
+      ->decltype(                                                             \
-{\
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
+              std::make_pair(GRID_BINOP(name)(),                              \
-								          std::forward_as_tuple(lhs, rhs))); \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
-}
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
-#define GRID_DEF_BINOP(op, name)\
+#define GRID_BINOP_RIGHT(op, name)                                            \
- GRID_BINOP_LEFT(op,name);\
+  template <typename T1, typename T2,                                         \
- GRID_BINOP_RIGHT(op,name);
+            typename std::enable_if<!is_lattice<T1>::value &&                 \
                                        !is_lattice_expr<T1>::value,          \
                                    T1>::type * = nullptr,                    \
            typename std::enable_if<is_lattice<T2>::value ||                  \
                                        is_lattice_expr<T2>::value,           \
                                    T2>::type * = nullptr>                    \
  inline auto op(const T1 &lhs, const T2 &rhs)                                \
      ->decltype(                                                             \
          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
              std::make_pair(GRID_BINOP(name)(),                              \
                             std::forward_as_tuple(lhs, rhs)))) {             \
    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
 // Defines binary operator/function `op` by expanding both overload macros:
 // GRID_BINOP_LEFT (first operand is a lattice or lattice expression) and
 // GRID_BINOP_RIGHT (first operand is neither, second operand is one).
 // Note the expansion already ends in ';', so a ';' at the use site adds an
 // extra empty declaration.
 #define GRID_DEF_BINOP(op, name) \
  GRID_BINOP_LEFT(op, name);     \
  GRID_BINOP_RIGHT(op, name);
-#define GRID_DEF_TRINOP(op, name)\
+#define GRID_DEF_TRINOP(op, name)                                              \
-template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \
+  template <typename T1, typename T2, typename T3>                             \
-  -> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
-										   std::forward_as_tuple(pred,lhs,rhs)))) \
+      ->decltype(                                                              \
-{\
+          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
-  return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \
+                                   const T3 &>(std::make_pair(                 \
-										 std::forward_as_tuple(pred,lhs, rhs))); \
+              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
-}
+    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
                                    const T3 &>(std::make_pair(                \
        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
  }
 ////////////////////////
-//Operator definitions
+// Operator definitions
 ////////////////////////
-GRID_DEF_UNOP(operator -,UnarySub);
+GRID_DEF_UNOP(operator-, UnarySub);
-GRID_DEF_UNOP(Not,UnaryNot);
+GRID_DEF_UNOP(Not, UnaryNot);
-GRID_DEF_UNOP(operator !,UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
-GRID_DEF_UNOP(adj,UnaryAdj);
+GRID_DEF_UNOP(adj, UnaryAdj);
-GRID_DEF_UNOP(conjugate,UnaryConj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
-GRID_DEF_UNOP(trace,UnaryTrace);
+GRID_DEF_UNOP(trace, UnaryTrace);
-GRID_DEF_UNOP(transpose,UnaryTranspose);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
-GRID_DEF_UNOP(Ta,UnaryTa);
+GRID_DEF_UNOP(Ta, UnaryTa);
-GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
-GRID_DEF_UNOP(real,UnaryReal);
+GRID_DEF_UNOP(real, UnaryReal);
-GRID_DEF_UNOP(imag,UnaryImag);
+GRID_DEF_UNOP(imag, UnaryImag);
-GRID_DEF_UNOP(toReal,UnaryToReal);
+GRID_DEF_UNOP(toReal, UnaryToReal);
-GRID_DEF_UNOP(toComplex,UnaryToComplex);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
-GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(timesI, UnaryTimesI);
-GRID_DEF_UNOP(sqrt ,UnarySqrt);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
-GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
-GRID_DEF_UNOP(sin  ,UnarySin);
+                               // abs-fabs-dabs-labs thing
-GRID_DEF_UNOP(cos  ,UnaryCos);
+GRID_DEF_UNOP(sqrt, UnarySqrt);
-GRID_DEF_UNOP(log  ,UnaryLog);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
-GRID_DEF_UNOP(exp  ,UnaryExp);
+GRID_DEF_UNOP(sin, UnarySin);
 GRID_DEF_UNOP(cos, UnaryCos);
 GRID_DEF_UNOP(asin, UnaryAsin);
 GRID_DEF_UNOP(acos, UnaryAcos);
 GRID_DEF_UNOP(log, UnaryLog);
 GRID_DEF_UNOP(exp, UnaryExp);
-GRID_DEF_BINOP(operator+,BinaryAdd);
+GRID_DEF_BINOP(operator+, BinaryAdd);
-GRID_DEF_BINOP(operator-,BinarySub);
+GRID_DEF_BINOP(operator-, BinarySub);
-GRID_DEF_BINOP(operator*,BinaryMul);
+GRID_DEF_BINOP(operator*, BinaryMul);
-GRID_DEF_BINOP(operator&,BinaryAnd);
+GRID_DEF_BINOP(operator&, BinaryAnd);
-GRID_DEF_BINOP(operator|,BinaryOr);
+GRID_DEF_BINOP(operator|, BinaryOr);
-GRID_DEF_BINOP(operator&&,BinaryAndAnd);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
-GRID_DEF_BINOP(operator||,BinaryOrOr);
+GRID_DEF_BINOP(operator||, BinaryOrOr);
-GRID_DEF_TRINOP(where,TrinaryWhere);
+GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
-template<class Op,class T1>
+template <class Op, class T1>
-  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
-{
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
+      expr);
  return ret;
 }
-template<class Op,class T1, class T2>
+template <class Op, class T1, class T2>
-  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second))))>
+                                        eval(0, std::get<1>(expr.second))))> {
-{
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second))))>
-				   eval(0,std::get<1>(expr.second))))> ret(expr);
+      ret(expr);
  return ret;
 }
-template<class Op,class T1, class T2, class T3>
+template <class Op, class T1, class T2, class T3>
-  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second)),
+                                        eval(0, std::get<1>(expr.second)),
-				      eval(0,std::get<2>(expr.second))))>
+                                        eval(0, std::get<2>(expr.second))))> {
-{
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second)),
-				   eval(0,std::get<1>(expr.second)),
+                                   eval(0, std::get<2>(expr.second))))>
-				   eval(0,std::get<2>(expr.second))))> ret(expr);
+      ret(expr);
  return ret;
 }
@@ -382,12 +431,11 @@ template<class Op,class T1, class T2, class T3>
 #undef GRID_DEF_UNOP
 #undef GRID_DEF_BINOP
 #undef GRID_DEF_TRINOP
 }
 #if 0
 using namespace Grid;
- 	      
+        
 int main(int argc,char **argv){
   Lattice<double> v1(16);
@@ -397,7 +445,7 @@ using namespace Grid;
   BinaryAdd<double,double> tmp;
   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
     expr(std::make_pair(tmp,
-	  std::forward_as_tuple(v1,v2)));
+    std::forward_as_tuple(v1,v2)));
   tmp.func(eval(0,v1),eval(0,v2));
   auto var = v1+v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/lattice/Lattice_base.h
+Source file: ./lib/lattice/Lattice_base.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H
@@ -255,6 +256,18 @@ PARALLEL_FOR_LOOP
        checkerboard=0;
    }
    // Copy constructor: adopts r's grid pointer and checkerboard tag, then
    // duplicates r's site data one outer site at a time.
    // NOTE(review): PARALLEL_FOR_LOOP is a project macro defined elsewhere;
    // presumably it parallelises the for loop that follows it - confirm.
    Lattice(const Lattice& r){ // copy constructor
    	_grid = r._grid;
    	checkerboard = r.checkerboard;
    	_odata.resize(_grid->oSites());// essential: _odata must hold oSites() entries before the indexed copy below
  		PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            _odata[ss]=r._odata[ss];
        }  	
    }
    virtual ~Lattice(void) = default;
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
@@ -267,7 +280,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
+      
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -40,7 +40,7 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
    ComplexD nrm = innerProduct(arg,arg);
-    return real(nrm); 
+    return std::real(nrm); 
  }
    template<class vobj>
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -482,6 +482,96 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 }
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
 //
 // out : resized here to in._grid->lSites(); entry [lex] receives the site whose
 //       index Lexicographic::IndexFromCoor() computes from the site coordinate
 //       and in._grid->_ldimensions.
 // in  : source lattice. The enable_if return type removes this overload unless
 //       vobj is SIMD-vectorized and sobj is not.
 template<typename vobj, typename sobj>
 typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
  typedef typename vobj::vector_type vtype;
  GridBase* in_grid = in._grid;
  out.resize(in_grid->lSites());
  int ndim = in_grid->Nd();
  int in_nsimd = vtype::Nsimd();
  // Cache, once per SIMD lane, the inner coordinate reported by iCoorFromIindex
  // so the site loop below does not have to recompute it.
  std::vector<std::vector<int> > in_icoor(in_nsimd);
  for(int lane=0; lane < in_nsimd; lane++){
    in_icoor[lane].resize(ndim);
    in_grid->iCoorFromIindex(in_icoor[lane], lane);
  }
 PARALLEL_FOR_LOOP
  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> out_ptrs(in_nsimd);
    std::vector<int> in_ocoor(ndim);
    in_grid->oCoorFromOindex(in_ocoor, in_oidx);
    std::vector<int> lcoor(in_grid->Nd());
    for(int lane=0; lane < in_nsimd; lane++){
      // Coordinate handled by this lane: the outer coordinate plus the lane's
      // inner coordinate scaled by _rdimensions, direction by direction.
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
      int lex;
      Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
      out_ptrs[lane] = &out[lex];
    }
    //Unpack into those ptrs
    // NOTE(review): extract1 is defined elsewhere; presumably it writes lane i of
    // in_vobj to *out_ptrs[i] (converting to sobj) - confirm, including the
    // meaning of the trailing 0 argument.
    const vobj & in_vobj = in._odata[in_oidx];
    extract1(in_vobj, out_ptrs, 0);
  }
 }
 //Convert a Lattice from one precision to another
 //
 // Asserts that out and in live on grids with the same number of dimensions and
 // copies in's checkerboard tag to out. The conversion then runs in two passes:
 //   1. unvectorizeToLexOrdArray() unpacks `in` into in_slex_conv, an array
 //      indexed lexicographically whose elements already have the OUTPUT scalar
 //      type SobjOut.
 //   2. each outer site of out is assembled by merge() from pointers into that
 //      array, one pointer per SIMD lane of the output grid.
 // NOTE(review): in_slex_conv holds in_grid->lSites() entries but is indexed with
 // values computed from out_grid->_ldimensions; this appears to assume both grids
 // share the same local dimensions, which the single assert does not check -
 // confirm against callers.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  assert(out._grid->Nd() == in._grid->Nd());
  out.checkerboard = in.checkerboard;
  GridBase *in_grid=in._grid;
  GridBase *out_grid = out._grid;
  typedef typename VobjOut::scalar_object SobjOut;
  // SobjIn is declared but not referenced anywhere else in this function.
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out._grid->Nd();
  int out_nsimd = out_grid->Nsimd();
  // Cache the inner coordinate of every output SIMD lane.
  std::vector<std::vector<int> > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  // Pass 1: scalar copy of `in`, stored with the output scalar type.
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  // Pass 2: rebuild each output outer site from the scalar array.
  PARALLEL_FOR_LOOP
  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    std::vector<SobjOut*> ptrs(out_nsimd);      
    std::vector<int> lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      // Coordinate for this lane: outer coordinate plus the lane's inner
      // coordinate scaled by the output grid's _rdimensions.
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
    // NOTE(review): merge is defined elsewhere; presumably it packs *ptrs[lane]
    // into SIMD lane `lane` of the output site - confirm, including the
    // meaning of the trailing 0 argument.
    merge(out._odata[out_oidx], ptrs, 0);
  }
 }
 }
 #endif
--- a/lib/pugixml/.dirstamp
+++ b/lib/pugixml/.dirstamp
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -495,5 +495,6 @@ namespace QCD {
 #include <qcd/hmc/integrators/Integrator_algorithm.h>
 #include <qcd/hmc/HMC.h>
 #include <qcd/smearing/Smearing.h>
 #endif
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@@ -35,6 +35,7 @@ template<class GaugeField>
 class Action { 
 public:
  bool is_smeared = false;
  // Boundary conditions? // Heatbath?
  virtual void  refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -34,78 +34,75 @@ directory
 namespace Grid {
-namespace QCD {
+  namespace QCD {
 //////////////////////////////////////////////
 // Template parameter class constructs to package
 // externally control Fermion implementations
 // in orthogonal directions
 //
 // Ultimately need Impl to always define types where XXX is opaque
 //
 //    typedef typename XXX               Simd;
 //    typedef typename XXX     GaugeLinkField;
 //    typedef typename XXX         GaugeField;
 //    typedef typename XXX      GaugeActField;
 //    typedef typename XXX       FermionField;
 //    typedef typename XXX  DoubledGaugeField;
 //    typedef typename XXX         SiteSpinor;
 //    typedef typename XXX     SiteHalfSpinor;
 //    typedef typename XXX         Compressor;
 //
 // and Methods:
 //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const
 //    GaugeField &Umu)
 //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const
 //    GaugeField &Umu)
 //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const
 //    SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
 //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const
 //    FermionField &A,int mu)
 //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const
 //    FermionField &A,int mu)
 //
 //
 // To acquire the typedefs from "Base" (either a base class or template param)
 // use:
 //
 // INHERIT_GIMPL_TYPES(Base)
 // INHERIT_FIMPL_TYPES(Base)
 // INHERIT_IMPL_TYPES(Base)
 //
 // The Fermion operators will do the following:
 //
 // struct MyOpParams {
 //   RealD mass;
 // };
 //
 //
 // template<class Impl>
 // class MyOp : public Impl {
 // public:
 //
 //    INHERIT_ALL_IMPL_TYPES(Impl);
 //
 //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
 //    {
 //
 //    };
 //
 //  }
 //////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////
-// Implementation dependent fermion types
+    // Template parameter class constructs to package
-////////////////////////////////////////////////////////////////////////
+    // externally control Fermion implementations
    // in orthogonal directions
    //
    // Ultimately need Impl to always define types where XXX is opaque
    //
    //    typedef typename XXX               Simd;
    //    typedef typename XXX     GaugeLinkField;	
    //    typedef typename XXX         GaugeField;
    //    typedef typename XXX      GaugeActField;
    //    typedef typename XXX       FermionField;
    //    typedef typename XXX  DoubledGaugeField;
    //    typedef typename XXX         SiteSpinor;
    //    typedef typename XXX     SiteHalfSpinor;	
    //    typedef typename XXX         Compressor;	
    //
    // and Methods:
    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
    //
    //
    // To acquire the typedefs from "Base" (either a base class or template param) use:
    //
    // INHERIT_GIMPL_TYPES(Base)
    // INHERIT_FIMPL_TYPES(Base)
    // INHERIT_IMPL_TYPES(Base)
    //
    // The Fermion operators will do the following:
    //
    // struct MyOpParams { 
    //   RealD mass;
    // };
    //
    //
    // template<class Impl>
    // class MyOp : public Impl {
    // public:
    //
    //    INHERIT_ALL_IMPL_TYPES(Impl);
    //
    //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
    //    {
    //
    //    };
    //    
    //  }
    //////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    // Implementation dependent fermion types
    ////////////////////////////////////////////////////////////////////////
 #define INHERIT_FIMPL_TYPES(Impl)\
    typedef typename Impl::FermionField           FermionField;		\
    typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
    typedef typename Impl::SiteSpinor               SiteSpinor;		\
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
    typedef typename Impl::Compressor               Compressor;		\
    typedef typename Impl::StencilImpl             StencilImpl;	  \
    typedef typename Impl::ImplParams ImplParams;
 #define INHERIT_FIMPL_TYPES(Impl)                             \
  typedef typename Impl::FermionField FermionField;           \
  typedef typename Impl::DoubledGaugeField DoubledGaugeField; \
  typedef typename Impl::SiteSpinor SiteSpinor;               \
  typedef typename Impl::SiteHalfSpinor SiteHalfSpinor;       \
  typedef typename Impl::Compressor Compressor;               \
  typedef typename Impl::StencilImpl StencilImpl;             \
  typedef typename Impl::ImplParams ImplParams;
 #define INHERIT_IMPL_TYPES(Base) \
  INHERIT_GIMPL_TYPES(Base)      \
@@ -148,17 +145,22 @@ class WilsonImpl
  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
-  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
+  inline void multLink(SiteHalfSpinor &phi,
-                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+		       const SiteDoubledGaugeField &U,
                       const SiteHalfSpinor &chi,
 		       int mu,
 		       StencilEntry *SE,
                       StencilImpl &St) {
    mult(&phi(), &U(mu), &chi());
  }
  template <class ref>
-  inline void loadLinkElement(Simd &reg, ref &memory) {
+  inline void loadLinkElement(Simd &reg,
 			      ref &memory) {
    reg = memory;
  }
-  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
+  inline void DoubleStore(GridBase *GaugeGrid,
 			  DoubledGaugeField &Uds,
                          const GaugeField &Umu) {
    conformable(Uds._grid, GaugeGrid);
    conformable(Umu._grid, GaugeGrid);
@@ -171,15 +173,19 @@ class WilsonImpl
    }
  }
-  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
+  inline void InsertForce4D(GaugeField &mat,
-                            FermionField &A, int mu) {
+			    FermionField &Btilde,
                            FermionField &A,
 			    int mu) {
    GaugeLinkField link(mat._grid);
    link = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
    PokeIndex<LorentzIndex>(mat, link, mu);
  }
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
+  inline void InsertForce5D(GaugeField &mat,
-                            FermionField &Atilde, int mu) {
+			    FermionField &Btilde,
                            FermionField &Atilde,
 			    int mu) {
    int Ls = Btilde._grid->_fdimensions[0];
    GaugeLinkField tmp(mat._grid);
--- a/lib/qcd/action/gauge/GaugeImpl.h
+++ b/lib/qcd/action/gauge/GaugeImpl.h
@@ -1,181 +1,188 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/gauge/GaugeImpl.h
+Source file: ./lib/qcd/action/gauge/GaugeImpl.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
-#ifndef  GRID_QCD_GAUGE_IMPL_H
+/*  END LEGAL */
-#define  GRID_QCD_GAUGE_IMPL_H
+#ifndef GRID_QCD_GAUGE_IMPL_H
 #define GRID_QCD_GAUGE_IMPL_H
 namespace Grid {
-  namespace QCD {
+namespace QCD {
-    
+////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////
+// Implementation dependent gauge types
-    // Implementation dependent gauge types
+////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////
-template<class Gimpl> class WilsonLoops;
+template <class Gimpl> class WilsonLoops;
-#define INHERIT_GIMPL_TYPES(GImpl) \
+#define INHERIT_GIMPL_TYPES(GImpl)                                             \
-    typedef typename GImpl::Simd                           Simd;\
+  typedef typename GImpl::Simd Simd;                                           \
-    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
+  typedef typename GImpl::GaugeLinkField GaugeLinkField;                       \
-    typedef typename GImpl::GaugeField               GaugeField;\
+  typedef typename GImpl::GaugeField GaugeField;                               \
-    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
+  typedef typename GImpl::SiteGaugeField SiteGaugeField;                       \
-    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;
+  typedef typename GImpl::SiteGaugeLink SiteGaugeLink;
 //
 template <class S, int Nrepresentation = Nc> class GaugeImplTypes {
 public:
  typedef S Simd;
-    // 
+  template <typename vtype>
-    template<class S,int Nrepresentation=Nc>
+  using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation>>>;
-    class GaugeImplTypes { 
+  template <typename vtype>
-    public:
+  using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation>>, Nd>;
      typedef S Simd;
      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd  >;
      typedef iImplGaugeLink    <Simd>           SiteGaugeLink;
      typedef iImplGaugeField   <Simd>           SiteGaugeField;
      typedef Lattice<SiteGaugeLink>                GaugeLinkField; // bit ugly naming; polarised gauge field, lorentz... all ugly
      typedef Lattice<SiteGaugeField>                   GaugeField;
-    };
+  typedef iImplGaugeLink<Simd> SiteGaugeLink;
  typedef iImplGaugeField<Simd> SiteGaugeField;
-    // Composition with smeared link, bc's etc.. probably need multiple inheritance
+  typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised
-    // Variable precision "S" and variable Nc
+                                                 // gauge field, lorentz... all
-    template<class GimplTypes>
+                                                 // ugly
-    class PeriodicGaugeImpl : public GimplTypes  { 
+  typedef Lattice<SiteGaugeField> GaugeField;
    public:
-    INHERIT_GIMPL_TYPES(GimplTypes);
+  // Move this elsewhere?
-
+  static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+                                  int mu) { // U[mu] += W
-    // Support needed for the assembly of loops including all boundary condition effects such as conjugate bcs
+    PARALLEL_FOR_LOOP
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    for (auto ss = 0; ss < U._grid->oSites(); ss++) {
-    
+      U._odata[ss]._internal[mu] =
-      template<class covariant>  static inline
+          U._odata[ss]._internal[mu] + W._odata[ss]._internal;
      Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
 	return PeriodicBC::CovShiftForward(Link,mu,field);
      }
      template<class covariant> static inline
      Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
 	return PeriodicBC::CovShiftBackward(Link,mu,field);
      }
      static inline
      GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
 	return Cshift(adj(Link),mu,-1);
      }
      static inline
      GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
 	return Link;
      }
      static inline
      GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
 	return Cshift(Link,mu,1);
      }
      static inline bool isPeriodicGaugeField(void) {
 	return true;
      }
    };
    // Composition with smeared link, bc's etc.. probably need multiple inheritance
    // Variable precision "S" and variable Nc
    template<class GimplTypes>
    class ConjugateGaugeImpl : public GimplTypes { 
    public:
      INHERIT_GIMPL_TYPES(GimplTypes);
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Support needed for the assembly of loops including all boundary condition effects such as Gparity.
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class covariant>  static
    Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
      return ConjugateBC::CovShiftForward(Link,mu,field);
    }
    template<class covariant> static
    Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
      return ConjugateBC::CovShiftBackward(Link,mu,field);
    }
    static inline
    GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
      GridBase *grid = Link._grid;
      int Lmu = grid->GlobalDimensions()[mu]-1;
      Lattice<iScalar<vInteger> > coor(grid);    LatticeCoordinate(coor,mu);
      GaugeLinkField tmp (grid);
      tmp=adj(Link);
      tmp = where(coor==Lmu,conjugate(tmp),tmp);
      return Cshift(tmp,mu,-1);// moves towards positive mu
    }
    static inline
    GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
      return Link;
    }
    static inline
    GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
      GridBase *grid = Link._grid;
      int Lmu = grid->GlobalDimensions()[mu]-1;
      Lattice<iScalar<vInteger> > coor(grid);    LatticeCoordinate(coor,mu);
      GaugeLinkField tmp (grid);
      tmp=Cshift(Link,mu,1);
      tmp=where(coor==Lmu,conjugate(tmp),tmp);
      return tmp;
    }
    static inline bool isPeriodicGaugeField(void) {
      return false;
    }
    };
    typedef GaugeImplTypes<vComplex,Nc>     GimplTypesR;
    typedef GaugeImplTypes<vComplexF,Nc>    GimplTypesF;
    typedef GaugeImplTypes<vComplexD,Nc>    GimplTypesD;
    typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
    typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
    typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
    typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever prec
    typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
    typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
  }
 };
 // Composition with smeared link, bc's etc.. probably need multiple inheritance
 // Variable precision "S" and variable Nc
 //
 // Gauge implementation policy whose shift helpers are the periodic ones: it
 // derives from GimplTypes, re-exports that class's typedefs through
 // INHERIT_GIMPL_TYPES, and exposes only static member functions.
 template <class GimplTypes> class PeriodicGaugeImpl : public GimplTypes {
 public:
  INHERIT_GIMPL_TYPES(GimplTypes);
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Support needed for the assembly of loops including all boundary condition
  // effects such as conjugate bcs
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Forwards unchanged to PeriodicBC::CovShiftForward(Link, mu, field).
  template <class covariant>
  static inline Lattice<covariant>
  CovShiftForward(const GaugeLinkField &Link, int mu,
                  const Lattice<covariant> &field) {
    return PeriodicBC::CovShiftForward(Link, mu, field);
  }
  // Forwards unchanged to PeriodicBC::CovShiftBackward(Link, mu, field).
  template <class covariant>
  static inline Lattice<covariant>
  CovShiftBackward(const GaugeLinkField &Link, int mu,
                   const Lattice<covariant> &field) {
    return PeriodicBC::CovShiftBackward(Link, mu, field);
  }
  // Returns adj(Link) passed through Cshift with displacement -1 along mu.
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
    return Cshift(adj(Link), mu, -1);
  }
  // Returns a copy of Link; mu is not used.
  static inline GaugeLinkField
  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
    return Link;
  }
  // Returns Link passed through Cshift with displacement +1 along mu.
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
    return Cshift(Link, mu, 1);
  }
  // Always true for this implementation.
  static inline bool isPeriodicGaugeField(void) { return true; }
 };
 // Composition with smeared link, bc's etc.. probably need multiple inheritance
 // Variable precision "S" and variable Nc
 //
 // Gauge implementation policy whose shift helpers apply complex conjugation at
 // the last global coordinate in the shift direction: it derives from
 // GimplTypes, re-exports that class's typedefs through INHERIT_GIMPL_TYPES, and
 // exposes only static member functions.
 template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
 public:
  INHERIT_GIMPL_TYPES(GimplTypes);
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Support needed for the assembly of loops including all boundary condition
  // effects such as Gparity.
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Forwards unchanged to ConjugateBC::CovShiftForward(Link, mu, field).
  template <class covariant>
  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
                                            const Lattice<covariant> &field) {
    return ConjugateBC::CovShiftForward(Link, mu, field);
  }
  // Forwards unchanged to ConjugateBC::CovShiftBackward(Link, mu, field).
  template <class covariant>
  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
                                             const Lattice<covariant> &field) {
    return ConjugateBC::CovShiftBackward(Link, mu, field);
  }
  // Takes adj(Link), replaces it by its conjugate on sites whose mu coordinate
  // equals GlobalDimensions()[mu] - 1, and only then applies Cshift by -1.
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
    GridBase *grid = Link._grid;
    // Largest global coordinate value in direction mu.
    int Lmu = grid->GlobalDimensions()[mu] - 1;
    // NOTE(review): LatticeCoordinate is defined elsewhere; presumably it fills
    // coor with each site's coordinate along mu - confirm.
    Lattice<iScalar<vInteger>> coor(grid);
    LatticeCoordinate(coor, mu);
    GaugeLinkField tmp(grid);
    tmp = adj(Link);
    tmp = where(coor == Lmu, conjugate(tmp), tmp);
    return Cshift(tmp, mu, -1); // moves towards positive mu
  }
  // Returns a copy of Link; mu is not used.
  static inline GaugeLinkField
  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
    return Link;
  }
  // Applies Cshift by +1 along mu first, then replaces the shifted field by its
  // conjugate on sites whose mu coordinate equals GlobalDimensions()[mu] - 1.
  // Note the order is the reverse of CovShiftIdentityBackward (shift, then
  // conjugate).
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
    GridBase *grid = Link._grid;
    // Largest global coordinate value in direction mu.
    int Lmu = grid->GlobalDimensions()[mu] - 1;
    Lattice<iScalar<vInteger>> coor(grid);
    LatticeCoordinate(coor, mu);
    GaugeLinkField tmp(grid);
    tmp = Cshift(Link, mu, 1);
    tmp = where(coor == Lmu, conjugate(tmp), tmp);
    return tmp;
  }
  // Always false for this implementation.
  static inline bool isPeriodicGaugeField(void) { return false; }
 };
 typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
 typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
 typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
 typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
 typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
 typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
 typedef ConjugateGaugeImpl<GimplTypesR>
    ConjugateGimplR; // Real.. whichever prec
 typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
 typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
 }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -1,212 +1,214 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
-namespace Grid{
+namespace Grid {
-  namespace QCD{
+namespace QCD {
-    ///////////////////////////////////////
+///////////////////////////////////////
-    // One flavour rational
+// One flavour rational
-    ///////////////////////////////////////
+///////////////////////////////////////
-    // S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+// S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
 //
 // Here, M is some operator
 // N and D makeup the rat. poly
 //
 template <class Impl>
 class OneFlavourEvenOddRationalPseudoFermionAction
    : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  typedef OneFlavourRationalParams Params;
  Params param;
  MultiShiftFunction PowerHalf;
  MultiShiftFunction PowerNegHalf;
  MultiShiftFunction PowerQuarter;
  MultiShiftFunction PowerNegQuarter;
 private:
  FermionOperator<Impl> &FermOp;  // the basic operator
  // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us
  // historically
  // and hasenbusch works better
  FermionField PhiEven;  // the pseudo fermion field for this trajectory
  FermionField PhiOdd;   // the pseudo fermion field for this trajectory
 public:
  OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl> &Op,
                                               Params &p)
      : FermOp(Op),
        PhiEven(Op.FermionRedBlackGrid()),
        PhiOdd(Op.FermionRedBlackGrid()),
        param(p) {
    AlgRemez remez(param.lo, param.hi, param.precision);
    // MdagM^(+- 1/2)
    std::cout << GridLogMessage << "Generating degree " << param.degree
              << " for x^(1/2)" << std::endl;
    remez.generateApprox(param.degree, 1, 2);
    PowerHalf.Init(remez, param.tolerance, false);
    PowerNegHalf.Init(remez, param.tolerance, true);
    // MdagM^(+- 1/4)
    std::cout << GridLogMessage << "Generating degree " << param.degree
              << " for x^(1/4)" << std::endl;
    remez.generateApprox(param.degree, 1, 4);
    PowerQuarter.Init(remez, param.tolerance, false);
    PowerNegQuarter.Init(remez, param.tolerance, true);
  };
  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
    //        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
    // Phi = MpcdagMpc^{1/4} eta
    //
-    // Here, M is some operator 
+    // P(eta) = e^{- eta^dag eta}
    // N and D makeup the rat. poly 
    //
-  
+    // e^{x^2/2 sig^2} => sig^2 = 0.5.
-    template<class Impl>
+    //
-    class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    // So eta should be of width sig = 1/sqrt(2).
    public:
      INHERIT_IMPL_TYPES(Impl);
-      typedef OneFlavourRationalParams Params;
+    RealD scale = std::sqrt(0.5);
      Params param;
-      MultiShiftFunction PowerHalf   ;
+    FermionField eta(FermOp.FermionGrid());
-      MultiShiftFunction PowerNegHalf;
+    FermionField etaOdd(FermOp.FermionRedBlackGrid());
-      MultiShiftFunction PowerQuarter;
+    FermionField etaEven(FermOp.FermionRedBlackGrid());
      MultiShiftFunction PowerNegQuarter;
-    private:
+    gaussian(pRNG, eta);
-     
+    eta = eta * scale;
      FermionOperator<Impl> & FermOp;// the basic operator
-      // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
+    pickCheckerboard(Even, etaEven, eta);
-      // and hasenbusch works better
+    pickCheckerboard(Odd, etaOdd, eta);
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
+    FermOp.ImportGauge(U);
      FermionField PhiOdd; // the pseudo fermion field for this trajectory
-    public:
+    // mutishift CG
    SchurDifferentiableOperator<Impl> Mpc(FermOp);
    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerQuarter);
    msCG(Mpc, etaOdd, PhiOdd);
-      OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
+    //////////////////////////////////////////////////////
-						   Params & p ) : FermOp(Op), 
+    // FIXME : Clover term not yet..
-	PhiEven(Op.FermionRedBlackGrid()), 
+    //////////////////////////////////////////////////////
 	PhiOdd (Op.FermionRedBlackGrid()), 
 	param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
-	// MdagM^(+- 1/2)
+    assert(FermOp.ConstEE() == 1);
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+    PhiEven = zero;
-	remez.generateApprox(param.degree,1,2);
+  };
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
-	// MdagM^(+- 1/4)
+  //////////////////////////////////////////////////////
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
+  // S = phi^dag (Mdag M)^-1/2 phi
-	remez.generateApprox(param.degree,1,4);
+  //////////////////////////////////////////////////////
-   	PowerQuarter.Init(remez,param.tolerance,false);
+  virtual RealD S(const GaugeField &U) {
-	PowerNegQuarter.Init(remez,param.tolerance,true);
+    FermOp.ImportGauge(U);
      };
      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
-	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
+    FermionField Y(FermOp.FermionRedBlackGrid());
 	//        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
 	// Phi = MpcdagMpc^{1/4} eta 
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
-	RealD scale = std::sqrt(0.5);
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
-	FermionField eta    (FermOp.FermionGrid());
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,
-	FermionField etaOdd (FermOp.FermionRedBlackGrid());
+                                                   PowerNegQuarter);
 	FermionField etaEven(FermOp.FermionRedBlackGrid());
-	gaussian(pRNG,eta);	eta=eta*scale;
+    msCG(Mpc, PhiOdd, Y);
-	pickCheckerboard(Even,etaEven,eta);
+    RealD action = norm2(Y);
-	pickCheckerboard(Odd,etaOdd,eta);
+    std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 "
                                   "solve or -1/2 solve faster??? "
              << action << std::endl;
-	FermOp.ImportGauge(U);
+    return action;
  };
-	// mutishift CG
+  //////////////////////////////////////////////////////
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+  // Need
-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerQuarter);
+  // dS_f/dU = chi^dag   d[N/D]  chi
-	msCG(Mpc,etaOdd,PhiOdd);
+  //
  // N/D is expressed as partial fraction expansion:
  //
  //           a0 + \sum_k ak/(M^dagM + bk)
  //
  // d[N/D] is then
  //
  //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M +
  //          bk]^{-1}
  //
  // Need
  //       Mf Phi_k = [MdagM+bk]^{-1} Phi
  //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
  //
  // With these building blocks
  //
  //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf
  //       Phi_k
  //        S    = innerprodReal(Phi,Mf Phi);
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
    const int Npole = PowerNegHalf.poles.size();
-	//////////////////////////////////////////////////////
+    std::vector<FermionField> MPhi_k(Npole, FermOp.FermionRedBlackGrid());
 	// FIXME : Clover term not yet..
 	//////////////////////////////////////////////////////
-	assert(FermOp.ConstEE() == 1);
+    FermionField X(FermOp.FermionRedBlackGrid());
-	PhiEven = zero;
+    FermionField Y(FermOp.FermionRedBlackGrid());
      };
-      //////////////////////////////////////////////////////
+    GaugeField tmp(FermOp.GaugeGrid());
      // S = phi^dag (Mdag M)^-1/2 phi
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
-	FermOp.ImportGauge(U);
+    FermOp.ImportGauge(U);
-	FermionField Y(FermOp.FermionRedBlackGrid());
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
 	SchurDifferentiableOperator<Impl> Mpc(FermOp);
-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegQuarter);
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerNegHalf);
-	msCG(Mpc,PhiOdd,Y);
+    msCG(Mpc, PhiOdd, MPhi_k);
-	RealD action = norm2(Y);
+    dSdU = zero;
-	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
+    for (int k = 0; k < Npole; k++) {
      RealD ak = PowerNegHalf.residues[k];
-	return action;
+      X = MPhi_k[k];
      };
-      //////////////////////////////////////////////////////
+      Mpc.Mpc(X, Y);
-      // Need
+      Mpc.MpcDeriv(tmp, Y, X);
-      // dS_f/dU = chi^dag   d[N/D]  chi
+      dSdU = dSdU + ak * tmp;
-      //
+      Mpc.MpcDagDeriv(tmp, X, Y);
-      // N/D is expressed as partial fraction expansion:
+      dSdU = dSdU + ak * tmp;
-      //
+    }
      //           a0 + \sum_k ak/(M^dagM + bk)
      //
      // d[N/D] is then
      //
      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
      //
      // Need
      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
      //
      // With these building blocks
      //
      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
      //        S    = innerprodReal(Phi,Mf Phi);
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-	const int Npole = PowerNegHalf.poles.size();
+    // dSdU = Ta(dSdU);
-
+  };
-	std::vector<FermionField> MPhi_k (Npole,FermOp.FermionRedBlackGrid());
+};
-
+}
 	FermionField X(FermOp.FermionRedBlackGrid());
 	FermionField Y(FermOp.FermionRedBlackGrid());
 	GaugeField   tmp(FermOp.GaugeGrid());
 	FermOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> Mpc(FermOp);
 	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
 	msCG(Mpc,PhiOdd,MPhi_k);
 	dSdU = zero;
 	for(int k=0;k<Npole;k++){
 	  RealD ak = PowerNegHalf.residues[k];
 	  X  = MPhi_k[k];
 	  Mpc.Mpc(X,Y);
 	  Mpc.MpcDeriv   (tmp , Y, X );  dSdU=dSdU+ak*tmp;
 	  Mpc.MpcDagDeriv(tmp , X, Y );  dSdU=dSdU+ak*tmp;
 	}
 	dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -256,7 +256,7 @@ namespace Grid{
 	}
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);
      };
    };
--- a/lib/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRational.h
@@ -186,7 +186,7 @@ namespace Grid{
 	}
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);
      };
    };
--- a/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@@ -242,7 +242,7 @@ namespace Grid{
 	}
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);
      };
    };
--- a/lib/qcd/action/pseudofermion/TwoFlavour.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavour.h
@@ -137,7 +137,7 @@ namespace Grid{
 	FermOp.MDeriv(tmp , Y, X,DaggerNo );  dSdU=tmp;
 	FermOp.MDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);
      };
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -62,118 +62,120 @@ class TwoFlavourEvenOddPseudoFermionAction
        DerivativeSolver(DS),
        ActionSolver(AS),
        PhiEven(Op.FermionRedBlackGrid()),
-        PhiOdd(Op.FermionRedBlackGrid()){};
+	PhiOdd(Op.FermionRedBlackGrid())
 		  {};
      //////////////////////////////////////////////////////////////////////////////////////
      // Push the gauge field in to the dops. Assume any BC's and smearing already applied
      //////////////////////////////////////////////////////////////////////////////////////
      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
-  //////////////////////////////////////////////////////////////////////////////////////
+	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
-  // Push the gauge field in to the dops. Assume any BC's and smearing already
+	// Phi = McpDag eta 
-  // applied
+	// P(eta) = e^{- eta^dag eta}
-  //////////////////////////////////////////////////////////////////////////////////////
+	//
-  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+	// e^{x^2/2 sig^2} => sig^2 = 0.5.
    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
    // Phi = McpDag eta
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
-    RealD scale = std::sqrt(0.5);
+	RealD scale = std::sqrt(0.5);
-    FermionField eta(FermOp.FermionGrid());
+	FermionField eta    (FermOp.FermionGrid());
-    FermionField etaOdd(FermOp.FermionRedBlackGrid());
+	FermionField etaOdd (FermOp.FermionRedBlackGrid());
-    FermionField etaEven(FermOp.FermionRedBlackGrid());
+	FermionField etaEven(FermOp.FermionRedBlackGrid());
-     gaussian(pRNG, eta);
+	gaussian(pRNG,eta);
-    pickCheckerboard(Even, etaEven, eta);
+	pickCheckerboard(Even,etaEven,eta);
-    pickCheckerboard(Odd, etaOdd, eta);
+	pickCheckerboard(Odd,etaOdd,eta);
-    FermOp.ImportGauge(U);
+	FermOp.ImportGauge(U);
-    SchurDifferentiableOperator<Impl> PCop(FermOp);
+	SchurDifferentiableOperator<Impl> PCop(FermOp);
-    PCop.MpcDag(etaOdd, PhiOdd);
+	PCop.MpcDag(etaOdd,PhiOdd);
-    FermOp.MooeeDag(etaEven, PhiEven);
+	FermOp.MooeeDag(etaEven,PhiEven);
-    PhiOdd = PhiOdd * scale;
+	PhiOdd =PhiOdd*scale;
-    PhiEven = PhiEven * scale;
+	PhiEven=PhiEven*scale;
  };
-  //////////////////////////////////////////////////////
+      };
  // S = phi^dag (Mdag M)^-1 phi  (odd)
  //   + phi^dag (Mdag M)^-1 phi  (even)
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    FermOp.ImportGauge(U);
-    FermionField X(FermOp.FermionRedBlackGrid());
+      //////////////////////////////////////////////////////
-    FermionField Y(FermOp.FermionRedBlackGrid());
+      // S = phi^dag (Mdag M)^-1 phi  (odd)
      //   + phi^dag (Mdag M)^-1 phi  (even)
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
-    SchurDifferentiableOperator<Impl> PCop(FermOp);
+	FermOp.ImportGauge(U);
-    X = zero;
+	FermionField X(FermOp.FermionRedBlackGrid());
-    ActionSolver(PCop, PhiOdd, X);
+	FermionField Y(FermOp.FermionRedBlackGrid());
-    PCop.Op(X, Y);
+	
-    RealD action = norm2(Y);
+	SchurDifferentiableOperator<Impl> PCop(FermOp);
-    // The EE factorised block; normally can replace with zero if det is
+	X=zero;
-    // constant (gauge field indept)
+	ActionSolver(PCop,PhiOdd,X);
-    // Only really clover term that creates this.
+	PCop.Op(X,Y);
-    FermOp.MooeeInvDag(PhiEven, Y);
+	RealD action = norm2(Y);
    action = action + norm2(Y);
-    std::cout << GridLogMessage << "Pseudofermion EO action " << action
+	// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
-              << std::endl;
+	// Only really clover term that creates this.
-    return action;
+	FermOp.MooeeInvDag(PhiEven,Y);
-  };
+	action = action + norm2(Y);
-  //////////////////////////////////////////////////////
+	std::cout << GridLogMessage << "Pseudofermion EO action "<<action<<std::endl;
-  //
+	return action;
-  // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
+      };
  //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag dM
  //       (Mdag)^-1 phi
  //
  //       = - Ydag dM X  - Xdag dMdag Y
  //
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
    FermOp.ImportGauge(U);
-    FermionField X(FermOp.FermionRedBlackGrid());
+      //////////////////////////////////////////////////////
-    FermionField Y(FermOp.FermionRedBlackGrid());
+      //
-    GaugeField tmp(FermOp.GaugeGrid());
+      // dS/du = - phi^dag  (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 phi
      //       = - phi^dag M^-1 dM (MdagM)^-1 phi -  phi^dag (MdagM)^-1 dMdag dM (Mdag)^-1 phi 
      //
      //       = - Ydag dM X  - Xdag dMdag Y
      //
      //////////////////////////////////////////////////////
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-    SchurDifferentiableOperator<Impl> Mpc(FermOp);
+	FermOp.ImportGauge(U);
-    // Our conventions really make this UdSdU; We do not differentiate wrt Udag
+	FermionField X(FermOp.FermionRedBlackGrid());
-    // here.
+	FermionField Y(FermOp.FermionRedBlackGrid());
-    // So must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
+	GaugeField tmp(FermOp.GaugeGrid());
-    X = zero;
+	SchurDifferentiableOperator<Impl> Mpc(FermOp);
    DerivativeSolver(Mpc, PhiOdd, X);
    Mpc.Mpc(X, Y);
    Mpc.MpcDeriv(tmp, Y, X);
    dSdU = tmp;
    Mpc.MpcDagDeriv(tmp, X, Y);
    dSdU = dSdU + tmp;
-    // Treat the EE case. (MdagM)^-1 = Minv Minvdag
+	// Our conventions really make this UdSdU; We do not differentiate wrt Udag here.
-    // Deriv defaults to zero.
+	// So must take dSdU - adj(dSdU) and left multiply by mom to get dS/dt.
    //        FermOp.MooeeInvDag(PhiOdd,Y);
    //      FermOp.MooeeInv(Y,X);
    //  FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
    //  FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
-    assert(FermOp.ConstEE() == 1);
+	X=zero;
 	DerivativeSolver(Mpc,PhiOdd,X);
 	Mpc.Mpc(X,Y);
  	Mpc.MpcDeriv(tmp , Y, X );    dSdU=tmp;
 	Mpc.MpcDagDeriv(tmp , X, Y);  dSdU=dSdU+tmp;
-    /*
+	// Treat the EE case. (MdagM)^-1 = Minv Minvdag
-    FermOp.MooeeInvDag(PhiOdd,Y);
+	// Deriv defaults to zero.
-    FermOp.MooeeInv(Y,X);
+	//        FermOp.MooeeInvDag(PhiOdd,Y);
-    FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
+	//      FermOp.MooeeInv(Y,X);
-    FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
+	//	FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
-    */
+	//  FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
-    dSdU = Ta(dSdU);
+	assert(FermOp.ConstEE() == 1);
-  };
+
-};
+	/*
-}
+        FermOp.MooeeInvDag(PhiOdd,Y);
        FermOp.MooeeInv(Y,X);
  	FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
 	FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 	*/
 	//dSdU = Ta(dSdU);
      };
    };
  }
 }
 #endif
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -188,8 +188,9 @@ namespace Grid{
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
-	dSdU = -Ta(dSdU);
+	//dSdU = -Ta(dSdU);
-
+	dSdU = -dSdU;
      };
    };
  }
--- a/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
@@ -155,7 +155,8 @@ namespace Grid{
 	DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
 	DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;
-	dSdU = - Ta(dSdU);
+	dSdU *= -1.0;
 	//dSdU = - Ta(dSdU);
      };
    };
--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@@ -1,33 +1,34 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/hmc/HMC.h
+Source file: ./lib/qcd/hmc/HMC.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 /*! @file HMC.h
 * @brief Classes for Hybrid Monte Carlo update
@@ -41,172 +42,195 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <string>
 namespace Grid {
 namespace QCD {
-namespace Grid{
+struct HMCparameters {
-  namespace QCD{
+  Integer StartTrajectory;
-    
+  Integer Trajectories; /* @brief Number of sweeps in this run */
  bool MetropolisTest;
  Integer NoMetropolisUntil;
-    struct HMCparameters{
+  HMCparameters() {
    ////////////////////////////// Default values
    MetropolisTest = true;
    NoMetropolisUntil = 10;
    StartTrajectory = 0;
    Trajectories = 200;
    /////////////////////////////////
  }
-      Integer StartTrajectory;
+  void print() const {
-      Integer Trajectories; /* @brief Number of sweeps in this run */
+    std::cout << GridLogMessage << "[HMC parameter] Trajectories            : " << Trajectories << "\n";
-      bool    MetropolisTest;
+    std::cout << GridLogMessage << "[HMC parameter] Start trajectory        : " << StartTrajectory << "\n";
-      Integer NoMetropolisUntil;
+    std::cout << GridLogMessage << "[HMC parameter] Metropolis test (on/off): " << MetropolisTest << "\n";
    std::cout << GridLogMessage << "[HMC parameter] Thermalization trajs    : " << NoMetropolisUntil << "\n";
  }
 };
-      HMCparameters(){
+template <class GaugeField>
-	////////////////////////////// Default values
+class HmcObservable {
-	MetropolisTest      = true;
+ public:
-	NoMetropolisUntil   = 10;
+  virtual void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
-	StartTrajectory     = 0;
+                                  GridParallelRNG &pRNG) = 0;
-	Trajectories        = 200;
+};
 	/////////////////////////////////
      }
    };
-    template<class GaugeField> 
+template <class Gimpl>
-    class HmcObservable {
+class PlaquetteLogger : public HmcObservable<typename Gimpl::GaugeField> {
-    public:
+ private:
-      virtual void TrajectoryComplete (int traj, GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG & pRNG )=0;
+  std::string Stem;
    };
-    template<class Gimpl> 
+ public:
-    class PlaquetteLogger : public HmcObservable<typename Gimpl::GaugeField> {
+  INHERIT_GIMPL_TYPES(Gimpl);
-    private:
+  PlaquetteLogger(std::string cf) { Stem = cf; };
      std::string Stem;
    public:
      INHERIT_GIMPL_TYPES(Gimpl);
      PlaquetteLogger(std::string cf) {
        Stem  = cf;
      };
-      void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG & pRNG )
+  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
-      {
+                          GridParallelRNG &pRNG) {
-	  std::string file;   { std::ostringstream os; os << Stem     <<"."<< traj; file = os.str(); }
+    std::string file;
-	  std::ofstream of(file);
+    {
      std::ostringstream os;
      os << Stem << "." << traj;
      file = os.str();
    }
    std::ofstream of(file);
-	  RealD peri_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+    RealD peri_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
-	  RealD peri_rect = WilsonLoops<PeriodicGimplR>::avgRectangle(U);
+    RealD peri_rect = WilsonLoops<PeriodicGimplR>::avgRectangle(U);
-	  RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
-	  RealD impl_rect = WilsonLoops<Gimpl>::avgRectangle(U);
+    RealD impl_rect = WilsonLoops<Gimpl>::avgRectangle(U);
-	  of << traj<<" "<< impl_plaq << " " << impl_rect << "  "<< peri_plaq<<" "<<peri_rect<<std::endl;
+    of << traj << " " << impl_plaq << " " << impl_rect << "  " << peri_plaq
-	  std::cout<< GridLogMessage<< "traj"<<" "<< "plaq " << " " << " rect  " << "  "<< "peri_plaq" <<" "<<"peri_rect"<<std::endl;
+       << " " << peri_rect << std::endl;
-	  std::cout<< GridLogMessage<< traj<<" "<< impl_plaq << " " << impl_rect << "  "<< peri_plaq<<" "<<peri_rect<<std::endl;
+    std::cout << GridLogMessage << "traj"
-      }
+              << " "
-    };
+              << "plaq "
              << " "
              << " rect  "
              << "  "
              << "peri_plaq"
              << " "
              << "peri_rect" << std::endl;
    std::cout << GridLogMessage << traj << " " << impl_plaq << " " << impl_rect
              << "  " << peri_plaq << " " << peri_rect << std::endl;
  }
 };
-    //    template <class GaugeField, class Integrator, class Smearer, class Boundary> 
+//    template <class GaugeField, class Integrator, class Smearer, class
-    template <class GaugeField, class IntegratorType>
+//    Boundary>
-    class HybridMonteCarlo {
+template <class GaugeField, class IntegratorType>
-    private:
+class HybridMonteCarlo {
 private:
  const HMCparameters Params;
-      const HMCparameters Params;
+  GridSerialRNG &sRNG;    // Fixme: need a RNG management strategy.
-      
+  GridParallelRNG &pRNG;  // Fixme: need a RNG management strategy.
-      GridSerialRNG   &sRNG; // Fixme: need a RNG management strategy.
+  GaugeField &Ucur;
      GridParallelRNG &pRNG; // Fixme: need a RNG management strategy.
      GaugeField      & Ucur;
-      IntegratorType &TheIntegrator;
+  IntegratorType &TheIntegrator;
-      std::vector<HmcObservable<GaugeField> *> Observables;
+  std::vector<HmcObservable<GaugeField> *> Observables;
-      /////////////////////////////////////////////////////////
+  /////////////////////////////////////////////////////////
-      // Metropolis step
+  // Metropolis step
-      /////////////////////////////////////////////////////////
+  /////////////////////////////////////////////////////////
-      bool metropolis_test(const RealD DeltaH){
+  bool metropolis_test(const RealD DeltaH) {
    RealD rn_test;
-	RealD rn_test;
+    RealD prob = std::exp(-DeltaH);
-	RealD prob = std::exp(-DeltaH);
+    random(sRNG, rn_test);
-	random(sRNG,rn_test);
+    std::cout << GridLogMessage
-      
+              << "--------------------------------------------------\n";
-	std::cout<<GridLogMessage<< "--------------------------------------------\n";
+    std::cout << GridLogMessage << "exp(-dH) = " << prob
-	std::cout<<GridLogMessage<< "dH = "<<DeltaH << "  Random = "<< rn_test <<"\n";
+              << "  Random = " << rn_test << "\n";
-	std::cout<<GridLogMessage<< "Acc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
+    std::cout << GridLogMessage
-      
+              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
 	if((prob >1.0) || (rn_test <= prob)){       // accepted
 	  std::cout<<GridLogMessage <<"-- ACCEPTED\n";
 	  return true;
 	} else {                               // rejected
 	  std::cout<<GridLogMessage <<"-- REJECTED\n";
 	  return false;
 	}
    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
      std::cout << GridLogMessage
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
      std::cout << GridLogMessage
                << "--------------------------------------------------\n";
      return false;
    }
  }
  /////////////////////////////////////////////////////////
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_step(GaugeField &U) {
    TheIntegrator.refresh(U, pRNG);  // set U and initialize P and phi's
    RealD H0 = TheIntegrator.S(U);  // initial state action
    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(17);
    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);
    TheIntegrator.integrate(U);
    RealD H1 = TheIntegrator.S(U);  // updated state action
    std::cout.precision(17);
    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
              << "  dH = " << H1 - H0 << "\n";
    std::cout.precision(current_precision);
    return (H1 - H0);
  }
 public:
  /////////////////////////////////////////
  // Constructor
  /////////////////////////////////////////
  HybridMonteCarlo(HMCparameters Pams, IntegratorType &_Int,
                   GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, GaugeField &_U)
      : Params(Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Ucur(_U) {}
  ~HybridMonteCarlo(){};
  void AddObservable(HmcObservable<GaugeField> *obs) {
    Observables.push_back(obs);
  }
  void evolve(void) {
    Real DeltaH;
    GaugeField Ucopy(Ucur._grid);
    Params.print();
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    for (int traj = Params.StartTrajectory;
         traj < Params.Trajectories + Params.StartTrajectory; ++traj) {
      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
      Ucopy = Ucur;
      DeltaH = evolve_step(Ucopy);
      bool accept = true;
      if (traj >= Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      }
-      /////////////////////////////////////////////////////////
+      if (accept) {
-      // Evolution
+        Ucur = Ucopy;
      /////////////////////////////////////////////////////////
      RealD evolve_step(GaugeField& U){
 	TheIntegrator.refresh(U,pRNG); // set U and initialize P and phi's 
 	RealD H0 = TheIntegrator.S(U); // initial state action  
 	std::cout<<GridLogMessage<<"Total H before = "<< H0 << "\n";
 	TheIntegrator.integrate(U);
 	RealD H1 = TheIntegrator.S(U); // updated state action            
 	std::cout<<GridLogMessage<<"Total H after = "<< H1 << "\n";
 	return (H1-H0);
      }
    public:
      /////////////////////////////////////////
      // Constructor
      /////////////////////////////////////////
      HybridMonteCarlo(HMCparameters Pms,  IntegratorType &_Int, GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, GaugeField &_U ) :
        Params(Pms), 
 	TheIntegrator(_Int), 
 	sRNG(_sRNG),
 	pRNG(_pRNG),
 	Ucur(_U)
      {
      }
      ~HybridMonteCarlo(){};
      void AddObservable(HmcObservable<GaugeField> *obs) {
 	Observables.push_back(obs);
      }
-      void evolve(void){
+      for (int obs = 0; obs < Observables.size(); obs++) {
-
+        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
 	Real DeltaH;
 	GaugeField Ucopy(Ucur._grid);
 	// Actual updates (evolve a copy Ucopy then copy back eventually)
 	for(int traj=Params.StartTrajectory; traj < Params.Trajectories+Params.StartTrajectory; ++traj){
 	  std::cout<<GridLogMessage << "-- # Trajectory = "<< traj <<  "\n";
 	  Ucopy = Ucur;
 	  DeltaH = evolve_step(Ucopy);
 	  bool accept = true;
 	  if ( traj > Params.NoMetropolisUntil) { 
 	    accept = metropolis_test(DeltaH);
 	  }
 	  if ( accept ) {
 	    Ucur = Ucopy;
 	  }
 	  for(int obs = 0;obs<Observables.size();obs++){
 	    Observables[obs]->TrajectoryComplete (traj+1,Ucur,sRNG,pRNG);
 	  }
 	}
      }
-    };
+    }
-    
+  }
-  }// QCD
+};
 }// Grid
 }  // QCD
 }  // Grid
-#endif 
+#endif
--- a/lib/qcd/hmc/HmcRunner.h
+++ b/lib/qcd/hmc/HmcRunner.h
@@ -47,7 +47,7 @@ public:
  GridRedBlackCartesian * UrbGrid ;
  GridRedBlackCartesian * FrbGrid ;
-  virtual void BuildTheAction (int argc, char **argv) = 0;
+  virtual void BuildTheAction (int argc, char **argv) = 0; // necessary?
  void Run (int argc, char  **argv){
@@ -81,55 +81,78 @@ public:
      NumTraj = ivec[0];
    }
-    // Create integrator
+    int NumThermalizations = 10;
-    typedef MinimumNorm2<GaugeField>  IntegratorType;// change here to change the algorithm
+    if( GridCmdOptionExists(argv,argv+argc,"--Thermalizations") ){
-    IntegratorParameters MDpar(20);
+      arg= GridCmdOptionPayload(argv,argv+argc,"--Thermalizations");
-    IntegratorType MDynamics(UGrid,MDpar, TheAction);
+      std::vector<int> ivec(0);
      GridCmdOptionIntVector(arg,ivec);
      NumThermalizations = ivec[0];
    }
    GridSerialRNG    sRNG;
    GridParallelRNG  pRNG(UGrid);
    LatticeGaugeField  U(UGrid); // change this to an extended field (smearing class)
    std::vector<int> SerSeed({1,2,3,4,5});
    std::vector<int> ParSeed({6,7,8,9,10});
    // Create integrator, including the smearing policy
    // Smearing policy
    std::cout << GridLogDebug << " Creating the Stout class\n";
    double rho = 0.1; // smearing parameter, now hardcoded
    int Nsmear = 1;   // number of smearing levels
    Smear_Stout<Gimpl> Stout(rho);
    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
    std::cout << GridLogDebug << " done\n";
    //////////////
    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
    IntegratorParameters MDpar(20);
    IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
    // Checkpoint strategy
    NerscHmcCheckpointer<Gimpl> Checkpoint(std::string("ckpoint_lat"),std::string("ckpoint_rng"),1);
    PlaquetteLogger<Gimpl>      PlaqLog(std::string("plaq"));
    HMCparameters HMCpar;
-    HMCpar.StartTrajectory = StartTraj;
+    HMCpar.StartTrajectory   = StartTraj;
-    HMCpar.Trajectories    = NumTraj;
+    HMCpar.Trajectories      = NumTraj;
    HMCpar.NoMetropolisUntil = NumThermalizations;
    GridSerialRNG    sRNG;
    GridParallelRNG  pRNG(UGrid);
    LatticeGaugeField  U(UGrid);
    std::vector<int> SerSeed({1,2,3,4,5});
    std::vector<int> ParSeed({6,7,8,9,10});
    if ( StartType == HotStart ) {
      // Hot start
      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::HotConfiguration(pRNG, U);
    } else if ( StartType == ColdStart ) { 
      // Cold start
      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::ColdConfiguration(pRNG, U);
    } else if ( StartType == TepidStart ) {       
      // Tepid start
      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::TepidConfiguration(pRNG, U);
    } else if ( StartType == CheckpointStart ) { 
      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      // CheckpointRestart
      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
    }
-    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U);
+    // Attach the gauge field to the smearing policy and fill the smeared set
    // notice that the unit configuration is singular in this procedure
    std::cout << GridLogMessage << "Filling the smeared set\n"; 
    SmearingPolicy.set_GaugeField(U);
    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U); 
    HMC.AddObservable(&Checkpoint);
    HMC.AddObservable(&PlaqLog);
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -44,40 +44,40 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <memory>
-namespace Grid{
+ namespace Grid{
-  namespace QCD{
+ 	namespace QCD{
-    struct IntegratorParameters{
+ 		struct IntegratorParameters{
-      int Nexp;
+ 			int Nexp;
      int MDsteps;  // number of outer steps
      RealD trajL;  // trajectory length 
      RealD stepsize;
      IntegratorParameters(int MDsteps_, 
-			   RealD trajL_=1.0,
+      	RealD trajL_=1.0,
-			   int Nexp_=12):
+      	int Nexp_=12):
-        Nexp(Nexp_),
+      Nexp(Nexp_),
-	MDsteps(MDsteps_),
+      MDsteps(MDsteps_),
-	trajL(trajL_),
+      trajL(trajL_),
-	stepsize(trajL/MDsteps)
+      stepsize(trajL/MDsteps)
-        {
+      {
 	  // empty body constructor
-	};
+      };
-    };
+  };
    /*! @brief Class for Molecular Dynamics management */   
-    template<class GaugeField>
+    template<class GaugeField, class SmearingPolicy>
-    class Integrator {
+  class Integrator {
-    protected:
+  protected:
-      typedef IntegratorParameters ParameterType;
+  	typedef IntegratorParameters ParameterType;
-      IntegratorParameters Params;
+  	IntegratorParameters Params;
-      const ActionSet<GaugeField> as;
+  	const ActionSet<GaugeField> as;
      int levels;              //
      double t_U;              // Track time passing on each level and for U and for P
@@ -85,17 +85,19 @@ namespace Grid{
      GaugeField P;
      SmearingPolicy &Smearer;
      // Should match any legal (SU(n)) gauge field
      // Need to use this template to match Ncol to pass to SU<N> class
      template<int Ncol,class vec> void generate_momenta(Lattice< iVector< iScalar< iMatrix<vec,Ncol> >, Nd> > & P,GridParallelRNG& pRNG){
-	typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
+      typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
-	GaugeLinkField Pmu(P._grid);
+      GaugeLinkField Pmu(P._grid);
-	Pmu = zero;
+      Pmu = zero;
-	for(int mu=0;mu<Nd;mu++){
+      for(int mu=0;mu<Nd;mu++){
-	  SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
+      	SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
-	  PokeIndex<LorentzIndex>(P, Pmu, mu);
+      	PokeIndex<LorentzIndex>(P, Pmu, mu);
 	}
      }
  }
      //ObserverList observers; // not yet
@@ -103,110 +105,128 @@ namespace Grid{
      //      void register_observers();
      //      void notify_observers();
-      void update_P(GaugeField&U, int level,double ep){
+  void update_P(GaugeField&U, int level, double ep){
-	t_P[level]+=ep;
+  	t_P[level]+=ep;
-	update_P(P,U,level,ep);
+  	update_P(P,U,level,ep);
-	std::cout<<GridLogIntegrator<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
+  	std::cout<<GridLogIntegrator<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
-      }
+  }
-      void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
+  void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
-	for(int a=0; a<as[level].actions.size(); ++a){
+  	// input U actually not used... 
-	  GaugeField force(U._grid);
+  	for(int a=0; a<as[level].actions.size(); ++a){
-	  as[level].actions.at(a)->deriv(U,force);
+  		GaugeField force(U._grid);
-	  Mom = Mom - force*ep;
+  		GaugeField& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
  		as[level].actions.at(a)->deriv(Us,force); // deriv should NOT include Ta
 	  	std::cout<< GridLogIntegrator << "Smearing (on/off): "<<as[level].actions.at(a)->is_smeared <<std::endl;
 	  	if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
 	  	force = Ta(force);
 	  	std::cout<< GridLogIntegrator << "Force average: "<< norm2(force)/(U._grid->gSites()) <<std::endl;
 	  	Mom -= force*ep;
 	  }
 	}
      }
-      void update_U(GaugeField&U, double ep){
+	void update_U(GaugeField&U, double ep){
-	update_U(P,U,ep);
+		update_U(P,U,ep);
-	t_U+=ep;
+		t_U+=ep;
-	int fl = levels-1;
+		int fl = levels-1;
-	std::cout<<GridLogIntegrator<<"   "<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;
+		std::cout<< GridLogIntegrator <<"   "<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;
-      }
+	}
-      void update_U(GaugeField &Mom, GaugeField&U, double ep){
+	void update_U(GaugeField &Mom, GaugeField&U, double ep){
 	//rewrite exponential to deal automatically  with the lorentz index?
 	//	GaugeLinkField Umu(U._grid);
 	//	GaugeLinkField Pmu(U._grid);
-	for (int mu = 0; mu < Nd; mu++){
+		for (int mu = 0; mu < Nd; mu++){
-	  auto Umu=PeekIndex<LorentzIndex>(U, mu);
+			auto Umu=PeekIndex<LorentzIndex>(U, mu);
-	  auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
+			auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
-	  Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
+			Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
-	  ProjectOnGroup(Umu);
+			ProjectOnGroup(Umu);
-	  PokeIndex<LorentzIndex>(U, Umu, mu);
+			PokeIndex<LorentzIndex>(U, Umu, mu);
 		}
 	// Update the smeared fields, can be implemented as observer
 		Smearer.set_GaugeField(U);
 	}
      }
      virtual void step (GaugeField& U,int level, int first,int last)=0;
-    public:
+	virtual void step (GaugeField& U,int level, int first,int last)=0;
-      Integrator(GridBase* grid, 
+public:
-		 IntegratorParameters Par,
+
-		 ActionSet<GaugeField> & Aset):
+	Integrator(GridBase* grid, 
-          Params(Par),
+		IntegratorParameters Par,
-    	  as(Aset),
+		ActionSet<GaugeField> & Aset,
-	  P(grid),
+		SmearingPolicy &Sm):
-	  levels(Aset.size())
+	Params(Par),
-      {
+	as(Aset),
-	t_P.resize(levels,0.0);
+	P(grid),
-	t_U=0.0;
+	levels(Aset.size()),
-      };
+	Smearer(Sm)
-      
+	{
-      virtual ~Integrator(){}
+		t_P.resize(levels,0.0);
 		t_U=0.0;
 	// initialization of smearer delegated outside of Integrator
 	};
 	virtual ~Integrator(){}
      //Initialization of momenta and actions
-      void refresh(GaugeField& U,GridParallelRNG &pRNG){
+	void refresh(GaugeField& U,GridParallelRNG &pRNG){
-	std::cout<<GridLogIntegrator<< "Integrator refresh\n";
+		std::cout<<GridLogIntegrator<< "Integrator refresh\n";
-	generate_momenta(P,pRNG);
+		generate_momenta(P,pRNG);
-	for(int level=0; level< as.size(); ++level){
+		for(int level=0; level< as.size(); ++level){
-	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+			for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    as[level].actions.at(actionID)->refresh(U, pRNG);
+	    // get gauge field from the SmearingPolicy and
-	  }
+	    // based on the boolean is_smeared in actionID
 				GaugeField& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 				as[level].actions.at(actionID)->refresh(Us, pRNG);
 			}
 		}
 	}
      }
      // Calculate action
-      RealD S(GaugeField& U){
+	RealD S(GaugeField& U){// here also U not used
-	LatticeComplex Hloc(U._grid);	Hloc = zero;
+		LatticeComplex Hloc(U._grid);	Hloc = zero;
 	// Momenta
-	for (int mu=0; mu <Nd; mu++){
+		for (int mu=0; mu <Nd; mu++){
-	  auto Pmu = PeekIndex<LorentzIndex>(P, mu);
+			auto Pmu = PeekIndex<LorentzIndex>(P, mu);
-	  Hloc -= trace(Pmu*Pmu);
+			Hloc -= trace(Pmu*Pmu);
-	}
+		}
-	Complex Hsum = sum(Hloc);
+		Complex Hsum = sum(Hloc);
-	
+
-	RealD H = Hsum.real();
+		RealD H = Hsum.real();
-	RealD Hterm;
+		RealD Hterm;
-	std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";
+		std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";
 	// Actions
-	for(int level=0; level<as.size(); ++level){
+		for(int level=0; level<as.size(); ++level){
-	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+			for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    Hterm = as[level].actions.at(actionID)->S(U);
+	    // get gauge field from the SmearingPolicy and
-	    std::cout<<GridLogMessage << "Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
+	    // based on the boolean is_smeared in actionID
-	    H += Hterm;
+				GaugeField& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
-	  }
+				Hterm = as[level].actions.at(actionID)->S(Us);
-	}
+				std::cout<<GridLogMessage << "S Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
-	
+				H += Hterm;
-	return H;
+			}
-      }
+		}
-      void integrate(GaugeField& U){
+		return H;
 	}
 	void integrate(GaugeField& U){
 	// reset the clocks
-	t_U=0;
+		t_U=0;
-	for(int level=0; level<as.size(); ++level){
+		for(int level=0; level<as.size(); ++level){
-	  t_P[level]=0;
+			t_P[level]=0;
-	}	
+		}	
 	for(int step=0; step< Params.MDsteps; ++step){   // MD step
-	  int first_step = (step==0);
+		int first_step = (step==0);
-	  int  last_step = (step==Params.MDsteps-1);
+		int  last_step = (step==Params.MDsteps-1);
-	  this->step(U,0,first_step,last_step);
+		this->step(U,0,first_step,last_step);
 	}
 	// Check the clocks all match on all levels
@@ -219,9 +239,9 @@ namespace Grid{
 	assert(fabs(t_U-Params.trajL) < 1.0e-6);
-      }
+}
-    };
+};
-    
+
-  }
+}
 }
 #endif//INTEGRATOR_INCLUDED
--- a/lib/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h
@@ -91,14 +91,17 @@ namespace Grid{
    *  P 1/2                            P 1/2
    */    
-    template<class GaugeField> class LeapFrog : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class LeapFrog :
      public Integrator<GaugeField, SmearingPolicy> {
    public:
-      typedef LeapFrog<GaugeField> Algorithm;
+      typedef LeapFrog<GaugeField, SmearingPolicy> Algorithm;
      LeapFrog(GridBase* grid, 
 	       IntegratorParameters Par,
-	       ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+	       ActionSet<GaugeField> & Aset,
 	       SmearingPolicy & Sm):
 	Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset,Sm) {};
      void step (GaugeField& U, int level,int _first, int _last){
@@ -135,7 +138,8 @@ namespace Grid{
      }
    };
-    template<class GaugeField> class MinimumNorm2 : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class MinimumNorm2 :
      public Integrator<GaugeField, SmearingPolicy> {
    private:
      const RealD lambda = 0.1931833275037836;
@@ -143,7 +147,9 @@ namespace Grid{
      MinimumNorm2(GridBase* grid, 
 		   IntegratorParameters Par,
-		   ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+		   ActionSet<GaugeField> & Aset,
 		   SmearingPolicy& Sm):
 	Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset,Sm) {};
      void step (GaugeField& U, int level, int _first,int _last){
@@ -191,7 +197,8 @@ namespace Grid{
    };
-    template<class GaugeField> class ForceGradient : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class ForceGradient :
      public Integrator<GaugeField, SmearingPolicy> {
    private:
      const RealD lambda = 1.0/6.0;;
      const RealD chi    = 1.0/72.0;
@@ -202,7 +209,9 @@ namespace Grid{
      // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
    ForceGradient(GridBase* grid, 
 		  IntegratorParameters Par,
-		  ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+		  ActionSet<GaugeField> & Aset,
 		  SmearingPolicy &Sm):
      Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset, Sm) {};
      void FG_update_P(GaugeField&U, int level,double fg_dt,double ep){
--- a/lib/qcd/smearing/APEsmearing.h
+++ b/lib/qcd/smearing/APEsmearing.h
@@ -0,0 +1,130 @@
 /*!
  @brief Declaration of Smear_APE class for APE smearing
 */
 #ifndef APE_SMEAR_
 #define APE_SMEAR_
  namespace Grid {
  	namespace QCD {
    /*!
      @brief APE type smearing of link variables.
      The weights are kept in a flat array of Nd*Nd entries that is
      addressed as rho[mu + Nd*nu].
    */
    template <class Gimpl>
    class Smear_APE : public Smear<Gimpl> {
     private:
      const std::vector<double> rho; /*!< Array of weights */
      // This member must be private - we do not want to control it from outside.
      // Builds the weight table: common_rho in every entry, 0.0 on the diagonal.
      std::vector<double> set_rho(const double common_rho) const {
        std::vector<double> weights(Nd * Nd, common_rho);
        for (int mu = 0; mu < Nd; ++mu) {
          weights[mu + mu * Nd] = 0.0;
        }
        return weights;
      }
     public:
      // Defines the gauge field types
      INHERIT_GIMPL_TYPES(Gimpl)
      // Constructors and destructors
      /*! Uses the given weights as they are; the vector size is not checked. */
      Smear_APE(const std::vector<double>& rho_) : rho(rho_) {}
      /*! Same weight rho_val for all off-diagonal (mu,nu) pairs. */
      Smear_APE(double rho_val) : rho(set_rho(rho_val)) {}
      /*! Off-diagonal weights set to 1.0. */
      Smear_APE() : rho(set_rho(1.0)) {}
      ~Smear_APE() {}
      ///////////////////////////////////////////////////////////////////////////////
      /*! Writes into u_smr, for each mu, the adjoint of the rho-weighted sum of
          the staples of U in the (mu,nu) planes with nu != mu. */
      void smear(GaugeField& u_smr, const GaugeField& U) const {
        GridBase* grid = U._grid;
        GaugeLinkField C_mu(grid), staple(grid);
        WilsonLoops<Gimpl> WL;
        u_smr = zero;
        for (int mu = 0; mu < Nd; ++mu) {
          C_mu = zero;
          for (int nu = 0; nu < Nd; ++nu) {
            if (nu == mu) continue;
            // get the staple in direction mu, nu
            // nb staple conventions of IroIro and Grid differ by a dagger
            WL.Staple(staple, U, mu, nu);
            C_mu += staple*rho[mu + Nd * nu];
          }
          // save the accumulated link-field on the u_smr gauge-field:
          // u_smr[mu] = C_mu^dag, see conventions for Staple
          pokeLorentz(u_smr, adj(C_mu), mu);
        }
      }
      ////////////////////////////////////////////////////////////////////////////////
      /*! Reference: Morningstar, Peardon, Phys.Rev.D69,054501(2004), Equation 75.
          Computes Sigma_mu, the derivative of S[fat links] with respect to the
          thin links. The terms are accumulated into SigmaTerm through
          Gimpl::AddGaugeLink; SigmaTerm is not reset here. */
      void derivative(GaugeField& SigmaTerm,
                      const GaugeField& iLambda,
                      const GaugeField& U) const {
        GridBase* grid = U._grid;
        WilsonLoops<Gimpl> WL;
        GaugeLinkField staple(grid), UdagLambda(grid);
        GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
        GaugeLinkField U_mu(grid), U_nu(grid);
        GaugeLinkField shifted(grid), term(grid);
        for (int mu = 0; mu < Nd; ++mu) {
          U_mu       = peekLorentz(U, mu);
          iLambda_mu = peekLorentz(iLambda, mu);
          for (int nu = 0; nu < Nd; ++nu) {
            if (nu != mu) {
              U_nu       = peekLorentz(U, nu);
              iLambda_nu = peekLorentz(iLambda, nu);
              Real rho_munu = rho[mu + Nd * nu];
              Real rho_numu = rho[nu + Nd * mu];
              WL.StapleUpper(staple, U, mu, nu);
              //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
              term = -rho_numu*staple*iLambda_nu;
              Gimpl::AddGaugeLink(SigmaTerm, term, mu);
              //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
              shifted = Cshift(iLambda_nu, mu, 1);  // general also for Gparity?
              term = rho_numu*shifted*staple;
              Gimpl::AddGaugeLink(SigmaTerm, term, mu);
              //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
              shifted = Cshift(iLambda_mu, nu, 1);
              term = -rho_munu*staple*U_nu*shifted*adj(U_nu);
              Gimpl::AddGaugeLink(SigmaTerm, term, mu);
              staple = zero;
              // remaining terms are assembled first and then shifted by -1 in nu
              shifted = Cshift(U_nu, mu, 1);
              term = -rho_munu*adj(shifted)*adj(U_mu)*iLambda_mu*U_nu;
              term += rho_numu*adj(shifted)*adj(U_mu)*iLambda_nu*U_nu;
              UdagLambda = adj(U_nu)*iLambda_nu;
              shifted = Cshift(UdagLambda, mu, 1);
              term += -rho_numu*shifted*adj(U_mu)*U_nu;
              shifted = Cshift(term, nu, -1);
              Gimpl::AddGaugeLink(SigmaTerm, shifted, mu);
            }
          }
        }
      }
    };
  }// namespace QCD
 }//namespace Grid
 #endif  
--- a/lib/qcd/smearing/BaseSmearing.h
+++ b/lib/qcd/smearing/BaseSmearing.h
@@ -0,0 +1,17 @@
 /*
  @brief Declares base smearing class Smear
 */
 #ifndef BASE_SMEAR_
 #define BASE_SMEAR_
 /*!
  Abstract smearing interface: both operations are pure virtual and const,
  so every concrete smearing class has to provide them.
 */
 template <class Gimpl>
 class Smear {
  public:
   INHERIT_GIMPL_TYPES(Gimpl)  // inherits the types for the gauge fields
   virtual ~Smear() {}
   /*! First argument is the field written to, second the input field. */
   virtual void smear(GaugeField& u_smr, const GaugeField& U) const = 0;
   /*! First argument is the field written to, the other two are inputs. */
   virtual void derivative(GaugeField& SigmaTerm,
                           const GaugeField& iLambda,
                           const GaugeField& U) const = 0;
 };
 #endif
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/lib/qcd/smearing/GaugeConfiguration.h
@@ -0,0 +1,262 @@
 /*!
  @file GaugeConfiguration.h
  @brief Declares the SmearedConfiguration class
 */
 #ifndef GAUGE_CONFIG_
 #define GAUGE_CONFIG_
 namespace Grid {
 namespace QCD {
 /*!
  @brief Smeared configuration container
  It will behave like a configuration from the point of view of
  the HMC update and integrators.
  An "advanced configuration" object that can provide not only the
  data to store the gauge configuration but also operations to manipulate
  it, like smearing.
  It stores a list of smeared configurations.
 */
 template <class Gimpl>
 class SmearedConfiguration {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
 private:
  const unsigned int smearingLevels;
  Smear_Stout<Gimpl> StoutSmearing;
  std::vector<GaugeField> SmearedSet;
  // Member functions
  //====================================================================
  void fill_smearedSet(GaugeField& U) {
    ThinLinks = &U;  // attach the smearing routine to the field U
    // check the pointer is not null
    if (ThinLinks == NULL)
      std::cout << GridLogError
                << "[SmearedConfiguration] Error in ThinLinks pointer\n";
    if (smearingLevels > 0) {
      std::cout << GridLogDebug
                << "[SmearedConfiguration] Filling SmearedSet\n";
      GaugeField previous_u(ThinLinks->_grid);
      previous_u = *ThinLinks;
      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
        StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
        previous_u = SmearedSet[smearLvl];
        // For debug purposes
        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
        std::cout << GridLogDebug
                  << "[SmearedConfiguration] Plaq: " << impl_plaq << std::endl;
      }
    }
  }
  //====================================================================
  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
                                  const GaugeField& GaugeK) const {
    GridBase* grid = GaugeK._grid;
    GaugeField C(grid), SigmaK(grid), iLambda(grid);
    GaugeLinkField iLambda_mu(grid);
    GaugeLinkField iQ(grid), e_iQ(grid);
    GaugeLinkField SigmaKPrime_mu(grid);
    GaugeLinkField GaugeKmu(grid), Cmu(grid);
    StoutSmearing.BaseSmear(C, GaugeK);
    SigmaK = zero;
    iLambda = zero;
    for (int mu = 0; mu < Nd; mu++) {
      Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
      iQ = Ta(Cmu * adj(GaugeKmu));
      set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
      pokeLorentz(iLambda, iLambda_mu, mu);
    }
    StoutSmearing.derivative(SigmaK, iLambda,
                             GaugeK);  // derivative of SmearBase
    return SigmaK;
  }
  /*! @brief Returns smeared configuration at level 'Level' */
  const GaugeField& get_smeared_conf(int Level) const {
    return SmearedSet[Level];
  }
  //====================================================================
  void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
                   const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
                   const GaugeLinkField& GaugeK) const {
    GridBase* grid = iQ._grid;
    GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
    GaugeLinkField unity(grid);
    unity = 1.0;
    LatticeComplex u(grid), w(grid);
    LatticeComplex f0(grid), f1(grid), f2(grid);
    LatticeComplex xi0(grid), xi1(grid), tmp(grid);
    LatticeComplex u2(grid), w2(grid), cosw(grid);
    LatticeComplex emiu(grid), e2iu(grid), qt(grid), fden(grid);
    LatticeComplex r01(grid), r11(grid), r21(grid), r02(grid), r12(grid);
    LatticeComplex r22(grid), tr1(grid), tr2(grid);
    LatticeComplex b10(grid), b11(grid), b12(grid), b20(grid), b21(grid),
        b22(grid);
    LatticeComplex LatticeUnitComplex(grid);
    LatticeUnitComplex = 1.0;
    // Exponential
    iQ2 = iQ * iQ;
    iQ3 = iQ * iQ2;
    StoutSmearing.set_uw(u, w, iQ2, iQ3);
    StoutSmearing.set_fj(f0, f1, f2, u, w);
    e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
    // Getting B1, B2, Gamma and Lambda
    // simplify this part, reduntant calculations in set_fj
    xi0 = StoutSmearing.func_xi0(w);
    xi1 = StoutSmearing.func_xi1(w);
    u2 = u * u;
    w2 = w * w;
    cosw = cos(w);
    emiu = cos(u) - timesI(sin(u));
    e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
    r01 = (2.0 * u + timesI(2.0 * (u2 - w2))) * e2iu +
          emiu * ((16.0 * u * cosw + 2.0 * u * (3.0 * u2 + w2) * xi0) +
                  timesI(-8.0 * u2 * cosw + 2.0 * (9.0 * u2 + w2) * xi0));
    r11 = (2.0 * LatticeUnitComplex + timesI(4.0 * u)) * e2iu +
          emiu * ((-2.0 * cosw + (3.0 * u2 - w2) * xi0) +
                  timesI((2.0 * u * cosw + 6.0 * u * xi0)));
    r21 =
        2.0 * timesI(e2iu) + emiu * (-3.0 * u * xi0 + timesI(cosw - 3.0 * xi0));
    r02 = -2.0 * e2iu +
          emiu * (-8.0 * u2 * xi0 +
                  timesI(2.0 * u * (cosw + xi0 + 3.0 * u2 * xi1)));
    r12 = emiu * (2.0 * u * xi0 + timesI(-cosw - xi0 + 3.0 * u2 * xi1));
    r22 = emiu * (xi0 - timesI(3.0 * u * xi1));
    fden = LatticeUnitComplex / (2.0 * (9.0 * u2 - w2) * (9.0 * u2 - w2));
    b10 = 2.0 * u * r01 + (3.0 * u2 - w2) * r02 - (30.0 * u2 + 2.0 * w2) * f0;
    b11 = 2.0 * u * r11 + (3.0 * u2 - w2) * r12 - (30.0 * u2 + 2.0 * w2) * f1;
    b12 = 2.0 * u * r21 + (3.0 * u2 - w2) * r22 - (30.0 * u2 + 2.0 * w2) * f2;
    b20 = r01 - (3.0 * u) * r02 - (24.0 * u) * f0;
    b21 = r11 - (3.0 * u) * r12 - (24.0 * u) * f1;
    b22 = r21 - (3.0 * u) * r22 - (24.0 * u) * f2;
    b10 *= fden;
    b11 *= fden;
    b12 *= fden;
    b20 *= fden;
    b21 *= fden;
    b22 *= fden;
    B1 = b10 * unity + timesMinusI(b11) * iQ - b12 * iQ2;
    B2 = b20 * unity + timesMinusI(b21) * iQ - b22 * iQ2;
    USigmap = GaugeK * Sigmap;
    tr1 = trace(USigmap * B1);
    tr2 = trace(USigmap * B2);
    GaugeLinkField QUS = iQ * USigmap;
    GaugeLinkField USQ = USigmap * iQ;
    GaugeLinkField iGamma = tr1 * iQ - timesI(tr2) * iQ2 +
                            timesI(f1) * USigmap + f2 * QUS + f2 * USQ;
    iLambda = Ta(iGamma);
  }
  //====================================================================
 public:
  GaugeField*
      ThinLinks; /*!< @brief Pointer to the thin
                                                         links configuration */
  /*! @brief Standard constructor */
  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
                       Smear_Stout<Gimpl>& Stout)
      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
    for (unsigned int i = 0; i < smearingLevels; ++i)
      SmearedSet.push_back(*(new GaugeField(UGrid)));
  }
  /*! For just thin links */
  SmearedConfiguration()
      : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
  // attach the smeared routines to the thin links U and fill the smeared set
  void set_GaugeField(GaugeField& U) { fill_smearedSet(U); }
  //====================================================================
  void smeared_force(GaugeField& SigmaTilde) const {
    if (smearingLevels > 0) {
      GaugeField force = SigmaTilde; // actually = U*SigmaTilde
      GaugeLinkField tmp_mu(SigmaTilde._grid);
      for (int mu = 0; mu < Nd; mu++) {
        // to get just SigmaTilde
        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
                 peekLorentz(force, mu);
        pokeLorentz(force, tmp_mu, mu);
      }
      for (int ismr = smearingLevels - 1; ismr > 0; --ismr)
        force = AnalyticSmearedForce(force, get_smeared_conf(ismr - 1));
      force = AnalyticSmearedForce(force, *ThinLinks);
      for (int mu = 0; mu < Nd; mu++) {
        tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
        pokeLorentz(SigmaTilde, tmp_mu, mu);
      }
    }  // if smearingLevels = 0 do nothing
  }
  //====================================================================
  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
  GaugeField& get_U(bool smeared = false) {
    // get the config, thin links by default
    if (smeared) {
      if (smearingLevels) {
        RealD impl_plaq =
            WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
        std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
                  << std::endl;
        return get_SmearedU();
      } else {
        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
        std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                  << std::endl;
        return *ThinLinks;
      }
    } else {
      RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
      std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                << std::endl;
      return *ThinLinks;
    }
  }
 };
 }
 }
 #endif
--- a/lib/qcd/smearing/Smearing.h
+++ b/lib/qcd/smearing/Smearing.h
@@ -0,0 +1,9 @@
 #ifndef GRID_QCD_SMEARING_H
 #define GRID_QCD_SMEARING_H
 #include <qcd/smearing/BaseSmearing.h>
 #include <qcd/smearing/APEsmearing.h>
 #include <qcd/smearing/StoutSmearing.h>
 #include <qcd/smearing/GaugeConfiguration.h>
 #endif
--- a/lib/qcd/smearing/StoutSmearing.h
+++ b/lib/qcd/smearing/StoutSmearing.h
@@ -0,0 +1,160 @@
 /*
  @file StoutSmearing.h
  @brief Declares Stout smearing class
 */
 #ifndef STOUT_SMEAR_
 #define STOUT_SMEAR_
 namespace Grid {
 namespace QCD {
 /*!  @brief Stout smearing of link variable.
      The staple sum C is delegated to a base smearing object (SmearBase);
      this class exponentiates iQ = Ta(C * U^dag) and multiplies it onto U. */
 template <class Gimpl>
 class Smear_Stout : public Smear<Gimpl> {
  private:
   // Base smearing used for C and for the derivative. Never deleted by this
   // class (see the destructor).
   const Smear<Gimpl>* SmearBase;
  public:
   INHERIT_GIMPL_TYPES(Gimpl)
   /*! Uses the caller-supplied base smearing; the pointer is stored as is. */
   Smear_Stout(Smear<Gimpl>* base) : SmearBase(base) {
     static_assert(Nc == 3,
                   "Stout smearing currently implemented only for Nc==3");
   }
   /*! Default constructor: allocates a Smear_APE base with weight rho */
   Smear_Stout(double rho = 1.0) : SmearBase(new Smear_APE<Gimpl>(rho)) {
     static_assert(Nc == 3,
                   "Stout smearing currently implemented only for Nc==3");
   }
   // NOTE(review): SmearBase is not released here, so the Smear_APE allocated
   // by the default constructor is never freed. A plain delete would not be
   // safe as written: the class is copyable (copies would share the pointer)
   // and the pointer-taking constructor does not say who owns the base.
   ~Smear_Stout() {}  // delete SmearBase...
   /*! Writes u_smr[mu] = exp(iQ_mu) * U[mu] for every mu, with
       iQ_mu = Ta(C_mu * adj(U_mu)) and C obtained from SmearBase->smear. */
   void smear(GaugeField& u_smr, const GaugeField& U) const {
     GaugeField C(U._grid);
     GaugeLinkField tmp(U._grid), iq_mu(U._grid), Umu(U._grid);
     std::cout << GridLogDebug << "Stout smearing started\n";
     // Smear the configurations
     SmearBase->smear(C, U);
     for (int mu = 0; mu < Nd; mu++) {
       tmp = peekLorentz(C, mu);
       Umu = peekLorentz(U, mu);
       iq_mu = Ta(
           tmp *
           adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
       // tmp is reused: from here on it holds the exponential of iq_mu
       exponentiate_iQ(tmp, iq_mu);
       pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
     }
     std::cout << GridLogDebug << "Stout smearing completed\n";
   };
   /*! Forwards unchanged to the derivative of the base smearing. */
   void derivative(GaugeField& SigmaTerm, const GaugeField& iLambda,
                   const GaugeField& Gauge) const {
     SmearBase->derivative(SigmaTerm, iLambda, Gauge);
   };
   /*! Applies only the base smearing (no exponentiation): C = base(U). */
   void BaseSmear(GaugeField& C, const GaugeField& U) const {
     SmearBase->smear(C, U);
   };
   /*! Computes e_iQ = f0 + timesMinusI(f1)*iQ - f2*iQ^2 with the
       coefficients f0, f1, f2 obtained from set_uw and set_fj. */
   void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const {
     // Put this outside
     // only valid for SU(3) matrices
     // only one Lorentz direction at a time
     // notice that it actually computes
     // exp ( input matrix )
     // the i sign is coming from outside
     // input matrix is anti-hermitian NOT hermitian
     GridBase* grid = iQ._grid;
     GaugeLinkField unity(grid);
     unity = 1.0;
     GaugeLinkField iQ2(grid), iQ3(grid);
     LatticeComplex u(grid), w(grid);
     LatticeComplex f0(grid), f1(grid), f2(grid);
     iQ2 = iQ * iQ;
     iQ3 = iQ * iQ2;
     set_uw(u, w, iQ2, iQ3);
     set_fj(f0, f1, f2, u, w);
     e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
   };
   /*! Computes the per-site quantities u and w from the traces of iQ2 and iQ3.
       iQ2 and iQ3 are only read, although they are taken by non-const
       reference. */
   void set_uw(LatticeComplex& u, LatticeComplex& w, GaugeLinkField& iQ2,
               GaugeLinkField& iQ3) const {
     Complex one_over_three = 1.0 / 3.0;
     Complex one_over_two = 1.0 / 2.0;
     GridBase* grid = u._grid;
     LatticeComplex c0(grid), c1(grid), tmp(grid), c0max(grid), theta(grid);
     // sign in c0 from the conventions on the Ta
     c0 = -imag(trace(iQ3)) * one_over_three;  
     c1 = -real(trace(iQ2)) * one_over_two;
     // Cayley Hamilton checks to machine precision, tested
     tmp = c1 * one_over_three;
     c0max = 2.0 * pow(tmp, 1.5);
     // NOTE(review): c0 / c0max is evaluated without any guard for
     // c0max == 0 (i.e. c1 == 0)
     theta = acos(c0 / c0max) *
             one_over_three;  // divide by three here, now leave as it is
     u = sqrt(tmp) * cos(theta);
     w = sqrt(c1) * sin(theta);
   }
   /*! Computes f0, f1, f2 from u and w as h_j / (9 u^2 - w^2). */
   void set_fj(LatticeComplex& f0, LatticeComplex& f1, LatticeComplex& f2,
               const LatticeComplex& u, const LatticeComplex& w) const {
     GridBase* grid = u._grid;
     LatticeComplex xi0(grid), u2(grid), w2(grid), cosw(grid);
     LatticeComplex fden(grid);
     LatticeComplex h0(grid), h1(grid), h2(grid);
     // qt is declared but not used below
     LatticeComplex e2iu(grid), emiu(grid), ixi0(grid), qt(grid);
     LatticeComplex unity(grid);
     unity = 1.0;
     xi0 = func_xi0(w);
     u2 = u * u;
     w2 = w * w;
     cosw = cos(w);
     ixi0 = timesI(xi0);
     // emiu = exp(-i u) and e2iu = exp(2 i u), written out with cos and sin
     emiu = cos(u) - timesI(sin(u));
     e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
     h0 = e2iu * (u2 - w2) +
          emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
     h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
     h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
     // common denominator; not guarded against 9 u^2 == w^2
     fden = unity / (9.0 * u2 - w2);  // reals
     f0 = h0 * fden;
     f1 = h1 * fden;
     f2 = h2 * fden;
   }
   /*! Returns sin(w)/w. The small-w check described below is commented out,
       so w == 0 is not handled. */
   LatticeComplex func_xi0(const LatticeComplex& w) const {
     // Define a function to do the check
     // if( w < 1e-4 ) std::cout << GridLogWarning<< "[Smear_stout] w too small:
     // "<< w <<"\n";
     return sin(w) / w;
   }
   /*! Returns cos(w)/w^2 - sin(w)/w^3. The small-w check described below is
       commented out, so w == 0 is not handled. */
   LatticeComplex func_xi1(const LatticeComplex& w) const {
     // Define a function to do the check
     // if( w < 1e-4 ) std::cout << GridLogWarning << "[Smear_stout] w too small:
     // "<< w <<"\n";
     return cos(w) / (w * w) - sin(w) / (w * w * w);
   }
 };
 }
 }
 #endif
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -43,7 +43,7 @@ public:
  template<typename vtype> using iSUnMatrix              = iScalar<iScalar<iMatrix<vtype, ncolour> > > ;
  template<typename vtype> using iSU2Matrix              = iScalar<iScalar<iMatrix<vtype, 2> > > ;
-
+  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix, SU<2>::LatticeMatrix etc...
  //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -552,15 +552,24 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
  }
  // reunitarise??
-  static void LieRandomize(GridParallelRNG     &pRNG,LatticeMatrix &out,double scale=1.0){
+  template<typename LatticeMatrixType>
  static void LieRandomize(GridParallelRNG     &pRNG,LatticeMatrixType &out,double scale=1.0){
    GridBase *grid = out._grid;
-
+    
-    LatticeComplex ca (grid);
+    typedef typename LatticeMatrixType::vector_type vector_type;
-    LatticeMatrix  lie(grid);
+    typedef typename LatticeMatrixType::scalar_type scalar_type;
-    LatticeMatrix  la (grid);
+    
-    Complex ci(0.0,scale);
+    typedef iSinglet<vector_type> vTComplexType;
-    Complex cone(1.0,0.0);
+    
-    Matrix ta;
+    typedef Lattice<vTComplexType> LatticeComplexType;
    typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
    LatticeComplexType ca (grid);
    LatticeMatrixType  lie(grid);
    LatticeMatrixType  la (grid);
    ComplexD ci(0.0,scale);
    ComplexD cone(1.0,0.0);
    MatrixType ta;
    lie=zero;
    for(int a=0;a<generators();a++){
@@ -596,9 +605,13 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
  }
-
+  template<typename GaugeField>
-  static void HotConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
+  static void HotConfiguration(GridParallelRNG &pRNG,GaugeField &out){
-    LatticeMatrix Umu(out._grid);
+    typedef typename GaugeField::vector_type vector_type;
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;
    LatticeMatrixType Umu(out._grid);
    for(int mu=0;mu<Nd;mu++){
      LieRandomize(pRNG,Umu,1.0);
      PokeIndex<LorentzIndex>(out,Umu,mu);
@@ -622,13 +635,15 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
  static void taProj( const LatticeMatrix &in,  LatticeMatrix &out){
    out = Ta(in);
  }
-  static void taExp( const LatticeMatrix &x,  LatticeMatrix &ex){ 
+  template<typename LatticeMatrixType>
-
+  static void taExp( const LatticeMatrixType &x,  LatticeMatrixType &ex){ 
-    LatticeMatrix xn(x._grid);
+    typedef typename LatticeMatrixType::scalar_type ComplexType;    
    LatticeMatrixType xn(x._grid);
    RealD nfac = 1.0;
    xn = x;
-    ex =xn+Complex(1.0); // 1+x
+    ex =xn+ComplexType(1.0); // 1+x
    // Do a 12th order exponentiation
    for(int i=2; i <= 12; ++i)
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -1,6 +1,6 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/utils/WilsonLoops.h
@@ -25,391 +25,501 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+    See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef QCD_UTILS_WILSON_LOOPS_H
 #define QCD_UTILS_WILSON_LOOPS_H
 namespace Grid {
 namespace QCD {
 // Common wilson loop observables
-template<class Gimpl>
+template <class Gimpl> class WilsonLoops : public Gimpl {
 class WilsonLoops : public Gimpl {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  typedef typename Gimpl::GaugeLinkField GaugeMat;
-  typedef typename Gimpl::GaugeField     GaugeLorentz;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
-  static void dirPlaquette(GaugeMat &plaq,const std::vector<GaugeMat> &U, const int mu, const int nu)
+  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
-  {
+                           const int mu, const int nu) {
-    // Annoyingly, must use either scope resolution to find dependent base class, 
+    // Annoyingly, must use either scope resolution to find dependent base
-    // or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
+    // class,
-    // resolution throughout the usage in this file, and rather defeats the purpose of deriving
+    // or this-> ; there is no "this" in a static method. This forces explicit
    // Gimpl scope
    // resolution throughout the usage in this file, and rather defeats the
    // purpose of deriving
    // from Gimpl.
-    plaq= Gimpl::CovShiftBackward(U[mu],mu,
+    plaq = Gimpl::CovShiftBackward(
-	  Gimpl::CovShiftBackward(U[nu],nu,
+        U[mu], mu, Gimpl::CovShiftBackward(
-          Gimpl::CovShiftForward (U[mu],mu,U[nu])));
+                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
  }
  //////////////////////////////////////////////////
  // trace of directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
-  static void traceDirPlaquette(LatticeComplex &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu)
+  static void traceDirPlaquette(LatticeComplex &plaq,
-  {
+                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
    GaugeMat sp(U[0]._grid);
-    dirPlaquette(sp,U,mu,nu);
+    dirPlaquette(sp, U, mu, nu);
-    plaq=trace(sp);
+    plaq = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of plaquette
  //////////////////////////////////////////////////
-  static void sitePlaquette(LatticeComplex &Plaq,const std::vector<GaugeMat> &U)
+  static void sitePlaquette(LatticeComplex &Plaq,
-  {
+                            const std::vector<GaugeMat> &U) {
    LatticeComplex sitePlaq(U[0]._grid);
-    Plaq=zero;
+    Plaq = zero;
-    for(int mu=1;mu<Nd;mu++){
+    for (int mu = 1; mu < Nd; mu++) {
-      for(int nu=0;nu<mu;nu++){
+      for (int nu = 0; nu < mu; nu++) {
-	traceDirPlaquette(sitePlaq,U,mu,nu);
+        traceDirPlaquette(sitePlaq, U, mu, nu);
-	Plaq = Plaq + sitePlaq;
+        Plaq = Plaq + sitePlaq;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
-  static RealD sumPlaquette(const GaugeLorentz &Umu){
+  static RealD sumPlaquette(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd,Umu._grid);
+    std::vector<GaugeMat> U(4, Umu._grid);
-    for(int mu=0;mu<Nd;mu++){
+    for (int mu = 0; mu < Nd; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Plaq(Umu._grid);
-    
+
-    sitePlaquette(Plaq,U);
+    sitePlaquette(Plaq, U);
-    
+
    TComplex Tp = sum(Plaq);
-    Complex p  = TensorRemove(Tp);
+    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
-  static RealD avgPlaquette(const GaugeLorentz &Umu){
+  static RealD avgPlaquette(const GaugeLorentz &Umu) {
    RealD sumplaq = sumPlaquette(Umu);
    double vol = Umu._grid->gSites();
-    
+    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
-    double faces = (1.0*Nd*(Nd-1))/2.0;
+    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
    return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
  }
  static RealD linkTrace(const GaugeLorentz &Umu){
    std::vector<GaugeMat> U(Nd,Umu._grid);
-    LatticeComplex Tr(Umu._grid); Tr=zero;
+  //////////////////////////////////////////////////
-    for(int mu=0;mu<Nd;mu++){
+  // average over traced single links
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  //////////////////////////////////////////////////
-      Tr = Tr+trace(U[mu]);
+  static RealD linkTrace(const GaugeLorentz &Umu) {
    std::vector<GaugeMat> U(4, Umu._grid);
    LatticeComplex Tr(Umu._grid);
    Tr = zero;
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
      Tr = Tr + trace(U[mu]);
    }
-    
+
    TComplex Tp = sum(Tr);
-    Complex p  = TensorRemove(Tp);
+    Complex p = TensorRemove(Tp);
    double vol = Umu._grid->gSites();
-    return p.real()/vol/((double)(Nd*(Nd-1)));
+    return p.real() / vol / 4.0 / 3.0;
  };
  //////////////////////////////////////////////////
-  // the sum over all staples on each site
+  // the sum over all staples on each site in direction mu,nu
  //////////////////////////////////////////////////
-  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
+  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                     int nu) {
    GridBase *grid = Umu._grid;
-    std::vector<GaugeMat> U(Nd,grid);
+    std::vector<GaugeMat> U(4, grid);
-    for(int d=0;d<Nd;d++){
+    for (int d = 0; d < Nd; d++) {
-      U[d] = PeekIndex<LorentzIndex>(Umu,d);
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    staple = zero;
    GaugeMat tmp(grid);
-    
+    if (nu != mu) {
    for(int nu=0;nu<Nd;nu++){
      if(nu != mu) {
      // mu
      // ^
      // |__>  nu
-      //    __ 
+      //    __
      //      |
      //    __|
      //
-	staple+=Gimpl::ShiftStaple(
+      staple += Gimpl::ShiftStaple(
-	        Gimpl::CovShiftForward (U[nu],nu, 
+          Gimpl::CovShiftForward(
-		Gimpl::CovShiftBackward(U[mu],mu,
+              U[nu], nu,
-		Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
+              Gimpl::CovShiftBackward(
                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
          mu);
-      //  __ 
+      //  __
-      // |   
+      // |
-      // |__ 
+      // |__
      //
      //
-	staple+=Gimpl::ShiftStaple(  
+      staple += Gimpl::ShiftStaple(
-                Gimpl::CovShiftBackward(U[nu],nu,		  		  
+          Gimpl::CovShiftBackward(U[nu], nu,
-		Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
          mu);
    }
  }
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  // Accumulate into `staple` the staples attached to the mu-direction link,
  // summed over every direction nu != mu (two terms per nu, drawn below).
  //
  // staple : output, zeroed first and then accumulated into.
  // Umu    : gauge field holding all Nd link directions.
  // mu     : direction of the link the staples are attached to.
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    GridBase *grid = Umu._grid;

    // Split the Lorentz-indexed field into one link field per direction.
    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    staple = zero;

    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
        // mu
        // ^
        // |__>  nu

        //    __
        //      |
        //    __|
        //
        staple += Gimpl::ShiftStaple(
            Gimpl::CovShiftForward(
                U[nu], nu,
                Gimpl::CovShiftBackward(
                    U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
            mu);

        //  __
        // |
        // |__
        //
        //
        staple += Gimpl::ShiftStaple(
            Gimpl::CovShiftBackward(U[nu], nu,
                                    Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
            mu);
      }
    }
  }
  //////////////////////////////////////////////////
  // the sum over all staples on each site in direction mu,nu, upper part
  //////////////////////////////////////////////////
  // Compute the single "upper" staple in the (mu, nu) plane attached to the
  // mu-direction link.
  //
  // staple : output; left as zero when nu == mu.
  // Umu    : gauge field holding all Nd link directions.
  // mu, nu : directions defining the plane.
  static void StapleUpper(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    staple = zero;

    if (nu != mu) {
      GridBase *grid = Umu._grid;
      // Size by Nd (not a literal 4) so the vector always matches the loop
      // below, as the other routines in this class do.
      std::vector<GaugeMat> U(Nd, grid);
      for (int d = 0; d < Nd; d++) {
        U[d] = PeekIndex<LorentzIndex>(Umu, d);
      }

      // mu
      // ^
      // |__>  nu

      //    __
      //      |
      //    __|
      //
      staple += Gimpl::ShiftStaple(
          Gimpl::CovShiftForward(
              U[nu], nu,
              Gimpl::CovShiftBackward(
                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
          mu);
    }
  }
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
-  static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
+  static void dirRectangle(GaugeMat &rect, const std::vector<GaugeMat> &U,
-  {
+                           const int mu, const int nu) {
-    rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
+    rect = Gimpl::CovShiftForward(
-	adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
+               U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->|
-    rect = rect + 
+           adj(Gimpl::CovShiftForward(
-          Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
+               U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu])));
-      adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
+    rect = rect +
           Gimpl::CovShiftForward(
               U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->||
               adj(Gimpl::CovShiftForward(
                   U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
  }
-  static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
+  static void traceDirRectangle(LatticeComplex &rect,
-  {
+                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
    GaugeMat sp(U[0]._grid);
-    dirRectangle(sp,U,mu,nu);
+    dirRectangle(sp, U, mu, nu);
-    rect=trace(sp);
+    rect = trace(sp);
  }
-  static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
+  static void siteRectangle(LatticeComplex &Rect,
-  {
+                            const std::vector<GaugeMat> &U) {
    LatticeComplex siteRect(U[0]._grid);
-    Rect=zero;
+    Rect = zero;
-    for(int mu=1;mu<Nd;mu++){
+    for (int mu = 1; mu < Nd; mu++) {
-      for(int nu=0;nu<mu;nu++){
+      for (int nu = 0; nu < mu; nu++) {
-	traceDirRectangle(siteRect,U,mu,nu);
+        traceDirRectangle(siteRect, U, mu, nu);
-	Rect = Rect + siteRect;
+        Rect = Rect + siteRect;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of rectangle
  //////////////////////////////////////////////////
-  static RealD sumRectangle(const GaugeLorentz &Umu){
+  static RealD sumRectangle(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd,Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu._grid);
-    for(int mu=0;mu<Nd;mu++){
+    for (int mu = 0; mu < Nd; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Rect(Umu._grid);
-    
+
-    siteRectangle(Rect,U);
+    siteRectangle(Rect, U);
-    
+
    TComplex Tp = sum(Rect);
-    Complex p  = TensorRemove(Tp);
+    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of rectangle
  //////////////////////////////////////////////////
-  static RealD avgRectangle(const GaugeLorentz &Umu){
+  static RealD avgRectangle(const GaugeLorentz &Umu) {
    RealD sumrect = sumRectangle(Umu);
-    
+
    double vol = Umu._grid->gSites();
-    
+
-    double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
+    double faces = (1.0 * Nd * (Nd - 1)); // 2 distinct orientations summed
-    
+
-    return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
+    return sumrect / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
-  static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
+  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * Cshift(U,mu,1);
+    U2 = U * Cshift(U, mu, 1);
  }
  ////////////////////////////////////////////////////////////////////////////
-  // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
+  // Hop by two optimisation strategy does not work nicely with Gparity. (could
  // do,
  // but need to track two deep where cross boundary and apply a conjugation).
-  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
+  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
  // so .
  ////////////////////////////////////////////////////////////////////////////
-  static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
+  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
                                  std::vector<GaugeMat> &U, int mu) {
    Stap = zero;
    GridBase *grid = U[0]._grid;
-    GaugeMat Staple2x1 (grid);
+    GaugeMat Staple2x1(grid);
-    GaugeMat tmp (grid);
+    GaugeMat tmp(grid);
-    for(int nu=0;nu<Nd;nu++){
+    for (int nu = 0; nu < Nd; nu++) {
-      if ( nu!=mu) {
+      if (nu != mu) {
-	// Up staple    ___ ___ 
+        // Up staple    ___ ___
-	//             |       |
+        //             |       |
-	tmp = Cshift(adj(U[nu]),nu,-1); 
+        tmp = Cshift(adj(U[nu]), nu, -1);
-	tmp = adj(U2[mu])*tmp;
+        tmp = adj(U2[mu]) * tmp;
-	tmp = Cshift(tmp,mu,-2);
+        tmp = Cshift(tmp, mu, -2);
-	Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
+        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
        // Down staple
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
-	// Down staple
+        //              ___ ___
-	//             |___ ___|
+        //             |    ___|
-	//
+        //             |___ ___|
-	tmp = adj(U2[mu])*U[nu];
+        //
 	Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
        Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
-	//              ___ ___
+        //              ___ ___
-	//             |    ___|
+        //             |___    |
-	//             |___ ___|
+        //             |___ ___|
-	//
+        //
-	Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
+        //	tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //	Stap+= Cshift(tmp,mu,1) ;
        Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
        ;
-	//              ___ ___
+        //       --
-	//             |___    |
+        //      |  |
-	//             |___ ___|
+        //
-	//
+        //      |  |
-	//	tmp= Staple2x1* Cshift(U[mu],mu,-2);
+        tmp = Cshift(adj(U2[nu]), nu, -2);
-	//	Stap+= Cshift(tmp,mu,1) ;
+        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-	Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
+        tmp = U2[nu] * Cshift(tmp, nu, 2);
        Stap += Cshift(tmp, mu, 1);
-	//       --    
+        //      |  |
-	//      |  |              
+        //
-	//          
+        //      |  |
-	//      |  | 
+        //       --
 	tmp = Cshift(adj(U2[nu]),nu,-2);
 	tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
 	tmp = U2[nu]*Cshift(tmp,nu,2);
 	Stap+= Cshift(tmp, mu, 1);
-	//      |  |              
+        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
-	//          
+        tmp = adj(U2[nu]) * tmp;
-	//      |  | 
+        tmp = Cshift(tmp, nu, -2);
-	//       -- 
+        Stap += Cshift(tmp, mu, 1);
-	
+      }
 	tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
 	tmp = adj(U2[nu])*tmp;
 	tmp = Cshift(tmp,nu,-2);
 	Stap+=Cshift(tmp, mu, 1);
    }}
  }
  static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
  {
    RectStapleUnoptimised(Stap,Umu,mu);
  }
  static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
 			 std::vector<GaugeMat> &U2,
 			 std::vector<GaugeMat> &U, int mu)
  {
    if ( Gimpl::isPeriodicGaugeField() ){ 
      RectStapleOptimised(Stap,U2,U,mu);
    } else {
      RectStapleUnoptimised(Stap,Umu,mu);
    }
  }
-  static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
+  static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
    RectStapleUnoptimised(Stap, Umu, mu);
  }
  // Rectangle staple for direction mu, choosing the implementation from the
  // gauge implementation: RectStapleOptimised (using the caller-supplied U
  // and U2 link vectors) when Gimpl reports a periodic gauge field, otherwise
  // RectStapleUnoptimised on the full field Umu.
  static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
                         std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
                         int mu) {
    if (!Gimpl::isPeriodicGaugeField()) {
      RectStapleUnoptimised(Stap, Umu, mu);
      return;
    }
    RectStapleOptimised(Stap, U2, U, mu);
  }
  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
    GridBase *grid = Umu._grid;
-    std::vector<GaugeMat> U(Nd,grid);
+    std::vector<GaugeMat> U(Nd, grid);
-    for(int d=0;d<Nd;d++){
+    for (int d = 0; d < Nd; d++) {
-      U[d] = PeekIndex<LorentzIndex>(Umu,d);
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    Stap=zero;
+    Stap = zero;
-    for(int nu=0;nu<Nd;nu++){
+    for (int nu = 0; nu < Nd; nu++) {
-      if ( nu!=mu) {
+      if (nu != mu) {
-    //           __ ___ 
+        //           __ ___
-    //          |    __ |
+        //          |    __ |
-    //
+        //
-    Stap+= Gimpl::ShiftStaple(
+        Stap += Gimpl::ShiftStaple(
-		  Gimpl::CovShiftForward (U[mu],mu,
+            Gimpl::CovShiftForward(
-		  Gimpl::CovShiftForward (U[nu],nu,
+                U[mu], mu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
+                Gimpl::CovShiftForward(
-                  Gimpl::CovShiftBackward(U[mu],mu,
+                    U[nu], nu,
-		  Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
+                    Gimpl::CovShiftBackward(
                        U[mu], mu,
                        Gimpl::CovShiftBackward(
                            U[mu], mu,
                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
            mu);
-    //              __ 
+        //              __
-    //          |__ __ |
+        //          |__ __ |
-    Stap+= Gimpl::ShiftStaple(
+        Stap += Gimpl::ShiftStaple(
-                  Gimpl::CovShiftForward (U[mu],mu,
+            Gimpl::CovShiftForward(
-		  Gimpl::CovShiftBackward(U[nu],nu,
+                U[mu], mu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
+                Gimpl::CovShiftBackward(
-                  Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
+                    U[nu], nu,
                    Gimpl::CovShiftBackward(
                        U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
            mu);
-    //           __ 
+        //           __
-    //          |__ __ |
+        //          |__ __ |
-    Stap+= Gimpl::ShiftStaple(
+        Stap += Gimpl::ShiftStaple(
-		  Gimpl::CovShiftBackward(U[nu],nu,
+            Gimpl::CovShiftBackward(
-		  Gimpl::CovShiftBackward(U[mu],mu,
+                U[nu], nu,
-		  Gimpl::CovShiftBackward(U[mu],mu,
+                Gimpl::CovShiftBackward(
-		  Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
+                    U[mu], mu,
                    Gimpl::CovShiftBackward(
                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
            mu);
-    //           __ ___ 
+        //           __ ___
-    //          |__    |
+        //          |__    |
-    Stap+= Gimpl::ShiftStaple(
+        Stap += Gimpl::ShiftStaple(
-		   Gimpl::CovShiftForward (U[nu],nu,
+            Gimpl::CovShiftForward(
-	           Gimpl::CovShiftBackward(U[mu],mu,
+                U[nu], nu,
-                   Gimpl::CovShiftBackward(U[mu],mu,
+                Gimpl::CovShiftBackward(
-                   Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
+                    U[mu], mu,
                    Gimpl::CovShiftBackward(
                        U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
            mu);
-     //       --    
+        //       --
-     //      |  |              
+        //      |  |
-     //          
+        //
-     //      |  | 
+        //      |  |
    Stap+= Gimpl::ShiftStaple(
 		   Gimpl::CovShiftForward(U[nu],nu,
 		   Gimpl::CovShiftForward(U[nu],nu,
                   Gimpl::CovShiftBackward(U[mu],mu,
                   Gimpl::CovShiftBackward(U[nu],nu,
 		   Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
        Stap += Gimpl::ShiftStaple(
            Gimpl::CovShiftForward(
                U[nu], nu,
                Gimpl::CovShiftForward(
                    U[nu], nu,
                    Gimpl::CovShiftBackward(
                        U[mu], mu,
                        Gimpl::CovShiftBackward(
                            U[nu], nu,
                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
            mu);
-     //      |  |              
+        //      |  |
-     //          
+        //
-     //      |  | 
+        //      |  |
-     //       -- 
+        //       --
-     
+
-    Stap+= Gimpl::ShiftStaple(
+        Stap += Gimpl::ShiftStaple(
-		   Gimpl::CovShiftBackward(U[nu],nu,
+            Gimpl::CovShiftBackward(
-		   Gimpl::CovShiftBackward(U[nu],nu,
+                U[nu], nu,
-                   Gimpl::CovShiftBackward(U[mu],mu,
+                Gimpl::CovShiftBackward(
-                   Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
+                    U[nu], nu,
-    }}
+                    Gimpl::CovShiftBackward(
                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
            mu);
      }
    }
  }
 };
 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
 }
 }
- typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
+#endif
 typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
 }}
 #endif
--- a/lib/serialisation/.dirstamp
+++ b/lib/serialisation/.dirstamp
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -1,227 +1,234 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/simd/Grid_vector_unops.h
+Source file: ./lib/simd/Grid_vector_unops.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_VECTOR_UNOPS
 #define GRID_VECTOR_UNOPS
 #include <cmath>
-namespace Grid { 
+namespace Grid {
-  template<class scalar> struct SqrtRealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a) const {
+struct SqrtRealFunctor {
-      return sqrt(real(a));
+  scalar operator()(const scalar &a) const { return sqrt(real(a)); }
-    }
+};
  };
-  template<class scalar> struct RSqrtRealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a)  const {
+struct RSqrtRealFunctor {
-      return scalar(1.0/sqrt(real(a)));
+  scalar operator()(const scalar &a) const {
-    }
+    return scalar(1.0 / sqrt(real(a)));
-  };
+  }
 };
-  template<class scalar> struct CosRealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a)  const {
+struct CosRealFunctor {
-      return cos(real(a));
+  scalar operator()(const scalar &a) const { return cos(real(a)); }
-    }
+};
  };
-  template<class scalar> struct SinRealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a)  const {
+struct SinRealFunctor {
-      return sin(real(a));
+  scalar operator()(const scalar &a) const { return sin(real(a)); }
-    }
+};
  };
-  template<class scalar> struct LogRealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a)  const {
+struct AcosRealFunctor {
-      return log(real(a));
+  scalar operator()(const scalar &a) const { return acos(real(a)); }
-    }
+};
  };
-  template<class scalar> struct ExpRealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a)  const {
+struct AsinRealFunctor {
-      return exp(real(a));
+  scalar operator()(const scalar &a) const { return asin(real(a)); }
-    }
+};
  };
  template<class scalar> struct NotFunctor {
    scalar operator()(const scalar &a)  const {
      return (!a);
    }
  };
  template<class scalar> struct AbsRealFunctor {
    scalar operator()(const scalar &a)  const {
      return std::abs(real(a));
    }
  };
-  template<class scalar> struct PowRealFunctor {
+template <class scalar>
-    double y;
+struct LogRealFunctor {
-  PowRealFunctor(double _y) : y(_y) {};
+  scalar operator()(const scalar &a) const { return log(real(a)); }
-    scalar operator()(const scalar &a)  const {
+};
      return pow(real(a),y);
    }
  };
-  template<class scalar> struct ModIntFunctor {
+template <class scalar>
-    Integer y;
+struct ExpRealFunctor {
-  ModIntFunctor(Integer _y) : y(_y) {};
+  scalar operator()(const scalar &a) const { return exp(real(a)); }
-    scalar operator()(const scalar &a)  const {
+};
-      return Integer(a)%y;
+template <class scalar>
-    }
+struct NotFunctor {
-  };
+  scalar operator()(const scalar &a) const { return (!a); }
 };
 template <class scalar>
 struct AbsRealFunctor {
  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 };
-  template<class scalar> struct DivIntFunctor {
+template <class scalar>
-    Integer y;
+struct PowRealFunctor {
-  DivIntFunctor(Integer _y) : y(_y) {};
+  double y;
-    scalar operator()(const scalar &a)  const {
+  PowRealFunctor(double _y) : y(_y){};
-      return Integer(a)/y;
+  scalar operator()(const scalar &a) const { return pow(real(a), y); }
-    }
+};
  };
-  template<class scalar> struct RealFunctor {
+template <class scalar>
-    scalar operator()(const scalar &a)  const {
+struct ModIntFunctor {
-      return real(a);
+  Integer y;
-    }
+  ModIntFunctor(Integer _y) : y(_y){};
-  };
+  scalar operator()(const scalar &a) const { return Integer(a) % y; }
-  template<class scalar> struct ImagFunctor {
+};
    scalar operator()(const scalar &a)  const {
      return imag(a);
    }
  };
  template < class S, class V > 
  inline Grid_simd<S,V> real(const Grid_simd<S,V> &r) {
    return SimdApply(RealFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> imag(const Grid_simd<S,V> &r) {
    return SimdApply(ImagFunctor<S>(),r);
  }
-  template < class S, class V > 
+template <class scalar>
-  inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
+struct DivIntFunctor {
-    return SimdApply(SqrtRealFunctor<S>(),r);
+  Integer y;
-  }
+  DivIntFunctor(Integer _y) : y(_y){};
-  template < class S, class V > 
+  scalar operator()(const scalar &a) const { return Integer(a) / y; }
-  inline Grid_simd<S,V> rsqrt(const Grid_simd<S,V> &r) {
+};
    return SimdApply(RSqrtRealFunctor<S>(),r);
  }
  template < class Scalar > 
  inline Scalar rsqrt(const Scalar &r) {
    return (RSqrtRealFunctor<Scalar>(),r);
  }
-  template < class S, class V > 
+template <class scalar>
-  inline Grid_simd<S,V> cos(const Grid_simd<S,V> &r) {
+struct RealFunctor {
-    return SimdApply(CosRealFunctor<S>(),r);
+  scalar operator()(const scalar &a) const { return std::real(a); }
-  }
+};
-  template < class S, class V > 
+template <class scalar>
-  inline Grid_simd<S,V> sin(const Grid_simd<S,V> &r) {
+struct ImagFunctor {
-    return SimdApply(SinRealFunctor<S>(),r);
+  scalar operator()(const scalar &a) const { return std::imag(a); }
-  }
+};
-  template < class S, class V > 
+template <class S, class V>
-  inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
+inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
-    return SimdApply(LogRealFunctor<S>(),r);
+  return SimdApply(RealFunctor<S>(), r);
-  }
+}
-  template < class S, class V > 
+template <class S, class V>
-  inline Grid_simd<S,V> abs(const Grid_simd<S,V> &r) {
+inline Grid_simd<S, V> imag(const Grid_simd<S, V> &r) {
-    return SimdApply(AbsRealFunctor<S>(),r);
+  return SimdApply(ImagFunctor<S>(), r);
-  }
+}
-  template < class S, class V > 
+template <class S, class V>
-  inline Grid_simd<S,V> exp(const Grid_simd<S,V> &r) {
+inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
-    return SimdApply(ExpRealFunctor<S>(),r);
+  return SimdApply(SqrtRealFunctor<S>(), r);
-  }
+}
-  template < class S, class V > 
+template <class S, class V>
-  inline Grid_simd<S,V> Not(const Grid_simd<S,V> &r) {
+inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
-    return SimdApply(NotFunctor<S>(),r);
+  return SimdApply(RSqrtRealFunctor<S>(), r);
-  }
+}
-  template < class S, class V > 
+template <class Scalar>
-  inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
+inline Scalar rsqrt(const Scalar &r) {
-    return SimdApply(PowRealFunctor<S>(y),r);
+  return (RSqrtRealFunctor<Scalar>(), r);
-  }
+}
  template < class S, class V > 
  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(ModIntFunctor<S>(y),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(DivIntFunctor<S>(y),r);
  }
  ////////////////////////////////////////////////////////////////////////////
  // Allows us to assign into **conformable** real vectors from complex
  ////////////////////////////////////////////////////////////////////////////
  //  template < class S, class V > 
  //  inline auto ComplexRemove(const Grid_simd<S,V> &c) -> Grid_simd<Grid_simd<S,V>::Real,V> {
  //    Grid_simd<Grid_simd<S,V>::Real,V> ret;
  //    ret.v = c.v;
  //    return ret;
  //  }
  template<class scalar> struct AndFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x & y;
    }
  };
  template<class scalar> struct OrFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x | y;
    }
  };
  template<class scalar> struct AndAndFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x && y;
    }
  };
  template<class scalar> struct OrOrFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x || y;
    }
  };
-  ////////////////////////////////
+template <class S, class V>
-  // Calls to simd binop functors
+inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
-  ////////////////////////////////
+  return SimdApply(CosRealFunctor<S>(), r);
-  template < class S, class V > 
+}
-  inline Grid_simd<S,V> operator &(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+template <class S, class V>
-    return SimdApplyBinop(AndFunctor<S>(),x,y);
+inline Grid_simd<S, V> sin(const Grid_simd<S, V> &r) {
-  }
+  return SimdApply(SinRealFunctor<S>(), r);
-  template < class S, class V > 
+}
-  inline Grid_simd<S,V> operator &&(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+template <class S, class V>
-    return SimdApplyBinop(AndAndFunctor<S>(),x,y);
+inline Grid_simd<S, V> acos(const Grid_simd<S, V> &r) {
-  }
+  return SimdApply(AcosRealFunctor<S>(), r);
-  template < class S, class V > 
+}
-  inline Grid_simd<S,V> operator |(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+template <class S, class V>
-    return SimdApplyBinop(OrFunctor<S>(),x,y);
+inline Grid_simd<S, V> asin(const Grid_simd<S, V> &r) {
-  }
+  return SimdApply(AsinRealFunctor<S>(), r);
-  template < class S, class V > 
+}
-  inline Grid_simd<S,V> operator ||(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+template <class S, class V>
-    return SimdApplyBinop(OrOrFunctor<S>(),x,y);
+inline Grid_simd<S, V> log(const Grid_simd<S, V> &r) {
-  }
+  return SimdApply(LogRealFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
  return SimdApply(AbsRealFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
  return SimdApply(ExpRealFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
  return SimdApply(NotFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> pow(const Grid_simd<S, V> &r, double y) {
  return SimdApply(PowRealFunctor<S>(y), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> mod(const Grid_simd<S, V> &r, Integer y) {
  return SimdApply(ModIntFunctor<S>(y), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
  return SimdApply(DivIntFunctor<S>(y), r);
 }
 ////////////////////////////////////////////////////////////////////////////
 // Allows us to assign into **conformable** real vectors from complex
 ////////////////////////////////////////////////////////////////////////////
 //  template < class S, class V >
 //  inline auto ComplexRemove(const Grid_simd<S,V> &c) ->
 //  Grid_simd<Grid_simd<S,V>::Real,V> {
 //    Grid_simd<Grid_simd<S,V>::Real,V> ret;
 //    ret.v = c.v;
 //    return ret;
 //  }
 template <class scalar>
 struct AndFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
 };
 template <class scalar>
 struct OrFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x | y; }
 };
 template <class scalar>
 struct AndAndFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x && y; }
 };
 template <class scalar>
 struct OrOrFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x || y; }
 };
 ////////////////////////////////
 // Calls to simd binop functors
 ////////////////////////////////
 template <class S, class V>
 inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
                                 const Grid_simd<S, V> &y) {
  return SimdApplyBinop(AndFunctor<S>(), x, y);
 }
 template <class S, class V>
 inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
                                  const Grid_simd<S, V> &y) {
  return SimdApplyBinop(AndAndFunctor<S>(), x, y);
 }
 template <class S, class V>
 inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
                                 const Grid_simd<S, V> &y) {
  return SimdApplyBinop(OrFunctor<S>(), x, y);
 }
 template <class S, class V>
 inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
                                  const Grid_simd<S, V> &y) {
  return SimdApplyBinop(OrOrFunctor<S>(), x, y);
 }
 }
 #endif
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -1,31 +1,32 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/tensors/Tensor_class.h
+Source file: ./lib/tensors/Tensor_class.h
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MATH_TENSORS_H
 #define GRID_MATH_TENSORS_H
@@ -38,17 +39,18 @@ namespace Grid {
 // It is useful to NOT have any constructors
 // so that these classes assert "is_pod<class> == true"
-// because then the standard C++ valarray container eliminates fill overhead on new allocation and 
+// because then the standard C++ valarray container eliminates fill overhead on
 // new allocation and
 // non-move copying.
 //
-// However note that doing this eliminates some syntactical sugar such as 
+// However note that doing this eliminates some syntactical sugar such as
 // calling the constructor explicitly or implicitly
 //
 class GridTensorBase {};
-template<class vtype> class iScalar 
+template <class vtype>
-{
+class iScalar {
-public:
+ public:
  vtype _internal;
  typedef vtype element;
@@ -60,13 +62,14 @@ public:
  typedef iScalar<recurse_scalar_object> scalar_object;
  // substitutes a real or complex version with same tensor structure
-  typedef iScalar<typename GridTypeMapper<vtype>::Complexified > Complexified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified;
-  typedef iScalar<typename GridTypeMapper<vtype>::Realified >    Realified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified;
-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
  // Scalar no action
-  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
+  //  template<int Level> using tensor_reduce_level = typename
  //  iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
  iScalar() = default;
  /*
  iScalar(const iScalar<vtype> &copyme)=default;
@@ -74,83 +77,106 @@ public:
  iScalar<vtype> & operator= (const iScalar<vtype> &copyme) = default;
  iScalar<vtype> & operator= (iScalar<vtype> &&copyme) = default;
  */
-  iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
+  iScalar(scalar_type s)
-  iScalar(const Zero &z){ *this = zero; };
+      : _internal(s){};  // recurse down and hit the constructor for vector_type
  iScalar(const Zero &z) { *this = zero; };
-  iScalar<vtype> & operator= (const Zero &hero){
+  iScalar<vtype> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
+  friend strong_inline void vstream(iScalar<vtype> &out,
-    vstream(out._internal,in._internal);
+                                    const iScalar<vtype> &in) {
    vstream(out._internal, in._internal);
  }
-  friend strong_inline void zeroit(iScalar<vtype> &that){
+  friend strong_inline void zeroit(iScalar<vtype> &that) {
    zeroit(that._internal);
  }
-  friend strong_inline void prefetch(iScalar<vtype> &that){
+  friend strong_inline void prefetch(iScalar<vtype> &that) {
    prefetch(that._internal);
  }
-  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
+  friend strong_inline void permute(iScalar<vtype> &out,
-    permute(out._internal,in._internal,permutetype);
+                                    const iScalar<vtype> &in, int permutetype) {
    permute(out._internal, in._internal, permutetype);
  }
  // Unary negation
-  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
+  friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
    iScalar<vtype> ret;
-    ret._internal= -r._internal;
+    ret._internal = -r._internal;
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
+  strong_inline iScalar<vtype> &operator*=(const iScalar<vtype> &r) {
-    *this = (*this)*r;
+    *this = (*this) * r;
    return *this;
  }
-  strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
+  strong_inline iScalar<vtype> &operator-=(const iScalar<vtype> &r) {
-    *this = (*this)-r;
+    *this = (*this) - r;
    return *this;
  }
-  strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
+  strong_inline iScalar<vtype> &operator+=(const iScalar<vtype> &r) {
-    *this = (*this)+r;
+    *this = (*this) + r;
    return *this;
  }
-  strong_inline vtype & operator ()(void) {
+  strong_inline vtype &operator()(void) { return _internal; }
-    return _internal;
+  strong_inline const vtype &operator()(void) const { return _internal; }
  }
  strong_inline const vtype & operator ()(void) const {
    return _internal;
  }
  // Type casts meta programmed, must be pure scalar to match TensorRemove
-  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0> operator ComplexF () const { return(TensorRemove(_internal)); };
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
-  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0> operator ComplexD () const { return(TensorRemove(_internal)); };
+            IfNotSimd<U> = 0>
-  //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0> operator RealD    () const { return(real(TensorRemove(_internal))); }
+  operator ComplexF() const {
-  template<class U=vtype,class V=scalar_type,IfReal<V>    = 0,IfNotSimd<U> = 0> operator RealD    () const { return TensorRemove(_internal); }
+    return (TensorRemove(_internal));
-  template<class U=vtype,class V=scalar_type,IfInteger<V> = 0,IfNotSimd<U> = 0> operator Integer  () const { return Integer(TensorRemove(_internal)); }
+  };
-  
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0,
-  // convert from a something to a scalar via constructor of something arg
+            IfNotSimd<U> = 0>
-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline iScalar<vtype> operator = (T arg)
+  operator ComplexD() const {
-    { 
+    return (TensorRemove(_internal));
-      _internal = arg;
+  };
-      return *this;
+  //  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> =
-    }
+  //  0> operator RealD    () const { return(real(TensorRemove(_internal))); }
  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,
            IfNotSimd<U> = 0>
  operator RealD() const {
    return TensorRemove(_internal);
  }
  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0,
            IfNotSimd<U> = 0>
  operator Integer() const {
    return Integer(TensorRemove(_internal));
  }
-    friend std::ostream& operator<< (std::ostream& stream, const iScalar<vtype> &o){
+  // convert from a something to a scalar via constructor of something arg
-      stream<< "S {"<<o._internal<<"}";
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
-      return stream;
+                         * = nullptr>
-    };
+  strong_inline iScalar<vtype> operator=(T arg) {
    _internal = arg;
    return *this;
  }
  friend std::ostream &operator<<(std::ostream &stream,
                                  const iScalar<vtype> &o) {
    stream << "S {" << o._internal << "}";
    return stream;
  };
 };
 ///////////////////////////////////////////////////////////
 // Allows to turn scalar<scalar<scalar<double>>>> back to double.
 ///////////////////////////////////////////////////////////
-template<class T>     strong_inline typename std::enable_if<!isGridTensor<T>::value, T>::type TensorRemove(T arg) { return arg;}
+template <class T>
-template<class vtype> strong_inline auto TensorRemove(iScalar<vtype> arg) -> decltype(TensorRemove(arg._internal))
+strong_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
-{
+TensorRemove(T arg) {
  return arg;
 }
 template <class vtype>
 strong_inline auto TensorRemove(iScalar<vtype> arg)
    -> decltype(TensorRemove(arg._internal)) {
  return TensorRemove(arg._internal);
 }
-    
+
-template<class vtype,int N> class iVector 
+template <class vtype, int N>
-{
+class iVector {
-public:
+ public:
  vtype _internal[N];
  typedef vtype element;
@@ -159,23 +185,23 @@ public:
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iVector<recurse_scalar_object,N> scalar_object;
+  typedef iVector<recurse_scalar_object, N> scalar_object;
  // substitutes a real or complex version with same tensor structure
-  typedef iVector<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
+  typedef iVector<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
-  typedef iVector<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+  typedef iVector<typename GridTypeMapper<vtype>::Realified, N> Realified;
-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iVector<vtype,N>
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
-    { 
+                         * = nullptr>
-      zeroit(*this);
+  strong_inline auto operator=(T arg) -> iVector<vtype, N> {
-      for(int i=0;i<N;i++)
+    zeroit(*this);
-	_internal[i] = arg;
+    for (int i = 0; i < N; i++) _internal[i] = arg;
-      return *this;
+    return *this;
-    }
+  }
-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
-  iVector(const Zero &z){ *this = zero; };
+  iVector(const Zero &z) { *this = zero; };
-  iVector() =default;
+  iVector() = default;
  /*
  iVector(const iVector<vtype,N> &copyme)=default;
  iVector(iVector<vtype,N> &&copyme)=default;
@@ -183,71 +209,71 @@ public:
  iVector<vtype,N> & operator= (iVector<vtype,N> &&copyme) = default;
  */
-  iVector<vtype,N> & operator= (const Zero &hero){
+  iVector<vtype, N> &operator=(const Zero &hero) {
    zeroit(*this);
    return *this;
  }
-  friend strong_inline void zeroit(iVector<vtype,N> &that){
+  friend strong_inline void zeroit(iVector<vtype, N> &that) {
-    for(int i=0;i<N;i++){
+    for (int i = 0; i < N; i++) {
      zeroit(that._internal[i]);
    }
  }
-  friend strong_inline void prefetch(iVector<vtype,N> &that){
+  friend strong_inline void prefetch(iVector<vtype, N> &that) {
-    for(int i=0;i<N;i++) prefetch(that._internal[i]);
+    for (int i = 0; i < N; i++) prefetch(that._internal[i]);
  }
-  friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
+  friend strong_inline void vstream(iVector<vtype, N> &out,
-    for(int i=0;i<N;i++){
+                                    const iVector<vtype, N> &in) {
-      vstream(out._internal[i],in._internal[i]);
+    for (int i = 0; i < N; i++) {
      vstream(out._internal[i], in._internal[i]);
    }
  }
-  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
+  friend strong_inline void permute(iVector<vtype, N> &out,
-    for(int i=0;i<N;i++){
+                                    const iVector<vtype, N> &in,
-      permute(out._internal[i],in._internal[i],permutetype);
+                                    int permutetype) {
    for (int i = 0; i < N; i++) {
      permute(out._internal[i], in._internal[i], permutetype);
    }
  }
  // Unary negation
-  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
+  friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
-    iVector<vtype,N> ret;
+    iVector<vtype, N> ret;
-    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+    for (int i = 0; i < N; i++) ret._internal[i] = -r._internal[i];
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
+  strong_inline iVector<vtype, N> &operator*=(const iScalar<vtype> &r) {
-    *this = (*this)*r;
+    *this = (*this) * r;
    return *this;
  }
-  strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
+  strong_inline iVector<vtype, N> &operator-=(const iVector<vtype, N> &r) {
-    *this = (*this)-r;
+    *this = (*this) - r;
    return *this;
  }
-  strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
+  strong_inline iVector<vtype, N> &operator+=(const iVector<vtype, N> &r) {
-    *this = (*this)+r;
+    *this = (*this) + r;
    return *this;
  }
-  strong_inline vtype & operator ()(int i) {
+  strong_inline vtype &operator()(int i) { return _internal[i]; }
-    return _internal[i];
+  strong_inline const vtype &operator()(int i) const { return _internal[i]; }
-  }
+  friend std::ostream &operator<<(std::ostream &stream,
-  strong_inline const vtype & operator ()(int i) const {
+                                  const iVector<vtype, N> &o) {
-    return _internal[i];
+    stream << "V<" << N << ">{";
-  }
+    for (int i = 0; i < N; i++) {
-  friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
+      stream << o._internal[i];
-    stream<< "V<"<<N<<">{";
+      if (i < N - 1) stream << ",";
    for(int i=0;i<N;i++) {
      stream<<o._internal[i];
      if (i<N-1)	stream<<",";
    }
-    stream<<"}";
+    stream << "}";
    return stream;
  };
  //    strong_inline vtype && operator ()(int i) {
  //      return _internal[i];
  //    }
 };
-    
+
-template<class vtype,int N> class iMatrix 
+template <class vtype, int N>
-{
+class iMatrix {
-public:
+ public:
  vtype _internal[N][N];
  typedef vtype element;
@@ -257,29 +283,27 @@ public:
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  // substitutes a real or complex version with same tensor structure
-  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
-  typedef iMatrix<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Realified, N> Realified;
  // Tensure removal
  typedef iScalar<tensor_reduced_v> tensor_reduced;
-  typedef iMatrix<recurse_scalar_object,N> scalar_object;
+  typedef iMatrix<recurse_scalar_object, N> scalar_object;
-  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
  iMatrix(const Zero &z) { *this = zero; };
  iMatrix() = default;
-  iMatrix(const Zero &z){ *this = zero; };
+  iMatrix &operator=(const iMatrix &rhs) {
-  iMatrix() =default;
+    for (int i = 0; i < N; i++)
-  
+      for (int j = 0; j < N; j++) vstream(_internal[i][j], rhs._internal[i][j]);
  iMatrix& operator=(const iMatrix& rhs){
    for(int i=0;i<N;i++)
      for(int j=0;j<N;j++)
 	vstream(_internal[i][j],rhs._internal[i][j]);
    return *this;
-  }; 
+  };
-  iMatrix(scalar_type s)  { (*this) = s ;};// recurse down and hit the constructor for vector_type
+  iMatrix(scalar_type s) {
    (*this) = s;
  };  // recurse down and hit the constructor for vector_type
  /*
  iMatrix(const iMatrix<vtype,N> &copyme)=default;
@@ -288,118 +312,118 @@ public:
  iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
  */
-
+  iMatrix<vtype, N> &operator=(const Zero &hero) {
  iMatrix<vtype,N> & operator= (const Zero &hero){
    zeroit(*this);
    return *this;
  }
-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iMatrix<vtype,N>
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type
-    { 
+                         * = nullptr>
-      zeroit(*this);
+  strong_inline auto operator=(T arg) -> iMatrix<vtype, N> {
-      for(int i=0;i<N;i++)
+    zeroit(*this);
-	_internal[i][i] = arg;
+    for (int i = 0; i < N; i++) _internal[i][i] = arg;
-      return *this;
+    return *this;
  }
  friend strong_inline void zeroit(iMatrix<vtype, N> &that) {
    for (int i = 0; i < N; i++) {
      for (int j = 0; j < N; j++) {
        zeroit(that._internal[i][j]);
      }
    }
  friend strong_inline void zeroit(iMatrix<vtype,N> &that){
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	zeroit(that._internal[i][j]);
    }}
  }
-  friend strong_inline void prefetch(iMatrix<vtype,N> &that){
+  friend strong_inline void prefetch(iMatrix<vtype, N> &that) {
-    for(int i=0;i<N;i++) 
+    for (int i = 0; i < N; i++)
-    for(int j=0;j<N;j++) 
+      for (int j = 0; j < N; j++) prefetch(that._internal[i][j]);
      prefetch(that._internal[i][j]);
  }
-  friend strong_inline void vstream(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in){
+  friend strong_inline void vstream(iMatrix<vtype, N> &out,
-      for(int i=0;i<N;i++){
+                                    const iMatrix<vtype, N> &in) {
-      for(int j=0;j<N;j++){
+    for (int i = 0; i < N; i++) {
-	vstream(out._internal[i][j],in._internal[i][j]);
+      for (int j = 0; j < N; j++) {
-      }}
+        vstream(out._internal[i][j], in._internal[i][j]);
      }
    }
  friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
  }
  friend strong_inline void permute(iMatrix<vtype, N> &out,
                                    const iMatrix<vtype, N> &in,
                                    int permutetype) {
    for (int i = 0; i < N; i++) {
      for (int j = 0; j < N; j++) {
        permute(out._internal[i][j], in._internal[i][j], permutetype);
      }
    }
  }
  // Unary negation
-  friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
+  friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
-    iMatrix<vtype,N> ret;
+    iMatrix<vtype, N> ret;
-    for(int i=0;i<N;i++){
+    for (int i = 0; i < N; i++) {
-      for(int j=0;j<N;j++){
+      for (int j = 0; j < N; j++) {
-	ret._internal[i][j]= -r._internal[i][j];
+        ret._internal[i][j] = -r._internal[i][j];
-    }}
+      }
    }
    return ret;
  }
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
-  template<class T>
+  template <class T>
-  strong_inline iMatrix<vtype,N> &operator *=(const T &r) {
+  strong_inline iMatrix<vtype, N> &operator*=(const T &r) {
-    *this = (*this)*r;
+    *this = (*this) * r;
    return *this;
  }
-  template<class T>
+  template <class T>
-  strong_inline iMatrix<vtype,N> &operator -=(const T &r) {
+  strong_inline iMatrix<vtype, N> &operator-=(const T &r) {
-    *this = (*this)-r;
+    *this = (*this) - r;
    return *this;
  }
-  template<class T>
+  template <class T>
-  strong_inline iMatrix<vtype,N> &operator +=(const T &r) {
+  strong_inline iMatrix<vtype, N> &operator+=(const T &r) {
-    *this = (*this)+r;
+    *this = (*this) + r;
    return *this;
  }
  // returns an lvalue reference
-  strong_inline vtype & operator ()(int i,int j) {
+  strong_inline vtype &operator()(int i, int j) { return _internal[i][j]; }
  strong_inline const vtype &operator()(int i, int j) const {
    return _internal[i][j];
  }
-  strong_inline const vtype & operator ()(int i,int j) const {
+  friend std::ostream &operator<<(std::ostream &stream,
-    return _internal[i][j];
+                                  const iMatrix<vtype, N> &o) {
-  }
+    stream << "M<" << N << ">{";
-  friend std::ostream& operator<< (std::ostream& stream, const iMatrix<vtype,N> &o){
+    for (int i = 0; i < N; i++) {
-    stream<< "M<"<<N<<">{";
+      stream << "{";
-    for(int i=0;i<N;i++) {
+      for (int j = 0; j < N; j++) {
-      stream<< "{";
+        stream << o._internal[i][j];
-      for(int j=0;j<N;j++) {
+        if (i < N - 1) stream << ",";
 	stream<<o._internal[i][j];
 	if (i<N-1)	stream<<",";
      }
-      stream<<"}";
+      stream << "}";
-      if(i!=N-1) stream<<"\n\t\t";
+      if (i != N - 1) stream << "\n\t\t";
    }
-    stream<<"}";
+    stream << "}";
    return stream;
  };
  //  strong_inline vtype && operator ()(int i,int j) {
  //    return _internal[i][j];
  //  }
 };
-template<class v> void vprefetch(const iScalar<v> &vv)
+template <class v>
-{
+void vprefetch(const iScalar<v> &vv) {
  vprefetch(vv._internal);
 }
-template<class v,int N> void vprefetch(const iVector<v,N> &vv)
+template <class v, int N>
-{
+void vprefetch(const iVector<v, N> &vv) {
-  for(int i=0;i<N;i++){
+  for (int i = 0; i < N; i++) {
    vprefetch(vv._internal[i]);
  }
 }
-template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
+template <class v, int N>
-{
+void vprefetch(const iMatrix<v, N> &vv) {
-  for(int i=0;i<N;i++){
+  for (int i = 0; i < N; i++) {
-  for(int j=0;j<N;j++){
+    for (int j = 0; j < N; j++) {
-    vprefetch(vv._internal[i][j]);
+      vprefetch(vv._internal[i][j]);
-  }}
+    }
  }
 }
 }
 #endif
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -10,6 +10,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -167,6 +168,33 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
  }
 }
 ////////////////////////////////////////////////////////////////////////
 // Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
 ////////////////////////////////////////////////////////////////////////
 // Copy selected SIMD lanes of `vec` into scalar objects of a (possibly different) scalar type.
 //
 //   vec       : source object, read as `words` consecutive SIMD words of Nsimd scalars each.
 //   extracted : one base pointer per destination; only element [offset] of each is written.
 //   offset    : index into every extracted[i] array selecting the destination object.
 //
 // Destination i receives, for each word w, lane i*(Nsimd/Nextr) of that word. The value is
 // converted by plain assignment from vobj::scalar_type to sobj::scalar_type.
 // An empty `extracted` is a no-op; without the guard the stride computation divides by zero.
 template<class vobj, class sobj> inline 
 void extract1(const vobj &vec,std::vector<sobj*> &extracted, int offset)
 {
  typedef typename vobj::scalar_type vobj_scalar_type ;
  typedef typename vobj::vector_type vobj_vector_type ;
  typedef typename sobj::scalar_type sobj_scalar_type ;
  static const int words=sizeof(vobj)/sizeof(vobj_vector_type);
  static const int Nsimd=vobj_vector_type::Nsimd();
  const int Nextr=static_cast<int>(extracted.size());
  if (Nextr==0) return;   // nothing to extract; also avoids Nsimd/0 below
  const int s = Nsimd/Nextr;   // lane stride between successive destinations
  // Read-only scalar view of the source object.
  const vobj_scalar_type * vp = reinterpret_cast<const vobj_scalar_type *>(&vec);
  for(int w=0;w<words;w++){
    for(int i=0;i<Nextr;i++){
      // Scalar view of the destination object selected by `offset`.
      sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
      pointer[w] = vp[i*s+w*Nsimd];
    }
  }
 }
 ////////////////////////////////////////////////////////////////////////
 // Merge a contiguous array of scalar objects
 ////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -8,6 +8,7 @@
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -230,6 +231,35 @@ namespace Grid {
    static const bool value = true;
  };
  //Get the SIMD vector type from a Grid tensor or Lattice<Tensor>
  // Primary template: maps T to itself through the nested `type` alias.
  template <typename T>
  struct getVectorType {
    using type = T;
  };
  //Query if a tensor or Lattice<Tensor> is SIMD vector or scalar
  // Compile-time flag: `value` is 1 when, for the type obtained via getVectorType<T>,
  // GridTypeMapper reports a scalar_type that differs from its vector_type, and 0 otherwise.
  // Implemented with a pair of overloads distinguished by their return-type size.
  template<typename T>
  class isSIMDvectorized{
    // Candidate 1: only participates when scalar_type and vector_type are different types
    // (enable_if removes it otherwise). Returns char. Declared only, never defined.
    template<typename U>
    static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
    // Candidate 2: variadic fallback, always viable but ranked below the void* overload
    // for the argument 0. Returns double.
    template<typename U>
    static double test(...);
  public:
    // sizeof is applied to the unevaluated call, so only the selected return type matters.
    enum {value = sizeof(test<T>(0)) == sizeof(char) };
  };
  //Get the precision of a Lattice, tensor or scalar type in units of sizeof(float)
  template<typename T>
  class getPrecision{
    typedef typename getVectorType<T>::type vector_obj; //get the vector_obj (i.e. a grid Tensor) if its a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
    typedef typename GridTypeMapper<vector_obj>::scalar_type scalar_type; //get the associated scalar type. Works on fundamental and tensor types
    typedef typename GridTypeMapper<scalar_type>::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type
  public:
    enum { value = sizeof(real_scalar_type)/sizeof(float) };
  };
 }
 #endif
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -86,6 +86,8 @@ UNARY(sqrt);
 UNARY(rsqrt);
 UNARY(sin);
 UNARY(cos);
 UNARY(asin);
 UNARY(acos);
 UNARY(log);
 UNARY(exp);
 UNARY(abs);
--- a/scripts/Make.inc
+++ b/scripts/Make.inc
@@ -0,0 +1,4 @@
 HFILES=
 CCFILES=
--- a/scripts/copyright
+++ b/scripts/copyright
@@ -5,13 +5,13 @@ while (( "$#" )); do
 echo $1
 cat > message  <<EOF
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid 
-    Source file: $1
+Source file: $1
-    Copyright (C) 2015
+Copyright (C) 2015
 EOF
@@ -19,23 +19,23 @@ git log $1 | grep Author | sort -u >> message
 cat >> message <<EOF
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+*************************************************************************************/
-    /*  END LEGAL */
+/*  END LEGAL */
 EOF
 cat message > tmp.fil
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -2,7 +2,8 @@
 cd lib
-HFILES=`find . -type f -name '*.h'`
+HFILES=`find . -type f -name '*.h' -not -path '*/Old/*'`
 HFILES="$HFILES Config.h"
 CCFILES=`find . -type f -name '*.cc' -not  -name '*ommunicator*.cc'`
 echo> Make.inc
 echo HFILES=$HFILES >> Make.inc
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,13 +1,5 @@
-bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_mixedcg_prec Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -106,6 +98,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 Test_dwf_lanczos_LDADD=-lGrid
 Test_dwf_mixedcg_prec_SOURCES=Test_dwf_mixedcg_prec.cc
 Test_dwf_mixedcg_prec_LDADD=-lGrid
 Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
 Test_dwf_rb5d_LDADD=-lGrid
@@ -114,8 +110,8 @@ Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
-Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
+Test_GaugeAction_SOURCES=Test_GaugeAction.cc
-Test_gp_rect_force_LDADD=-lGrid
+Test_GaugeAction_LDADD=-lGrid
 Test_gparity_SOURCES=Test_gparity.cc
@@ -126,6 +122,10 @@ Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
 Test_gpdwf_force_LDADD=-lGrid
 Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
 Test_gp_rect_force_LDADD=-lGrid
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid
@@ -202,6 +202,10 @@ Test_rect_force_SOURCES=Test_rect_force.cc
 Test_rect_force_LDADD=-lGrid
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid
 Test_remez_SOURCES=Test_remez.cc
 Test_remez_LDADD=-lGrid
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -0,0 +1,108 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_dwf_mixedcg_prec.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Aggregate holding a single value of type d in its `internal` member.
 template <class d> struct scal { d internal; };
  // File-scope table of the four Gamma::GammaMatrix values GammaX, GammaY, GammaZ, GammaT,
  // in that order.
  // NOTE(review): main() in this file does not reference Gmu; it looks like a leftover
  // from the test this file was copied from. Confirm before removing.
  Gamma::GammaMatrix Gmu [] = {
    Gamma::GammaX,
    Gamma::GammaY,
    Gamma::GammaZ,
    Gamma::GammaT
  };
 // Test driver: solves the same odd-checkerboard system twice, once with
 // MixedPrecisionConjugateGradient and once with ConjugateGradient, then prints the
 // value returned by axpy_norm for the two solutions. The value is only printed; nothing
 // is asserted, and the function relies on main's implicit return of 0.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Extent passed to the five-dimensional grid factories below.
  const int Ls=8;
  // Grids whose SIMD layout is derived from vComplexD::Nsimd().
  // NOTE(review): the grid pointers created here are never deleted in this function.
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Same set of grids, with the SIMD layout derived from vComplexF::Nsimd() instead.
  GridCartesian         * UGrid_f   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
  GridCartesian         * FGrid_f   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
  GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
  // Fixed seeds for the two random number generators.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  // Random source on the full five-dimensional grid.
  LatticeFermionD    src(FGrid); random(RNG5,src);
  // NOTE(review): `result` is zeroed here and not used again in this function.
  LatticeFermionD result(FGrid); result=zero;
  LatticeGaugeFieldD Umu(UGrid);
  LatticeGaugeFieldF Umu_f(UGrid_f); 
  SU3::HotConfiguration(RNG4,Umu);
  // Presumably fills Umu_f from Umu at the precision of Umu_f -- confirm against precisionChange.
  precisionChange(Umu_f,Umu);
  RealD mass=0.1;
  RealD M5=1.8;
  // One operator per gauge field / grid set, built with the same mass and M5.
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5);
  // Red-black fields: the source plus one solution per solver.
  LatticeFermionD    src_o(FrbGrid);
  LatticeFermionD result_o(FrbGrid);
  LatticeFermionD result_o_2(FrbGrid);
  pickCheckerboard(Odd,src_o,src);
  // Both solution fields are tagged Odd and zeroed before the solves.
  result_o.checkerboard = Odd;
  result_o = zero;
  result_o_2.checkerboard = Odd;
  result_o_2 = zero;
  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
  std::cout << "Starting mixed CG" << std::endl;
  // NOTE(review): meaning of the numeric arguments (1.0e-8, 10000, 50) is defined by
  // MixedPrecisionConjugateGradient's constructor, which is not visible here.
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  mCG(src_o,result_o);
  std::cout << "Starting regular CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  CG(HermOpEO,src_o,result_o_2);
  LatticeFermionD diff_o(FrbGrid);
  // Presumably diff_o = -1.0*result_o + result_o_2 with its norm returned -- confirm against axpy_norm.
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
  std::cout << "Diff between mixed and regular CG: " << diff << std::endl;
  Grid_finalize();
 }
--- a/tests/Test_hmc_EOWilsonFermionGauge.cc
+++ b/tests/Test_hmc_EOWilsonFermionGauge.cc
@@ -65,7 +65,10 @@ public:
    ConjugateGradient<FermionField>  CG(1.0e-8,10000);
    TwoFlavourEvenOddPseudoFermionAction<ImplPolicy> Nf2(FermOp,CG,CG);
-  
+
    //Set smearing (true/false), default: false
    Nf2.is_smeared=false;
    //Collect actions
    ActionLevel<LatticeGaugeField> Level1(1);
    Level1.push_back(&Nf2);
--- a/tests/Test_hmc_EOWilsonRatio.cc
+++ b/tests/Test_hmc_EOWilsonRatio.cc
@@ -66,6 +66,9 @@ public:
    ConjugateGradient<FermionField>  CG(1.0e-8,10000);
    TwoFlavourEvenOddRatioPseudoFermionAction<ImplPolicy> Nf2(NumOp, DenOp,CG,CG);
    //Set smearing (true/false), default: false
    Nf2.is_smeared=true;
    //Collect actions
    ActionLevel<LatticeGaugeField> Level1;
    Level1.push_back(&Nf2);
--- a/tests/Test_hmc_WilsonFermionGauge.cc
+++ b/tests/Test_hmc_WilsonFermionGauge.cc
@@ -67,6 +67,10 @@ public:
    TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp,CG,CG);
    //Set smearing (true/false), default: false
    Nf2.is_smeared = true;
    //Collect actions
    ActionLevel<LatticeGaugeField> Level1(1);
    Level1.push_back(&Nf2);
--- a/tests/Test_hmc_WilsonRatio.cc
+++ b/tests/Test_hmc_WilsonRatio.cc
@@ -65,6 +65,9 @@ public:
    ConjugateGradient<FermionField>  CG(1.0e-8,10000);
    TwoFlavourRatioPseudoFermionAction<ImplPolicy> Nf2(NumOp, DenOp,CG,CG);
    //Set smearing (true/false), default: false
    Nf2.is_smeared=true;
    //Collect actions
    ActionLevel<LatticeGaugeField> Level1;
--- a/tests/Test_main.cc
+++ b/tests/Test_main.cc
--- a/tests/Test_rhmc_EOWilson1p1.cc
+++ b/tests/Test_rhmc_EOWilson1p1.cc
@@ -1,97 +1,100 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./tests/Test_rhmc_EOWilson1p1.cc
+Source file: ./tests/Test_rhmc_EOWilson1p1.cc
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #include "Grid.h"
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
-namespace Grid { 
+namespace Grid {
-  namespace QCD { 
+namespace QCD {
 class HmcRunner : public NerscHmcRunner {
-public:
+ public:
-
+  void BuildTheAction(int argc, char **argv)
  void BuildTheAction (int argc, char **argv)
  {
    typedef WilsonImplR ImplPolicy;
    typedef WilsonFermionR FermionAction;
    typedef typename FermionAction::FermionField FermionField;
-    UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    UGrid = SpaceTimeGrid::makeFourDimGrid(
        GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
        GridDefaultMpi());
    UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  
+
-    FGrid   = UGrid;
+    FGrid = UGrid;
    FrbGrid = UrbGrid;
    // temporarily need a gauge field
-    LatticeGaugeField  U(UGrid);
+    LatticeGaugeField U(UGrid);
    // Gauge action
    WilsonGaugeActionR Waction(5.6);
-    Real mass=-0.77;
+    Real mass = -0.77;
-    FermionAction FermOp(U,*FGrid,*FrbGrid,mass);
+    FermionAction FermOp(U, *FGrid, *FrbGrid, mass);
    // 1+1 flavour
-    OneFlavourRationalParams Params(1.0e-4,64.0,1000,1.0e-6);
+    OneFlavourRationalParams Params(1.0e-4, 64.0, 2000, 1.0e-6);
-    OneFlavourEvenOddRationalPseudoFermionAction<WilsonImplR> WilsonNf1a(FermOp,Params);
+    OneFlavourEvenOddRationalPseudoFermionAction<WilsonImplR> WilsonNf1a(
-    OneFlavourEvenOddRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(FermOp,Params);
+        FermOp, Params);
    OneFlavourEvenOddRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(
        FermOp, Params);
-    //Collect actions
+    //Smearing on/off
    WilsonNf1a.is_smeared = true;
    WilsonNf1b.is_smeared = true;
    // Collect actions
    ActionLevel<LatticeGaugeField> Level1;
    Level1.push_back(&WilsonNf1a);
    Level1.push_back(&WilsonNf1b);
    Level1.push_back(&Waction);
-    
+
    TheAction.push_back(Level1);
-    Run(argc,argv);
+    Run(argc, argv);
  };
 };
-
+}
 }}
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  HmcRunner TheHMC;
  TheHMC.BuildTheAction(argc,argv);
 }
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  std::cout << GridLogMessage << "Grid is setup to use " << threads
            << " threads" << std::endl;
  HmcRunner TheHMC;
  TheHMC.BuildTheAction(argc, argv);
 }
--- a/tests/Test_rhmc_Wilson1p1.cc
+++ b/tests/Test_rhmc_Wilson1p1.cc
@@ -66,6 +66,10 @@ public:
    OneFlavourRationalPseudoFermionAction<WilsonImplR> WilsonNf1a(FermOp,Params);
    OneFlavourRationalPseudoFermionAction<WilsonImplR> WilsonNf1b(FermOp,Params);
    //Set smearing (true/false), default: false
    WilsonNf1a.is_smeared=false;
    WilsonNf1b.is_smeared=false;
    //Collect actions
    ActionLevel<LatticeGaugeField> Level1;
    Level1.push_back(&WilsonNf1a);
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -1,31 +1,32 @@
-    /*************************************************************************************
+/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./tests/Test_simd.cc
+Source file: ./tests/Test_simd.cc
-    Copyright (C) 2015
+Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
-    This program is free software; you can redistribute it and/or modify
+This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+(at your option) any later version.
-    This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
+You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution
-    *************************************************************************************/
+directory
-    /*  END LEGAL */
+*************************************************************************************/
 /*  END LEGAL */
 #include <Grid.h>
 using namespace std;
@@ -62,6 +63,18 @@ public:
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = adj(i1);}
  std::string name(void) const { return std::string("Adj"); }
 };
 // Test functor: writes imag(i1) into the result operand.
 class funcImag {
 public:
  funcImag() {}

  // rr is overwritten with imag(i1); i2 is not used by this functor.
  template <class vec>
  void operator()(vec &rr, vec &i1, vec &i2) const {
    rr = imag(i1);
  }

  // Label identifying this functor.
  std::string name(void) const {
    return std::string("imag");
  }
 };
 // Test functor: writes real(i1) into the result operand.
 class funcReal {
 public:
  funcReal() {}

  // rr is overwritten with real(i1); i2 is not used by this functor.
  template <class vec>
  void operator()(vec &rr, vec &i1, vec &i2) const {
    rr = real(i1);
  }

  // Label identifying this functor.
  std::string name(void) const {
    return std::string("real");
  }
 };
 class funcTimesI {
 public:
@@ -141,7 +154,13 @@ void Tester(const functor &func)
  }
  extract<vec,scal>(v_result,result);
-  std::cout<<GridLogMessage << " " << func.name()<<std::endl;
+
  std::cout << GridLogMessage << " " << func.name() << std::endl;
  std::cout << GridLogDebug << v_input1 << std::endl;
  std::cout << GridLogDebug << v_result << std::endl;
  int ok=0;
  for(int i=0;i<Nsimd;i++){
@@ -389,6 +408,8 @@ int main (int argc, char ** argv)
  Tester<ComplexF,vComplexF>(funcTimes());
  Tester<ComplexF,vComplexF>(funcConj());
  Tester<ComplexF,vComplexF>(funcAdj());
  Tester<ComplexF,vComplexF>(funcReal());
  Tester<ComplexF,vComplexF>(funcImag());
  Tester<ComplexF,vComplexF>(funcInnerProduct());
  ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());
@@ -421,17 +442,21 @@ int main (int argc, char ** argv)
  Tester<ComplexD,vComplexD>(funcTimes());
  Tester<ComplexD,vComplexD>(funcConj());
  Tester<ComplexD,vComplexD>(funcAdj());
-  Tester<ComplexD,vComplexD>(funcInnerProduct());
+  Tester<ComplexD, vComplexD>(funcReal());
-  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
+  Tester<ComplexD, vComplexD>(funcImag());
  Tester<ComplexD, vComplexD>(funcInnerProduct());
  ReductionTester<ComplexD, ComplexD, vComplexD>(funcReduce());
-  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout << GridLogMessage
-  std::cout<<GridLogMessage << "Testing vComplexD permutes "<<std::endl;
+            << "===================================" << std::endl;
-  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout << GridLogMessage << "Testing vComplexD permutes " << std::endl;
  std::cout << GridLogMessage
            << "===================================" << std::endl;
  // Log2 iteration
-  for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
+  for (int i = 0; (1 << i) < vComplexD::Nsimd(); i++) {
-    PermTester<ComplexD,vComplexD>(funcPermute(i));
+    PermTester<ComplexD, vComplexD>(funcPermute(i));
  }