Merge branch 'develop' into feature/json-fix

2025-11-05 06:19:31 +00:00 · 2017-07-07 14:17:50 +01:00
parent b672717096 7b0237b081
commit d9593c4b81
102 changed files with 4235 additions and 3759 deletions
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -41,6 +41,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
+#include <Grid/qcd/utils/GaugeFix.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
--- a/lib/algorithms/densematrix/DenseMatrix.h
+++ b/lib/algorithms/densematrix/DenseMatrix.h
@@ -1,137 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/DenseMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_DENSE_MATRIX_H
-#define GRID_DENSE_MATRIX_H
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Matrix untils
-    /////////////////////////////////////////////////////////////
-
-template<class T> using DenseVector = std::vector<T>;
-template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
-
-template<class T> void Size(DenseVector<T> & vec, int &N) 
-{ 
-  N= vec.size();
-}
-template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
-{ 
-  N= mat.size();
-  M= mat[0].size();
-}
-
-template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
-{ 
-  int M; Size(mat,N,M);
-  assert(N==M);
-}
-
-template<class T> void Resize(DenseVector<T > & mat, int N) { 
-  mat.resize(N);
-}
-template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
-  mat.resize(N);
-  for(int i=0;i<N;i++){
-    mat[i].resize(M);
-  }
-}
-template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
-  int N,M;
-  Size(mat,N,M);
-  for(int i=0;i<N;i++){
-  for(int j=0;j<M;j++){
-    mat[i][j] = val;
-  }}
-}
-
-/** Transpose of a matrix **/
-template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
-  int N,M;
-  Size(mat,N,M);
-  DenseMatrix<T> C; Resize(C,M,N);
-  for(int i=0;i<M;i++){
-  for(int j=0;j<N;j++){
-    C[i][j] = mat[j][i];
-  }} 
-  return C;
-}
-/** Set DenseMatrix to unit matrix **/
-template<class T> void Unity(DenseMatrix<T> &A){
-  int N;  SizeSquare(A,N);
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      if ( i==j ) A[i][j] = 1;
-      else        A[i][j] = 0;
-    } 
-  } 
-}
-
-/** Add C * I to matrix **/
-template<class T>
-void PlusUnit(DenseMatrix<T> & A,T c){
-  int dim;  SizeSquare(A,dim);
-  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
-}
-
-/** return the Hermitian conjugate of matrix **/
-template<class T>
-DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
-
-  int dim; SizeSquare(mat,dim);
-
-  DenseMatrix<T> C; Resize(C,dim,dim);
-
-  for(int i=0;i<dim;i++){
-    for(int j=0;j<dim;j++){
-      C[i][j] = conj(mat[j][i]);
-    } 
-  } 
-  return C;
-}
-/**Get a square submatrix**/
-template <class T>
-DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
-{
-  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
-
-  for(int i = row_st; i<row_end; i++){
-  for(int j = col_st; j<col_end; j++){
-    H[i-row_st][j-col_st]=A[i][j];
-  }}
-  return H;
-}
-
-}
-
-#include "Householder.h"
-#include "Francis.h"
-
-#endif
-
--- a/lib/algorithms/densematrix/Francis.h
+++ b/lib/algorithms/densematrix/Francis.h
@@ -1,525 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Francis.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef FRANCIS_H
-#define FRANCIS_H
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-//#include <timer.h>
-//#include <lapacke.h>
-//#include <Eigen/Dense>
-
-namespace Grid {
-
-template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-
-/**
-  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
-H =
-      x  x  x  x  x  x  x  x  x
-      x  x  x  x  x  x  x  x  x
-      0  x  x  x  x  x  x  x  x
-      0  0  x  x  x  x  x  x  x
-      0  0  0  x  x  x  x  x  x
-      0  0  0  0  x  x  x  x  x
-      0  0  0  0  0  x  x  x  x
-      0  0  0  0  0  0  x  x  x
-      0  0  0  0  0  0  0  x  x
-Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
-**/
-template <class T>
-int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  DenseMatrix<T> H = Hin; 
-
-  int N ; SizeSquare(H,N);
-  int M = N;
-
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s,t,x=0,y=0,z=0;
-  T u,d;
-  T apd,amd,bc;
-  DenseVector<T> p(N,0);
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
-  DenseVector<int> trows(N,0);
-
-  /// Check if the matrix is really hessenberg, if not abort
-  RealD sth = 0;
-  for(int j=0;j<N;j++){
-    for(int i=j+2;i<N;i++){
-      sth = abs(H[i][j]);
-      if(sth > small){
-	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
-	exit(1);
-      }
-    }
-  }
-
-  do{
-    std::cout << "Francis QR Step N = " << N << std::endl;
-    /** Check for convergence
-      x  x  x  x  x
-      0  x  x  x  x
-      0  0  x  x  x
-      0  0  x  x  x
-      0  0  0  0  x
-      for this matrix l = 4
-     **/
-    do{
-      l = Chop_subdiag(H,nrm,e,small);
-      r = 0;    ///May have converged on more than one eval
-      ///Single eval
-      if(l == N-1){
-        evals[e] = H[l][l];
-        N--; e++; r++; it = 0;
-      }
-      ///RealD eval
-      if(l == N-2){
-        trows[l+1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l+1][l+1];
-        amd = H[l][l] - H[l+1][l+1];
-        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
-        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
-        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
-        N-=2; e+=2; r++; it = 0;
-      }
-    } while(r>0);
-
-    if(N ==0) break;
-
-    DenseVector<T > ck; Resize(ck,3);
-    DenseVector<T> v;   Resize(v,3);
-
-    for(int m = N-3; m >= l; m--){
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0){
-        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ///Starting vector implicit Q theorem
-      else{
-        s = (H[N-2][N-2] + H[N-1][N-1]);
-        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-
-      if(m == l) break;
-
-      /** Some stupid thing from numerical recipies, seems to work**/
-      // PAB.. for heaven's sake quote page, purpose, evidence it works.
-      //       what sort of comment is that!?!?!?
-      u=abs(H[m][m-1])*(abs(y)+abs(z));
-      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
-      if ((T)abs(u+d) == (T)abs(d) ){
-	l = m; break;
-      }
-
-      //if (u < small){l = m; break;}
-    }
-    if(it > 100000){
-     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
-     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, 2, v, beta);
-    Householder_mult<T >(H,v,beta,0,l,l+2,0);
-    Householder_mult<T >(H,v,beta,0,l,l+2,1);
-    ///Accumulate eigenvector
-    Householder_mult<T >(P,v,beta,0,l,l+2,1);
-    int sw = 0;      ///Are we on the last row?
-    for(int k=l;k<N-2;k++){
-      x = H[k+1][k];
-      y = H[k+2][k];
-      z = (T)0.0;
-      if(k+3 <= N-1){
-	z = H[k+3][k];
-      } else{
-	sw = 1; 
-	v[2] = (T)0.0;
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-      normalize(ck);
-      Householder_vector<T >(ck, 0, 2-sw, v, beta);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
-      ///Accumulate eigenvector
-      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp; Resize(tmp,N);
-  for(int i=0;i<N;i++){
-    tmp[i] = evals[N-i-1];
-  } 
-  evals = tmp;
-  UTeigenvectors(H, trows, evals, evecs);
-  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
-  return tot_it;
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  /**
-  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
-  H =
-  x  x  0  0  0  0
-  x  x  x  0  0  0
-  0  x  x  x  0  0
-  0  0  x  x  x  0
-  0  0  0  x  x  x
-  0  0  0  0  x  x
-  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
-  return my_Wilkinson(Hin, evals, evecs, small, small);
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
-{
-  int N; SizeSquare(Hin,N);
-  int M = N;
-
-  ///I don't want to modify the input but matricies must be passed by reference
-  //Scale a matrix by its "norm"
-  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
-  DenseMatrix<T> H;  H = Hin;
-  
-  RealD Hnorm = abs(Norm(Hin));
-  H = H * (1.0 / Hnorm);
-
-  // TODO use openmp and memset
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s, t, x = 0, y = 0, z = 0;
-  T u, d;
-  T apd, amd, bc;
-  DenseVector<T> p; Resize(p,N); Fill(p,0);
-
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N);
-  Unity(P);
-  DenseVector<int> trows(N, 0);
-  /// Check if the matrix is really symm tridiag
-  RealD sth = 0;
-  for(int j = 0; j < N; ++j)
-  {
-    for(int i = j + 2; i < N; ++i)
-    {
-      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
-      {
-	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
-	std::cout << "Warning tridiagonalize and call again" << std::endl;
-        // exit(1); // see what is going on
-        //return;
-      }
-    }
-  }
-
-  do{
-    do{
-      //Jasper
-      //Check if the subdiagonal term is small enough (<small)
-      //if true then it is converged.
-      //check start from H.dim - e - 1
-      //How to deal with more than 2 are converged?
-      //What if Chop_symm_subdiag return something int the middle?
-      //--------------
-      l = Chop_symm_subdiag(H,nrm, e, small);
-      r = 0;    ///May have converged on more than one eval
-      //Jasper
-      //In this case
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  x  0
-      // 0  0  0  x  x  0
-      // 0  0  0  0  0  x  <- l
-      //--------------
-      ///Single eval
-      if(l == N - 1)
-      {
-        evals[e] = H[l][l];
-        N--;
-        e++;
-        r++;
-        it = 0;
-      }
-      //Jasper
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  0  0
-      // 0  0  0  0  x  x  <- l
-      // 0  0  0  0  x  x
-      //--------------
-      ///RealD eval
-      if(l == N - 2)
-      {
-        trows[l + 1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l + 1][ l + 1];
-        amd = H[l][l] - H[l + 1][l + 1];
-        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
-        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
-        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
-        N -= 2;
-        e += 2;
-        r++;
-        it = 0;
-      }
-    }while(r > 0);
-    //Jasper
-    //Already converged
-    //--------------
-    if(N == 0) break;
-
-    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
-
-    for(int m = N - 3; m >= l; m--)
-    {
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0)
-      {
-        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      } else {
-      ///Starting vector implicit Q theorem
-        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
-        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
-	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      }
-      //Jasper
-      //why it is here????
-      //-----------------------
-      if(m == l)
-        break;
-
-      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
-      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
-      if ((T)abs(u + d) == (T)abs(d))
-      {
-        l = m;
-        break;
-      }
-    }
-    //Jasper
-    if(it > 1000000)
-    {
-      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
-      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    //
-    T s, c;
-    Givens_calc<T>(x, z, c, s);
-    Givens_mult<T>(H, l, l + 1, c, -s, 0);
-    Givens_mult<T>(H, l, l + 1, c,  s, 1);
-    Givens_mult<T>(P, l, l + 1, c,  s, 1);
-    //
-    for(int k = l; k < N - 2; ++k)
-    {
-      x = H.A[k + 1][k];
-      z = H.A[k + 2][k];
-      Givens_calc<T>(x, z, c, s);
-      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
-      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
-      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp(N);
-  for(int i = 0; i < N; ++i)
-    tmp[i] = evals[N-i-1];
-  evals = tmp;
-  //
-  UTeigenvectors(H, trows, evals, evecs);
-  //UTSymmEigenvectors(H, trows, evals, evecs);
-  for(int i = 0; i < evals.size(); ++i)
-  {
-    evecs[i] = P * evecs[i];
-    normalize(evecs[i]);
-    evals[i] = evals[i] * Hnorm;
-  }
-  // // FIXME this is to test
-  // Hin.write("evecs3", evecs);
-  // Hin.write("evals3", evals);
-  // // check rsd
-  // for(int i = 0; i < M; i++) {
-  //   vector<T> Aevec = Hin * evecs[i];
-  //   RealD norm2(0.);
-  //   for(int j = 0; j < M; j++) {
-  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
-  //   }
-  // }
-  return tot_it;
-}
-
-template <class T>
-void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-
-  /**
-  turn a matrix A =
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  into
-  x  x  x  x  x
-  x  x  x  x  x
-  0  x  x  x  x
-  0  0  x  x  x
-  0  0  0  x  x
-  with householder rotations
-  Slow.
-  */
-  int N ; SizeSquare(A,N);
-  DenseVector<T > p; Resize(p,N); Fill(p,0);
-
-  for(int k=start;k<N-2;k++){
-    //cerr << "hess" << k << std::endl;
-    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
-    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
-    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
-    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
-    ///Accumulate eigenvector
-    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
-  }
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,k,l);
-    }
-    }*/
-}
-
-template <class T>
-void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-///Tridiagonalize a matrix
-  int N; SizeSquare(A,N);
-  Hess(A,Q,start);
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,l,k);
-    }
-    }*/
-}
-
-template <class T>
-void ForceTridiagonal(DenseMatrix<T> &A){
-///Tridiagonalize a matrix
-  int N ; SizeSquare(A,N);
-  for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-      A[l][k]=0;
-      A[k][l]=0;
-    }
-  }
-}
-
-template <class T>
-int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
-  int N; SizeSquare(Ain,N);
-  DenseMatrix<T > A; A = Ain;
-  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
-  Tri(A,Q,0);
-  int it = my_Wilkinson<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-
-template <class T>
-int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_Wilkinson(Ain, evals, evecs, small);
-}
-
-template <class T>
-int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_SymmEigensystem(Ain, evals, evecs, small);
-}
-
-template <class T>
-int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-///Solve a general eigensystem, not necessarily in tridiagonal form
-  int N = Ain.dim;
-  DenseMatrix<T > A(N); A = Ain;
-  DenseMatrix<T > Q(N);Q.Unity();
-  Hess(A,Q,0);
-  int it = QReigensystem<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-}
-#endif
--- a/lib/algorithms/densematrix/Householder.h
+++ b/lib/algorithms/densematrix/Householder.h
@@ -1,242 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Householder.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef HOUSEHOLDER_H
-#define HOUSEHOLDER_H
-
-#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-namespace Grid {
-/** Comparison function for finding the max element in a vector **/
-template <class T> bool cf(T i, T j) { 
-  return abs(i) < abs(j); 
-}
-
-/** 
-	Calculate a real Givens angle 
- **/
-template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
-
-  RealD mz = (RealD)abs(z);
-  
-  if(mz==0.0){
-    c = 1; s = 0;
-  }
-  if(mz >= (RealD)abs(y)){
-    T t = -y/z;
-    s = (T)1.0 / sqrt ((T)1.0 + t * t);
-    c = s * t;
-  } else {
-    T t = -z/y;
-    c = (T)1.0 / sqrt ((T)1.0 + t * t);
-    s = c * t;
-  }
-}
-
-template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
-{
-  int q ; SizeSquare(A,q);
-
-  if(dir == 0){
-    for(int j=0;j<q;j++){
-      T nu = A[i][j];
-      T w  = A[k][j];
-      A[i][j] = (c*nu + s*w);
-      A[k][j] = (-s*nu + c*w);
-    }
-  }
-
-  if(dir == 1){
-    for(int j=0;j<q;j++){
-      T nu = A[j][i];
-      T w  = A[j][k];
-      A[j][i] = (c*nu - s*w);
-      A[j][k] = (s*nu + c*w);
-    }
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	P | x |    | x | k = 0
-	| x |    | 0 | 
-	| x | =  | 0 |
-	| x |    | 0 | j = 3
-	| x |	   | x |
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
-{
-  int N ; Size(input,N);
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
-
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    alpha = sqrt(alpha);
-    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
-
-    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
-    else                 v[k] = -alpha;
-  } else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	Px = alpha*e_dir
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
-{
-  int N = input.size();
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
-  
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    
-    alpha = sqrt(alpha);
-    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
-	
-    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
-    else                  v[dir] = -alpha;
-  }else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
- **/
-
-template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
-{
-  int N ; SizeSquare(A,N);
-
-  if(abs(beta) > 0.0){
-    for(int p=l; p<N; p++){
-      T s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
-      } else {
-	for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
-      }
-    }
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
-	A is tridiagonal
- **/
-template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
-{
-  if(abs(beta) > 0.0){
-
-    int N ; SizeSquare(A,N);
-
-    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
-
-    T s;
-    for(int p=l; p<M; p++){
-      s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
-      }
-      s = beta*s;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
-      }else{
-	for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
-      }
-    }
-    for(int p=l; p<M; p++){
-      if(trans==0){
-	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
-      }
-    }
-  }
-}
-}
-#endif
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -33,6 +33,8 @@ directory

 namespace Grid {

+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
+
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
@@ -40,25 +42,273 @@ template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:

+
  typedef typename Field::scalar_type scomplex;

-  const int blockDim = 0;
-
+  int blockDim ;
  int Nblock;
+
+  BlockCGtype CGtype;
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  
-  BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol),
-    MaxIterations(maxit),
-    ErrorOnNoConverge(err_on_no_conv){};
+  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) // cgtype: solver variant; _Orthog: index of the block dimension; tol/maxit: stopping criteria
+    : blockDim(_Orthog), Nblock(0), CGtype(cgtype), ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), IterationsToComplete(0) // listed in member declaration order (no -Wreorder); Nblock and IterationsToComplete start at a defined value until a solve fills them in
+  {};

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation (google it)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+void ThinQRfact (Eigen::MatrixXcd &m_rr,   // out: slice inner-product matrix R^dag R
+		 Eigen::MatrixXcd &C,      // out: L^dag, where m_rr = L L^dag
+		 Eigen::MatrixXcd &Cinv,   // out: inverse of C
+		 Field & Q,                // out: R multiplied on the right by Cinv
+		 const Field & R)          // in : block of vectors to factorise
+{
+  int Orthog = blockDim; // index of the block direction (member blockDim)
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  //Dimensions
+  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
+  //
+  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
+  //
+  //   Q  C = R => Q = R C^{-1}
+  //
+  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
+  //
+  // Set C = L^{dag}, and then Q^dag Q = ident
+  //
+  // Checks:
+  // Cdag C = Rdag R ; passes.
+  // QdagQ  = 1      ; passes
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  sliceInnerProductMatrix(m_rr,R,R,Orthog);
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Cholesky from Eigen
+  // There exists a ldlt that is documented as more stable
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+  // NOTE(review): the status of the factorisation (LLT::info()) is not inspected here.
+  C    = L.adjoint();
+  Cinv = C.inverse();
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Q = R C^{-1}
+  //
+  // Q_j  = R_i Cinv(i,j)
+  //
+  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // sliceMulMatrix forms the product directly, so no zero field is needed as a madd addend
+  sliceMulMatrix(Q,Cinv,R,Orthog);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Call one of several implementations
+////////////////////////////////////////////////////////////////////////////////////////////////////
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
-  int Orthog = 0; // First dimension is block dim
+  // Forward to the solver variant selected by the CGtype member.
+  switch ( CGtype ) {
+  case BlockCGrQ:  BlockCGrQsolve (Linop,Src,Psi); break;
+  case BlockCG:    BlockCGsolve   (Linop,Src,Psi); break;
+  case CGmultiRHS: CGmultiRHSsolve(Linop,Src,Psi); break;
+  default:
+    // Not one of the three known variants: treat as a programming error.
+    assert(0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQ implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
+{
+  int Orthog = blockDim; // index of the block direction (member blockDim)
+  Nblock = B._grid->_fdimensions[Orthog];
+
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  X.checkerboard = B.checkerboard;
+  conformable(X, B);
+  // Work fields, each copy-constructed from B
+  Field tmp(B);
+  Field Q(B);
+  Field D(B);
+  Field Z(B);
+  Field AD(B);
+  // Nblock x Nblock coefficient matrices used by the recurrence documented below
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock); // not referenced again in this function
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+  // ssq[b]: per-slice norm of the source, used to normalise the residual of block b; sssum is their sum
+  sliceNorm(ssq,B,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+  // Abort if any slice norm of the source or of the initial guess is NaN
+  sliceNorm(residuals,B,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  sliceNorm(residuals,X,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+
+  Linop.HermOp(X, AD);
+  tmp = B - AD;  
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  D=Q;
+
+  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    Linop.HermOp(D, Z);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0; // largest m_rr(b,b)/ssq[b] over the blocks
+    RealD rrsum=0;     // sum over blocks of real(m_rr(b,b))
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { // compared against Tolerance^2; sqrt(max_resid) is what gets reported below
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+      // Recompute the residual explicitly from the operator: AD = A X - B
+      Linop.HermOp(X, AD);
+      AD = AD-B;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+	    
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+  // Only reached when the loop ended without meeting the tolerance
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k; // the loop counter has already stepped past the last iteration here
+}
+//////////////////////////////////////////////////////////////////////////
+// Block conjugate gradient (original O'Leary algorithm); the block direction is given by blockDim
+//////////////////////////////////////////////////////////////////////////
+void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = Src._grid->_fdimensions[Orthog];

  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@@ -162,8 +412,9 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
     *********************
     */
    RealD max_resid=0;
+    RealD rr;
    for(int b=0;b<Nblock;b++){
-      RealD rr = real(m_rr(b,b))/ssq[b];
+      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    
@@ -173,13 +424,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)

      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;

      Linop.HermOp(Psi, AP);
      AP = AP-Src;
-      std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -197,35 +449,13 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
-};
-
-
 //////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
+// Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
-template <class Field>
-class MultiRHSConjugateGradient : public OperatorFunction<Field> {
- public:
-
-  typedef typename Field::scalar_type scomplex;
-
-  const int blockDim = 0;
-
-  int Nblock;
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
-                           // Defaults true.
-  RealD Tolerance;
-  Integer MaxIterations;
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  
-   MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol),
-    MaxIterations(maxit),
-    ErrorOnNoConverge(err_on_no_conv){};
-
-void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
-  int Orthog = 0; // First dimension is block dim
+  int Orthog = blockDim; // First dimension is block dim
  Nblock = Src._grid->_fdimensions[Orthog];

  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@@ -285,12 +515,10 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
    MatrixTimer.Stop();

    // Alpha
-    //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
    sliceInnerTimer.Start();
    sliceInnerProductVector(v_pAp,P,AP,Orthog);
    sliceInnerTimer.Stop();
    for(int b=0;b<Nblock;b++){
-      //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
    }

@@ -332,7 +560,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)

      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
+	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;

@@ -358,9 +586,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
+
 };

-
-
 }
 #endif
--- a/lib/algorithms/iterative/EigenSort.h
+++ b/lib/algorithms/iterative/EigenSort.h
@@ -1,81 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/EigenSort.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_EIGENSORT_H
-#define GRID_EIGENSORT_H
-
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Eigen sorter to begin with
-    /////////////////////////////////////////////////////////////
-
-template<class Field>
-class SortEigen {
- private:
-  
-//hacking for testing for now
- private:
-  static bool less_lmd(RealD left,RealD right){
-    return left > right;
-  }  
-  static bool less_pair(std::pair<RealD,Field const*>& left,
-                        std::pair<RealD,Field const*>& right){
-    return left.first > (right.first);
-  }  
-  
-  
- public:
-
-  void push(DenseVector<RealD>& lmd,
-            DenseVector<Field>& evec,int N) {
-    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
-    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
-    
-    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
-    for(int i=0;i<lmd.size();++i)
-      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
-
-    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
-
-    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
-    for(int i=0;i<N;++i){
-      lmd[i]=it->first;
-      evec[i]=*(it->second);
-      ++it;
-    }
-  }
-  void push(DenseVector<RealD>& lmd,int N) {
-    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
-  }
-  bool saturated(RealD lmd, RealD thrs) {
-    return fabs(lmd) > fabs(thrs);
-  }
-};
-
-}
-#endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -98,7 +98,14 @@ public:
 #else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
 #endif
-
+    // First touch optimise in threaded loop
+    uint8_t *cp = (uint8_t *)ptr;
+#ifdef GRID_OMP
+#pragma omp parallel for
+#endif
+    for(size_type n=0;n<bytes;n+=4096){
+      cp[n]=0;
+    }
    return ptr;
  }

@@ -186,6 +193,12 @@ public:
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
+    size_type bytes = __n*sizeof(_Tp);
+    uint8_t *cp = (uint8_t *)ptr;
+#pragma omp parallel for
+    for(size_type n=0;n<bytes;n+=4096){
+      cp[n]=0;
+    }
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reduction.h
    Copyright (C) 2015
@@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };

-
-/*
-template<class vobj>
-static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
-			     int Orthog,RealD scale=1.0) 
-{    
-  // FIXME: Implementation is slow
-  // Best base the linear combination by constructing a 
-  // set of vectors of size grid->_rdimensions[Orthog].
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-  
-  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
-  // If we based this on Cshift it would work for spread out
-  // but it would be even slower
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    ExtractSlice(Xslice,X,i,Orthog);
-    Rslice = Rslice + Xslice*(scale*a[i]);
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-};
-template<class vobj>
-static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-  {
-    // FIXME: Implementation is slow
-    // Look at localInnerProduct implementation,
-    // and do inside a site loop with block strided iterators
-    typedef typename vobj::scalar_object sobj;
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::vector_type vector_type;
-    typedef typename vobj::tensor_reduced scalar;
-    typedef typename scalar::scalar_object  scomplex;
-  
-    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
-    vec.resize(Nblock);
-    std::vector<scomplex> sip(Nblock);
-    Lattice<scalar> IP(lhs._grid); 
-    IP=localInnerProduct(lhs,rhs);
-    sliceSum(IP,sip,Orthog);
-  
-    for(int ss=0;ss<Nblock;ss++){
-      vec[ss] = TensorRemove(sip[ss]);
-    }
-  }
-*/
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// FIXME: Implementation is slow
-// If we based this on Cshift it would work for spread out
-// but it would be even slower
-//
-// Repeated extract slice is inefficient
-//
-// Best base the linear combination by constructing a 
-// set of vectors of size grid->_rdimensions[Orthog].
-//////////////////////////////////////////////////////////////////////////////////////////
-
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
@@ -453,7 +388,6 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }

-
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
@@ -462,28 +396,103 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  typedef typename vobj::vector_type vector_type;

  int Nblock = X._grid->GlobalDimensions()[Orthog];
-  
+
  GridBase *FullGrid  = X._grid;
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
+
  Lattice<vobj> Xslice(SliceGrid);
  Lattice<vobj> Rslice(SliceGrid);
-  
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Xslice,X,j,Orthog);
-      Rslice = Rslice + Xslice*(scale*aa(j,i));
-    }
-    InsertSlice(Rslice,R,i,Orthog);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = Y[o+i*ostride];
+	for(int j=0;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
  }
 };

+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  // R[i] = sum_j X[j]*(scale*aa(j,i)) for slices i,j along dimension Orthog: the
+  // right-multiplication convention of sliceMaddMatrix without the additive Y term.
+  // aa is indexed (j,i) = (input slice, output slice); scale multiplies every coefficient.
+
+  int Nblock = X._grid->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X._grid;
+  // The makeSubSliceGrid() grid (heap-allocated, never freed) and the Xslice/Rslice
+  // fields built on it were never used by the computation and leaked one grid per
+  // call, so they are no longer created; the loops index the full lattices directly.
+  // NOTE(review): slices are addressed with the global extent of Orthog; presumably
+  // that dimension must not be decomposed across ranks -- confirm with callers.
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  // (asserted above: the SIMD layout along Orthog is 1)
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = s_x[0]*(scale*aa(0,i));
+	for(int j=1;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
+  }
+
+};
+
+
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
-  // FIXME: Implementation is slow
-  // Not sure of best solution.. think about it
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -497,22 +506,49 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  Lattice<vobj> Rslice(SliceGrid);
  
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Lslice,lhs,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Rslice,rhs,j,Orthog);
-      mat(i,j) = innerProduct(Lslice,Rslice);
-    }
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+
+  typedef typename vobj::vector_typeD vector_typeD;
+
+#pragma omp parallel 
+  {
+    std::vector<vobj> Left(Nblock);
+    std::vector<vobj> Right(Nblock);
+    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+
+      int o  = n*stride + b;
+
+      for(int i=0;i<Nblock;i++){
+	Left [i] = lhs[o+i*ostride];
+	Right[i] = rhs[o+i*ostride];
+      }
+
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+	auto tmp = innerProduct(Left[i],Right[j]);
+	vector_typeD rtmp = TensorRemove(tmp);
+	mat_thread(i,j) += Reduce(rtmp);
+      }}
+    }}
+#pragma omp critical
+    {
+      mat += mat_thread;
+    }  
  }
-#undef FORCE_DIAG
-#ifdef FORCE_DIAG
-  for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
-      if ( i != j ) mat(i,j)=0.0;
-    }
-  }
-#endif
  return;
 }

--- a/lib/perfmon/PerfCount.cc
+++ b/lib/perfmon/PerfCount.cc
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
    // 4
-#ifdef AVX512
+#ifdef KNL
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
--- a/lib/qcd/action/fermion/Fermion.h
+++ b/lib/qcd/action/fermion/Fermion.h
@@ -237,4 +237,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion

  }}

+////////////////////
+// Scalar QED actions
+// TODO: this needs to move to another header after rename to Fermion.h
+////////////////////
+#include <Grid/qcd/action/scalar/Scalar.h>
+#include <Grid/qcd/action/gauge/Photon.h>
+
 #endif
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -0,0 +1,286 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/action/gauge/Photon.h
+ 
+ Copyright (C) 2015
+ 
+ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PHOTON_ACTION_H
+#define QCD_PHOTON_ACTION_H
+
+namespace Grid{
+namespace QCD{
+  template <class S>
+  class QedGimpl // type bundle for a gauge field whose links are single numbers of SIMD type S
+  {
+  public:
+    typedef S Simd;
+    // One link: vtype wrapped in three iScalar levels
+    template <typename vtype>
+    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
+    template <typename vtype>
+    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
+    // Per-site types instantiated for the chosen SIMD type
+    typedef iImplGaugeLink<Simd>  SiteLink;
+    typedef iImplGaugeField<Simd> SiteField;
+    typedef SiteField             SiteComplex; // alias of the Nd-component site type
+    // Lattice-wide containers of the site types above
+    typedef Lattice<SiteLink>  LinkField;
+    typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
+  };
+  
+  typedef QedGimpl<vComplex> QedGimplR;
+  
+  template<class Gimpl>
+  class Photon
+  {
+  public:
+    INHERIT_GIMPL_TYPES(Gimpl);
+    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3); // gauge choice, stored in gauge_
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);              // zero-mode scheme, read by zmSub
+  public:
+    Photon(Gauge gauge, ZmScheme zmScheme);
+    virtual ~Photon(void) = default;
+    void FreePropagator(const GaugeField &in, GaugeField &out);          // forward FFT, momentum-space propagator, backward FFT
+    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out); // out = in * (zero-mode-subtracted inverse k-hat squared)
+    void StochasticWeight(GaugeLinkField &weight);                       // weight = sqrt(vol * Re(inverse k-hat squared)), then zmSub
+    void StochasticField(GaugeField &out, GridParallelRNG &rng);         // builds the weight, then calls the overload below
+    void StochasticField(GaugeField &out, GridParallelRNG &rng,
+                         const GaugeLinkField &weight);
+  private:
+    void invKHatSquared(GaugeLinkField &out); // inverse of sum_mu (2 sin(...))^2, with the all-zero site set to zero
+    void zmSub(GaugeLinkField &out);          // zeroes the sites selected by zmScheme_
+  private:
+    Gauge    gauge_;
+    ZmScheme zmScheme_;
+  };
+
+  typedef Photon<QedGimplR>  PhotonR;
+  
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme) // stores the gauge and zero-mode scheme selections
+  : gauge_(gauge), zmScheme_(zmScheme)
+  {}
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
+  {
+    FFT theFFT(in._grid);
+    // Transform `in` forward, apply the momentum-space propagator, transform back into `out`.
+    GaugeField in_k(in._grid);
+    GaugeField prop_k(in._grid);
+    // MomentumSpacePropagator takes (input, output): it must read in_k and fill prop_k.
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    MomentumSpacePropagator(in_k,prop_k);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    GaugeLinkField     kmu(grid), one(grid);
+    const unsigned int nd    = grid->_ndimension;
+    std::vector<int>   &l    = grid->_fdimensions;
+    std::vector<int>   zm(nd,0);               // coordinate with every component zero
+    TComplex           Tone = Complex(1.0,0.0);
+    TComplex           Tzero= Complex(0.0,0.0);
+    // Accumulate out = sum_mu [2 sin(pi*n_mu/l_mu)]^2; n_mu is presumably the site coordinate filled in by LatticeCoordinate
+    one = Complex(1.0,0.0);
+    out = zero;
+    for(int mu = 0; mu < nd; mu++)
+    {
+      Real twoPiL = M_PI*2./l[mu];
+      
+      LatticeCoordinate(kmu,mu);
+      kmu = 2.*sin(.5*twoPiL*kmu);
+      out = out + kmu*kmu;
+    }
+    pokeSite(Tone, out, zm);  // put 1 at the all-zero site before inverting (the sum above vanishes where every n_mu is 0)
+    out = one/out;
+    pokeSite(Tzero, out, zm); // then set that site of the inverse to zero
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    const unsigned int nd    = grid->_ndimension;
+    // Zero the sites of `out` selected by zmScheme_; any other scheme value leaves `out` unchanged.
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      {
+        std::vector<int> zm(nd,0);
+        TComplex         Tzero = Complex(0.0,0.0);
+        // qedTL: only the site whose coordinates are all zero is cleared
+        pokeSite(Tzero, out, zm);
+        
+        break;
+      }
+      case ZmScheme::qedL:
+      {
+        LatticeInteger spNrm(grid), coor(grid);
+        // (the unused full-lattice temporary formerly declared here has been dropped)
+        // spNrm = sum of squared coordinates over every direction except the last one
+        spNrm = zero;
+        for(int d = 0; d < grid->_ndimension - 1; d++)
+        {
+          LatticeCoordinate(coor,d);
+          spNrm = spNrm + coor*coor;
+        }
+        out = where(spNrm == Integer(0), 0.*out, out); // clear every site where that sum is zero
+        
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
+                                               GaugeField &out)
+  {
+    // out = in * (1/khat^2 with the entries selected by zmScheme_ zeroed)
+    LatticeComplex     kernel(out._grid);
+    
+    invKHatSquared(kernel);
+    zmSub(kernel);
+    
+    out = in*kernel;
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
+  {
+    // On return weight = sqrt(V * Re(1/khat^2)) with the zmSub entries zeroed,
+    // where V is the product of the full lattice dimensions.
+    auto                   *cartGrid = dynamic_cast<GridCartesian *>(weight._grid);
+    const unsigned int     nDim      = cartGrid->_ndimension;
+    const std::vector<int> extents   = cartGrid->_fdimensions;
+    Integer                volume    = 1;
+    
+    for(int dir = 0; dir < nDim; dir++) volume = volume * extents[dir];
+    
+    invKHatSquared(weight);
+    weight = sqrt(volume*real(weight));
+    zmSub(weight);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  {
+    // Convenience overload: compute the weight here, then defer to the three-argument form.
+    GaugeLinkField sqrtWeight(dynamic_cast<GridCartesian *>(out._grid));
+    
+    StochasticWeight(sqrtWeight);
+    StochasticField(out, rng, sqrtWeight);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+                                      const GaugeLinkField &weight) // weight multiplies the Gaussian noise site by site
+  {
+    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     r(grid);
+    GaugeField         aTilde(grid);
+    FFT                fft(grid);
+    
+    for(int mu = 0; mu < nd; mu++)
+    {
+      gaussian(rng, r); // fresh Gaussian draw for each Lorentz component
+      r = weight*r;
+      pokeLorentz(aTilde, r, mu);
+    }
+    fft.FFT_all_dim(out, aTilde, FFT::backward);
+    
+    out = real(out); // keep only the real part of the transformed field
+  }
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
+//                                                            const GaugeField &in)
+//  {
+//    
+//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
+//    
+//    GridBase *grid = out._grid;
+//    LatticeInteger     coor(grid);
+//    GaugeField zz(grid); zz=zero;
+//    
+//    // xyzt
+//    for(int d = 0; d < grid->_ndimension-1;d++){
+//      LatticeCoordinate(coor,d);
+//      out = where(coor==Integer(0),zz,out);
+//    }
+//  }
+//  
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
+//                                                             const GaugeField &in)
+//  {
+//    
+//    // what type LatticeComplex
+//    GridBase *grid = out._grid;
+//    int nd = grid->_ndimension;
+//    
+//    typedef typename GaugeField::vector_type vector_type;
+//    typedef typename GaugeField::scalar_type ScalComplex;
+//    typedef Lattice<iSinglet<vector_type> > LatComplex;
+//    
+//    std::vector<int> latt_size   = grid->_fdimensions;
+//    
+//    LatComplex denom(grid); denom= zero;
+//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
+//    LatComplex   kmu(grid);
+//    
+//    ScalComplex ci(0.0,1.0);
+//    // momphase = n * 2pi / L
+//    for(int mu=0;mu<Nd;mu++) {
+//      
+//      LatticeCoordinate(kmu,mu);
+//      
+//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+//      
+//      kmu = TwoPiL * kmu ;
+//      
+//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
+//    }
+//    std::vector<int> zero_mode(nd,0);
+//    TComplexD Tone = ComplexD(1.0,0.0);
+//    TComplexD Tzero= ComplexD(0.0,0.0);
+//    
+//    pokeSite(Tone,denom,zero_mode);
+//    
+//    denom= one/denom;
+//    
+//    pokeSite(Tzero,denom,zero_mode);
+//    
+//    out = zero;
+//    out = in*denom;
+//  };
+  
+}}
+#endif
--- a/lib/qcd/action/scalar/Scalar.h
+++ b/lib/qcd/action/scalar/Scalar.h
@@ -31,6 +31,7 @@ directory

 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
+#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>

 namespace Grid {
 namespace QCD {
@@ -39,6 +40,10 @@ namespace QCD {
  typedef ScalarAction<ScalarImplF>                 ScalarActionF;
  typedef ScalarAction<ScalarImplD>                 ScalarActionD;

+  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
+  
 }
 }

--- a/lib/qcd/action/scalar/ScalarAction.h
+++ b/lib/qcd/action/scalar/ScalarAction.h
@@ -6,10 +6,10 @@

  Copyright (C) 2015

-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+  Author: neo <cossu@post.kek.jp>
+  Author: paboyle <paboyle@ph.ed.ac.uk>

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
@@ -35,50 +35,49 @@ directory

 namespace Grid {
  // FIXME drop the QCD namespace everywhere here
-  
-  template <class Impl>
-  class ScalarAction : public QCD::Action<typename Impl::Field> {
-  public:
+
+template <class Impl>
+class ScalarAction : public QCD::Action<typename Impl::Field> {
+ public:
    INHERIT_FIELD_TYPES(Impl);
-    
-  private:
+
+ private:
    RealD mass_square;
    RealD lambda;
-    
-  public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};

-    virtual std::string LogParameters(){
+ public:
+    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
+
+    virtual std::string LogParameters() {
      std::stringstream sstream;
      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
      return sstream.str();
-      
    }
-    
-    virtual std::string action_name(){return "ScalarAction";}
-    
-    virtual void refresh(const Field &U,
-			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
-    
+    virtual std::string action_name() {return "ScalarAction";}
+
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
+
    virtual RealD S(const Field &p) {
      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
+    (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
+    ScalarObs<Impl>::sumphider(p);
    };
-    
+
    virtual void deriv(const Field &p,
-		       Field &force) {
+                       Field &force) {
      Field tmp(p._grid);
      Field p2(p._grid);
      ScalarObs<Impl>::phisquared(p2, p);
      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
-  };
-  
-} // Grid
+
+      force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
+    }
+};
+
+
+
+}  // namespace Grid

 #endif // SCALAR_ACTION_H
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -5,96 +5,158 @@
 namespace Grid {
  //namespace QCD {

-  template <class S>
-  class ScalarImplTypes {
-  public:
+template <class S>
+class ScalarImplTypes {
+ public:
    typedef S Simd;
-    
+
    template <typename vtype>
    using iImplField = iScalar<iScalar<iScalar<vtype> > >;
-    
+
    typedef iImplField<Simd> SiteField;
-    
+    typedef SiteField        SitePropagator;
+    typedef SiteField        SiteComplex;
    
    typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
+    typedef Field              FermionField;
+    typedef Field              PropagatorField;
    
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
      gaussian(pRNG, P);
    }
-    
+
    static inline Field projectForce(Field& P){return P;}
-    
-    static inline void update_field(Field& P, Field& U, double ep){
+
+    static inline void update_field(Field& P, Field& U, double ep) {
      U += P*ep;
    }
-    
-    static inline RealD FieldSquareNorm(Field& U){
+
+    static inline RealD FieldSquareNorm(Field& U) {
      return (- sum(trace(U*U))/2.0);
    }
-    
+
    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
      gaussian(pRNG, U);
    }
-    
+
    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
      gaussian(pRNG, U);
    }
-    
+
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
      U = 1.0;
    }
    
+    static void MomentumSpacePropagator(Field &out, RealD m) // fills out with 1/(m^2 + sum_mu (2 sin(pi n_mu/L_mu))^2)
+    {
+      GridBase           *grid = out._grid;
+      Field              kmu(grid), one(grid);
+      const unsigned int nd    = grid->_ndimension;
+      std::vector<int>   &l    = grid->_fdimensions;
+      
+      one = Complex(1.0,0.0);
+      out = m*m; // start the denominator from the mass term
+      for(int mu = 0; mu < nd; mu++)
+      {
+        Real twoPiL = M_PI*2./l[mu];
+        
+        LatticeCoordinate(kmu,mu);
+        kmu = 2.*sin(.5*twoPiL*kmu);
+        out = out + kmu*kmu;
+      }
+      out = one/out; // note: with m == 0 the denominator is zero at the all-zero site
+    }
+    
+    static void FreePropagator(const Field &in, Field &out,
+                               const Field &momKernel)
+    {
+      // forward FFT, multiply by the momentum-space kernel, transform back
+      FFT   transform((GridCartesian *)in._grid);
+      Field scratch(in._grid);
+      transform.FFT_all_dim(scratch, in, FFT::forward);
+      scratch = scratch*momKernel;
+      transform.FFT_all_dim(out, scratch, FFT::backward);
+    }
+    
+    static void FreePropagator(const Field &in, Field &out, RealD m)
+    {
+      // build the kernel for mass m, then reuse the kernel-based overload
+      Field kernel(in._grid);
+      MomentumSpacePropagator(kernel, m);
+      FreePropagator(in, out, kernel);
+    }
+    
  };

  template <class S, unsigned int N>
-  class ScalarMatrixImplTypes {
+  class ScalarAdjMatrixImplTypes {
  public:
    typedef S Simd;
+    typedef QCD::SU<N> Group;
    
    template <typename vtype>
-    using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
+    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
+    template <typename vtype>
+    using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
+
+    typedef iImplField<Simd>   SiteField;
+    typedef SiteField          SitePropagator;
+    typedef iImplComplex<Simd> SiteComplex;
    
-    typedef iImplField<Simd> SiteField;
-    
-    
-    typedef Lattice<SiteField> Field;
-    
-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
-      gaussian(pRNG, P);
+    typedef Lattice<SiteField>   Field;
+    typedef Lattice<SiteComplex> ComplexField;
+    typedef Field                FermionField;
+    typedef Field                PropagatorField;
+
+    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
    }
-    
-    static inline Field projectForce(Field& P){return P;}
-    
-    static inline void update_field(Field& P, Field& U, double ep){
+
+    static inline Field projectForce(Field& P) {return P;}
+
+    static inline void update_field(Field& P, Field& U, double ep) {
      U += P*ep;
    }
-    
-    static inline RealD FieldSquareNorm(Field& U){
-      return (TensorRemove(- sum(trace(U*U))*0.5).real());
+
+    static inline RealD FieldSquareNorm(Field& U) {
+      return (TensorRemove(sum(trace(U*U))).real());
    }
-    
+
    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
    }
-    
+
    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
    }
-    
+
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-      U = 1.0;
+      U = zero;
    }
-    
+
  };


-  
-  
+
+
  typedef ScalarImplTypes<vReal> ScalarImplR;
  typedef ScalarImplTypes<vRealF> ScalarImplF;
  typedef ScalarImplTypes<vRealD> ScalarImplD;
+  typedef ScalarImplTypes<vComplex> ScalarImplCR;
+  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
+  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
+    
+  // Hardcoding here the size of the matrices
+  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
+  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
+  typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
+
+  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
+  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
+  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
  
-  //} 
-} 
+  //}
+}

 #endif
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -6,10 +6,7 @@

  Copyright (C) 2015

-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: Guido Cossu <guido.cossu@ed.ac.uk>

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
@@ -30,55 +27,122 @@ directory
  *************************************************************************************/
 /*  END LEGAL */

-#ifndef SCALAR_ACTION_H
-#define SCALAR_ACTION_H
+#ifndef SCALAR_INT_ACTION_H
+#define SCALAR_INT_ACTION_H
+
+
+// Note: this action can completely absorb the ScalarAction for real float fields
+// use the scalarObjs to generalise the structure

 namespace Grid {
  // FIXME drop the QCD namespace everywhere here
-  
-  template <class Impl>
+
+  template <class Impl, int Ndim >
  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
  public:
    INHERIT_FIELD_TYPES(Impl);
-    
  private:
    RealD mass_square;
    RealD lambda;
-    
-  public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};

-    virtual std::string LogParameters(){
+
+    typedef typename Field::vector_object vobj;
+    typedef CartesianStencil<vobj,vobj> Stencil;
+
+    SimpleCompressor<vobj> compressor;
+    int npoint = 2*Ndim;
+    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
+    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
+
+
+  public:
+
+    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), directions(2*Ndim,0), displacements(2*Ndim,0){ // initialisers listed in member declaration order
+      for (int mu = 0 ; mu < Ndim; mu++){
+        directions[mu]         = mu; directions[mu+Ndim]    = mu;  // each direction appears twice: index mu and mu+Ndim
+        displacements[mu]      =  1; displacements[mu+Ndim] = -1;  // index mu is the +1 shift, mu+Ndim the -1 shift
+      }
+    }
+
+    virtual std::string LogParameters() {
      std::stringstream sstream;
      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
      return sstream.str();
-      
    }
-    
-    virtual std::string action_name(){return "ScalarAction";}
-    
-    virtual void refresh(const Field &U,
-			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
-    
+
+    virtual std::string action_name() {return "ScalarAction";}
+
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+
    virtual RealD S(const Field &p) {
-      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
+      assert(p._grid->Nd() == Ndim);
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+      phiStencil.HaloExchange(p, compressor);
+      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
+      phisquared = p*p;
+      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
+      for (int mu = 0; mu < Ndim; mu++) {
+	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
+	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
+	  int permute_type;
+	  StencilEntry *SE;
+	  vobj temp2;
+	  const vobj *temp, *t_p;
+	    
+	  SE = phiStencil.GetEntry(permute_type, mu, i);
+	  t_p  = &p._odata[i];
+	  if ( SE->_is_local ) {
+	    temp = &p._odata[SE->_offset];
+	    if ( SE->_permute ) {
+	      permute(temp2, *temp, permute_type);
+	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
+	    } else {
+	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
+	    }
+	  } else {
+	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
+	  }
+	}
+	//  action -= pshift*p + p*pshift;
+      }
+      // NB the trace in the algebra is normalised to 1/2
+      // minus sign coming from the antihermitian fields
+      return -(TensorRemove(sum(trace(action)))).real();
    };
-    
-    virtual void deriv(const Field &p,
-		       Field &force) {
-      Field tmp(p._grid);
-      Field p2(p._grid);
-      ScalarObs<Impl>::phisquared(p2, p);
-      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
-      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+
+    virtual void deriv(const Field &p, Field &force) {
+      assert(p._grid->Nd() == Ndim);
+      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
+      // move this outside
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+      phiStencil.HaloExchange(p, compressor);
      
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
+      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+      for (int point = 0; point < npoint; point++) {
+	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
+	  const vobj *temp;
+	  vobj temp2;
+	  int permute_type;
+	  StencilEntry *SE;
+	  SE = phiStencil.GetEntry(permute_type, point, i);
+	  
+	  if ( SE->_is_local ) {
+	    temp = &p._odata[SE->_offset];
+	    if ( SE->_permute ) {
+	      permute(temp2, *temp, permute_type);
+	      force._odata[i] -= temp2;
+	    } else {
+	      force._odata[i] -= *temp;
+	    }
+	  } else {
+	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+	  }
+	}
+      }
+    }
  };
  
-} // Grid
+}  // namespace Grid

-#endif // SCALAR_ACTION_H
+#endif  // SCALAR_INT_ACTION_H
--- a/lib/qcd/hmc/GenericHMCrunner.h
+++ b/lib/qcd/hmc/GenericHMCrunner.h
@@ -207,6 +207,12 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator,
 typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
    ScalarGenericHMCRunner;

+typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
+    ScalarAdjGenericHMCRunner;
+
+template <int Colours> 
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+
 }  // namespace QCD
 }  // namespace Grid

--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@@ -76,7 +76,7 @@ struct HMCparameters: Serializable {

  template < class ReaderClass > 
  void initialize(Reader<ReaderClass> &TheReader){
-  	std::cout << "Reading HMC\n";
+  	std::cout << GridLogMessage << "Reading HMC\n";
  	read(TheReader, "HMC", *this);
  }

--- a/lib/qcd/hmc/HMCResourceManager.h
+++ b/lib/qcd/hmc/HMCResourceManager.h
@@ -253,6 +253,7 @@ class HMCResourceManager {
  template<class T, class... Types>
  void AddObservable(Types&&... Args){
    ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
+    ObservablesList.back()->print_parameters();
  }

  std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
@@ -297,4 +298,4 @@ private:
 }
 }

-#endif  // HMC_RESOURCE_MANAGER_H
+#endif  // HMC_RESOURCE_MANAGER_H
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    FieldMetaData header;
    IldgReader _IldgReader;
    _IldgReader.open(config);
-    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.readConfiguration(U,header);  // format from the header
    _IldgReader.close();

    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
--- a/lib/qcd/representations/hmc_types.h
+++ b/lib/qcd/representations/hmc_types.h
@@ -62,7 +62,10 @@ class Representations {

 typedef Representations<FundamentalRepresentation> NoHirep;
 typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
-  //typedef Representations<EmptyRep<typename ScalarMatrixImplR::Field> > ScalarMatrixFields;
+typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields;
+
+template < int Colours> 
+using ScalarNxNMatrixFields = Representations<EmptyRep<typename ScalarNxNAdjImplR<Colours>::Field> >;

 // Helper classes to access the elements
 // Strips the first N parameters from the tuple
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/lib/qcd/smearing/WilsonFlow.h
@@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
    if (maxTau - taus < epsilon){
        epsilon = maxTau-taus;
    }
-    std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
    GaugeField Z(U._grid);
    GaugeField Zprime(U._grid);
    GaugeField tmp(U._grid), Uprime(U._grid);
@@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
    // adjust integration step
    
    taus += epsilon;
-    std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
+    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
    
    epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
-    std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;

 }

@@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
    out = in;
    for (unsigned int step = 1; step <= Nstep; step++) {
        auto start = std::chrono::high_resolution_clock::now();
-        std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
        evolve_step(out);
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> diff = end - start;
@@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
    unsigned int step = 0;
    do{
        step++;
-        std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
+        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
        evolve_step_adaptive(out, maxTau);
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
            << step << "  "
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -0,0 +1,188 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once  // Grid.h includes this header; guard against a second, direct include
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+template <class Gimpl> 
+class FourierAcceleratedGaugeFixer  : public Gimpl {
+  public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+
+  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
+    for(int mu=0;mu<Nd;mu++){
+      Complex cmi(0.0,-1.0);
+      A[mu] = Ta(U[mu]) * cmi;
+    }
+  }
+  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
+    dmuAmu=zero;
+    for(int mu=0;mu<Nd;mu++){
+      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+    }
+  }  
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
+    GridBase *grid = Umu._grid;
+
+    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+    Real old_trace = org_link_trace;
+    Real trG;
+
+    std::vector<GaugeMat> U(Nd,grid);
+                 GaugeMat dmuAmu(grid);
+
+    for(int i=0;i<maxiter;i++){
+      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
+      if ( Fourier==false ) { 
+	trG = SteepestDescentStep(U,alpha,dmuAmu);
+      } else { 
+	trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
+      }
+      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+      // Monitor progress and convergence test 
+      // infrequently to minimise cost overhead
+      if ( i %20 == 0 ) { 
+	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+
+	if (Fourier) 
+	  std::cout << GridLogMessage << "Fourier Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
+	else 
+	  std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
+	
+	Real Phi  = 1.0 - old_trace / link_trace ;
+	Real Omega= 1.0 - trG;
+
+
+	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
+	if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
+	  std::cout << GridLogMessage << "Converged ! "<<std::endl;
+	  return;
+	}
+
+	old_trace = link_trace;
+
+      }
+    }
+  };
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
+    GridBase *grid = U[0]._grid;
+
+    std::vector<GaugeMat> A(Nd,grid);
+    GaugeMat g(grid);
+
+    GaugeLinkToLieAlgebraField(U,A);
+    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
+
+
+    Real vol = grid->gSites();
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+
+    SU<Nc>::GaugeTransform(U,g);
+
+    return trG;
+  }
+
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
+
+    GridBase *grid = U[0]._grid;
+
+    Real vol = grid->gSites();
+
+    FFT theFFT((GridCartesian *)grid);
+
+    LatticeComplex  Fp(grid);
+    LatticeComplex  psq(grid); psq=zero;
+    LatticeComplex  pmu(grid); 
+    LatticeComplex   one(grid); one = Complex(1.0,0.0);
+
+    GaugeMat g(grid);
+    GaugeMat dmuAmu_p(grid);
+    std::vector<GaugeMat> A(Nd,grid);
+
+    GaugeLinkToLieAlgebraField(U,A);
+
+    DmuAmu(A,dmuAmu);
+
+    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
+
+    //////////////////////////////////
+    // Work out Fp = psq_max/ psq...
+    //////////////////////////////////
+    std::vector<int> latt_size = grid->GlobalDimensions();
+    std::vector<int> coor(grid->_ndimension,0);
+    for(int mu=0;mu<Nd;mu++) {
+
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      LatticeCoordinate(pmu,mu);
+      pmu = TwoPiL * pmu ;
+      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
+    }
+
+    Complex psqMax(16.0);
+    Fp =  psqMax*one/psq;
+
+    /*
+    static int once;
+    if ( once == 0 ) { 
+      std::cout << " Fp " << Fp <<std::endl;
+      once ++;
+      }*/
+
+    pokeSite(TComplex(1.0),Fp,coor);
+
+    dmuAmu_p  = dmuAmu_p * Fp; 
+
+    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
+
+    GaugeMat ciadmam(grid);
+    Complex cialpha(0.0,-alpha);
+    ciadmam = dmuAmu*cialpha;
+    SU<Nc>::taExp(ciadmam,g);
+
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+
+    SU<Nc>::GaugeTransform(U,g);
+
+    return trG;
+  }
+
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
+    GridBase *grid = g._grid;
+    Complex cialpha(0.0,-alpha);
+    GaugeMat ciadmam(grid);
+    DmuAmu(A,dmuAmu);
+    ciadmam = dmuAmu*cialpha;
+    SU<Nc>::taExp(ciadmam,g);
+  }  
+};
+
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>

    for (int a = 0; a < AdjointDimension; a++) {
      generator(a, Ta);
-      auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
-      pokeColour(h_out, tmp, a);
+      pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
    }
  }

--- a/lib/serialisation/Hdf5IO.cc
+++ b/lib/serialisation/Hdf5IO.cc
@@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
                      Hdf5Type<unsigned int>::type());
 }

-void Hdf5Reader::push(const std::string &s)
+bool Hdf5Reader::push(const std::string &s)
 {
  group_ = group_.openGroup(s);
  path_.push_back(s);
+  
+  return true;
 }

 void Hdf5Reader::pop(void)
--- a/lib/serialisation/Hdf5IO.h
+++ b/lib/serialisation/Hdf5IO.h
@@ -54,7 +54,7 @@ namespace Grid
  public:
    Hdf5Reader(const std::string &fileName);
    virtual ~Hdf5Reader(void) = default;
-    void push(const std::string &s);
+    bool push(const std::string &s);
    void pop(void);
    template <typename U>
    void readDefault(const std::string &s, U &output);
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -701,9 +701,28 @@ namespace Optimization {
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i ret;
+#if defined (AVX2)
+    // AVX2 horizontal adds within upper and lower halves of register; use
+    // SSE to add upper and lower halves for result.
+    __m256i v1, v2;
+    __m128i u1, u2;
+    v1  = _mm256_hadd_epi32(in, in);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);      // lower half (the cast keeps bits 0-127)
+    u2  = _mm256_extracti128_si256(v2, 1); // upper half (index 1 selects bits 128-255)
+    ret = _mm_add_epi32(u1, u2);
+#else
+    // No AVX horizontal add; extract upper and lower halves of register & use
+    // SSE intrinsics.
+    __m128i u1, u2, u3;
+    u1  = _mm256_extractf128_si256(in, 0); // lower half (index 0)
+    u2  = _mm256_extractf128_si256(in, 1); // upper half (index 1)
+    u3  = _mm_add_epi32(u1, u2);
+    u1  = _mm_hadd_epi32(u3, u3);
+    ret = _mm_hadd_epi32(u1, u1);
+#endif
+    return _mm_cvtsi128_si32(ret);
  }

 }
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -543,6 +543,24 @@ namespace Optimization {
     u512d conv; conv.v = v1;
     return conv.f[0];
  }
+  
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+    // No full vector reduce here: add the lower and upper 256-bit halves, then
+    // finish with the AVX2 horizontal-add reduction and return lane 0.
+    __m256i v1, v2, v3;
+    __m128i u1, u2, ret;
+    v1  = _mm512_castsi512_si256(in);       // lower half
+    v2  = _mm512_extracti32x8_epi32(in, 1); // upper half
+    v3  = _mm256_add_epi32(v1, v2);
+    v1  = _mm256_hadd_epi32(v3, v3);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);       // lower half
+    u2  = _mm256_extracti128_si256(v2, 1);  // upper half
+    ret = _mm_add_epi32(u1, u2);
+    return _mm_cvtsi128_si32(ret);
+  }
 #else
  //Complex float Reduce
  template<>
@@ -570,9 +588,7 @@ namespace Optimization {
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
  }
 #endif
  
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -401,9 +401,7 @@ namespace Optimization {
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
  }
  
  
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,13 +1,14 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_neon.h

    Copyright (C) 2015

-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+    Author: Nils Meyer <nils.meyer@ur.de>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: neo <cossu@post.kek.jp>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv8

-  Experimental - Using intrinsics - DEVELOPING! 
+/*
+
+  ARMv8 NEON intrinsics layer by
+
+  Nils Meyer <nils.meyer@ur.de>,
+  University of Regensburg, Germany
+  SFB/TRR55
+
 */
-// Time-stamp: <2015-07-10 17:45:09 neo>
-//----------------------------------------------------------------------

+#ifndef GEN_SIMD_WIDTH
+#define GEN_SIMD_WIDTH 16u
+#endif
+
+#include "Grid_generic_types.h"
 #include <arm_neon.h>

-// ARMv8 supports double precision
-
+namespace Grid {
 namespace Optimization {

  template<class vtype>
@@ -46,16 +53,20 @@ namespace Optimization {
    float32x4_t f;
    vtype v;
  };
-
  union u128f {
    float32x4_t v;
    float f[4];
  };
  union u128d {
    float64x2_t v;
-    double f[4];
+    double f[2];
  };
-  
+  // half precision
+  union u128h {
+    float16x8_t v;
+    uint16_t f[8];
+  };
+
  struct Vsplat{
    //Complex float
    inline float32x4_t operator()(float a, float b){
@@ -64,31 +75,31 @@ namespace Optimization {
    }
    // Real float
    inline float32x4_t operator()(float a){
-      return vld1q_dup_f32(&a);
+      return vdupq_n_f32(a);
    }
    //Complex double
-    inline float32x4_t operator()(double a, double b){
-      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
-      return vld1q_f32(tmp);
+    inline float64x2_t operator()(double a, double b){
+      double tmp[2]={a,b};
+      return vld1q_f64(tmp);
    }
-    //Real double
-    inline float32x4_t operator()(double a){
-      return vld1q_dup_f32(&a);
+    //Real double // N:tbc
+    inline float64x2_t operator()(double a){
+      return vdupq_n_f64(a);
    }
-    //Integer
+    //Integer // N:tbc
    inline uint32x4_t operator()(Integer a){
-      return vld1q_dup_u32(&a);
+      return vdupq_n_u32(a);
    }
  };

  struct Vstore{
-    //Float 
+    //Float
    inline void operator()(float32x4_t a, float* F){
      vst1q_f32(F, a);
    }
    //Double
-    inline void operator()(float32x4_t a, double* D){
-      vst1q_f32((float*)D, a);
+    inline void operator()(float64x2_t a, double* D){
+      vst1q_f64(D, a);
    }
    //Integer
    inline void operator()(uint32x4_t a, Integer* I){
@@ -97,54 +108,54 @@ namespace Optimization {

  };

-  struct Vstream{
-    //Float
+  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
+    //Float // N:generic
    inline void operator()(float * a, float32x4_t b){
-    
+      memcpy(a,&b,4*sizeof(float));
    }
-    //Double
-    inline void operator()(double * a, float32x4_t b){
-  
+    //Double // N:generic
+    inline void operator()(double * a, float64x2_t b){
+      memcpy(a,&b,2*sizeof(double));
    }


  };

+  // Nils: Vset untested; not used currently in Grid at all;
+  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
  struct Vset{
-    // Complex float 
+    // Complex float // N:ok
    inline float32x4_t operator()(Grid::ComplexF *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
+      return vld1q_f32(tmp);
    }
-    // Complex double 
-    inline float32x4_t operator()(Grid::ComplexD *a){
-      float32x4_t foo;
-      return foo;
+    // Complex double // N:ok
+    inline float64x2_t operator()(Grid::ComplexD *a){
+      double tmp[2]={a[0].imag(),a[0].real()};
+      return vld1q_f64(tmp);
    }
-    // Real float 
+    // Real float // N:ok
    inline float32x4_t operator()(float *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[3],a[2],a[1],a[0]};
+      return vld1q_f32(tmp);
    }
-    // Real double
-    inline float32x4_t operator()(double *a){
-      float32x4_t foo;
-      return foo;
+    // Real double // N:ok
+    inline float64x2_t operator()(double *a){
+      double tmp[2]={a[1],a[0]};
+      return vld1q_f64(tmp);
    }
-    // Integer
+    // Integer // N:ok
    inline uint32x4_t operator()(Integer *a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(a);
    }
-
-
  };

+  // N:leaving as is
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
    //General form must generate error if compiled
-    inline Out_type operator()(In_type in){
+      inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
@@ -184,26 +195,98 @@ namespace Optimization {
    }
  };

+  struct MultRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vmulq_f32(re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vmulq_f64(re, b);
+    }
+  };
+
+  struct MaddRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vfmaq_f32(c, re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vfmaq_f64(c, re, b);
+    }
+  };
+
+  struct Div{
+    // Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      return vdivq_f32(a, b);
+    }
+    // Real double
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vdivq_f64(a, b);
+    }
+  };
+
  struct MultComplex{
    // Complex float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+
+      float32x4_t r0, r1, r2, r3, r4;
+
+      // a = ar ai Ar Ai
+      // b = br bi Br Bi
+      // collect real/imag part, negate bi and Bi
+      r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br
+      r1 = vnegq_f32(b);           // -br -bi -Br -Bi
+      r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi
+
+      // the fun part
+      r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ...
+      r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ...
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...
+
+      // no fma, use mul and add
+      //float32x4_t r5;
+      //r5 = vmulq_f32(r0, a);
+      //return vaddq_f32(r4, r5);
    }
    // Complex double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      float32x4_t foo;
-      return foo;
+
+      float64x2_t r0, r1, r2, r3, r4;
+
+      // b = br bi
+      // collect real/imag part, negate bi
+      r0 = vtrn1q_f64(b, b);       //  br  br
+      r1 = vnegq_f64(b);           // -br -bi
+      r2 = vtrn2q_f64(b, r1);      //  bi -bi
+
+      // the fun part
+      r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai
+      r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi
+
+      // no fma, use mul and add
+      //float64x2_t r5;
+      //r5 = vmulq_f64(r0, a);
+      //return vaddq_f64(r4, r5);
    }
  };

  struct Mult{
    // Real float
    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
-      return vaddq_f32(vmulq_f32(b,c),a);
+      //return vaddq_f32(vmulq_f32(b,c),a);
+      return vfmaq_f32(a, b, c);
    }
    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
-      return vaddq_f64(vmulq_f64(b,c),a);
+      //return vaddq_f64(vmulq_f64(b,c),a);
+      return vfmaq_f64(a, b, c);
    }
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vmulq_f32(a,b);
@@ -221,89 +304,275 @@ namespace Optimization {
  struct Conj{
    // Complex single
    inline float32x4_t operator()(float32x4_t in){
-      return in;
+      // ar ai br bi -> ar -ai br -bi
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
+      return vtrn1q_f32(in, r1); //  ar -ai  br -bi
    }
    // Complex double
-    //inline float32x4_t operator()(float32x4_t in){
-    // return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in){
+
+      float64x2_t r0, r1;
+      r0 = vextq_f64(in, in, 1);    //  ai  ar
+      r1 = vnegq_f64(r0);           // -ai -ar
+      return vextq_f64(r0, r1, 1);  //  ar -ai
+    }
    // do not define for integer input
  };

  struct TimesMinusI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      return in;
+      // ar ai br bi -> ai -ar bi -br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(in);      //  ai  ar  bi  br
+      return vtrn1q_f32(r1, r0); //  ai -ar  bi -br
    }
    //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return in;
-    //}
-
-
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> b -ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(in, tmp, 1);
+    }
  };

  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      //need shuffle
-      return in;
+      // ar ai br bi -> -ai ar -bi br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
+      return vtrn1q_f32(r1, in); // -ai  ar -bi  br
    }
    //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> -b ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(tmp, in, 1);
+    }
+  };
+
+  struct Permute{
+
+    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
+      // AB CD -> CD AB
+      return vextq_f32(in, in, 2);
+    };
+    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
+      // AB CD -> BA DC
+      return vrev64q_f32(in);
+    };
+    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+
+    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
+      // AB -> BA
+      return vextq_f64(in, in, 1);
+    };
+    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+
+  };
+
+  struct Rotate{
+
+    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
+      switch(n){
+      case 0: // AB CD -> AB CD
+        return tRotate<0>(in);
+        break;
+      case 1: // AB CD -> BC DA
+        return tRotate<1>(in);
+        break;
+      case 2: // AB CD -> CD AB
+        return tRotate<2>(in);
+        break;
+      case 3: // AB CD -> DA BC
+        return tRotate<3>(in);
+        break;
+      default: assert(0);
+      }
+    }
+    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
+      switch(n){
+      case 0: // AB -> AB
+        return tRotate<0>(in);
+        break;
+      case 1: // AB -> BA
+        return tRotate<1>(in);
+        break;
+      default: assert(0);
+      }
+    }
+
+// working, but no restriction on n
+//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
+//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
+
+// restriction on n
+    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
+    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
+
+  };
+
+  struct PrecisionChange {
+
+    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
+      float16x4_t h = vcvt_f16_f32(a);
+      return vcvt_high_f16_f32(h, b);
+    }
+    static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
+      sb = vcvt_high_f32_f16(h);
+      // sa comes from the lower four half lanes: rotate them into the upper half so vcvt_high_f32_f16 can be reused
+      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
+      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+      // workaround for clang
+      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
+      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
+      sa = vcvt_high_f32_f16(h1);
+    }
+    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
+      float32x2_t s = vcvt_f32_f64(a);
+      return vcvt_high_f32_f64(s, b);
+
+    }
+    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
+      b = vcvt_high_f64_f32(s);
+      // there is no direct conversion from lower float32x4_t to float64x2_t
+      float32x4_t s1 = vextq_f32(s, s, 2);
+      a = vcvt_high_f64_f32(s1);
+
+    }
+    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
+      float32x4_t s1 = DtoS(a, b);
+      float32x4_t s2 = DtoS(c, d);
+      return StoH(s1, s2);
+    }
+    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
+      float32x4_t s1, s2;
+      HtoS(h, s1, s2);
+      StoD(s1, a, b);
+      StoD(s2, c, d);
+    }
+  };
+
+  //////////////////////////////////////////////
+  // Exchange support
+
+  struct Exchange{
+    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      // in1: ABCD -> out1: ABEF
+      // in2: EFGH -> out2: CDGH
+
+      // z: CDAB
+      float32x4_t z = vextq_f32(in1, in1, 2);
+      // out1: ABEF
+      out1 = vextq_f32(z, in2, 2);
+
+      // z: GHEF
+      z = vextq_f32(in2, in2, 2);
+      // out2: CDGH
+      out2 = vextq_f32(in1, z, 2);
+    };
+
+    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      // in1: ABCD -> out1: AECG
+      // in2: EFGH -> out2: BFDH
+      out1 = vtrn1q_f32(in1, in2);
+      out2 = vtrn2q_f32(in1, in2);
+    };
+    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      assert(0);
+      return;
+    };
+    // double precision
+    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      // in1: AB -> out1: AC
+      // in2: CD -> out2: BD
+      out1 = vzip1q_f64(in1, in2);
+      out2 = vzip2q_f64(in1, in2);
+    };
+    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
  };

  //////////////////////////////////////////////
  // Some Template specialization
-  template < typename vtype > 
-    void permute(vtype &a, vtype b, int perm) {

-  }; 

  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x4_t v1; // two complex
+    v1 = Optimization::Permute::Permute0(in);
+    v1 = vaddq_f32(v1,in);
+    u128f conv;    conv.v=v1;
+    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    float32x2_t high = vget_high_f32(in);
-    float32x2_t low = vget_low_f32(in);
-    float32x2_t tmp = vadd_f32(low, high);
-    float32x2_t sum = vpadd_f32(tmp, tmp);
-    return vget_lane_f32(sum,0);
+    return vaddvq_f32(in);
  }
-  
-  
+
+
  //Complex double Reduce
-  template<>
+  template<> // N:by Boyle
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
-    return 0;
+    u128d conv; conv.v = in;
+    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }
-  
+
  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    float64x2_t sum = vpaddq_f64(in, in);
-    return vgetq_lane_f64(sum,0);
+    return vaddvq_f64(in);
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
    // FIXME unimplemented
-   printf("Reduce : Missing integer implementation -> FIX\n");
+    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }

 //////////////////////////////////////////////////////////////////////////////////////
-// Here assign types 
-namespace Grid {
+// Here assign types

+// typedef Optimization::vech SIMD_Htype; // Reduced precision type
+  typedef float16x8_t  SIMD_Htype; // Half precision type
  typedef float32x4_t  SIMD_Ftype; // Single precision type
  typedef float64x2_t  SIMD_Dtype; // Double precision type
  typedef uint32x4_t   SIMD_Itype; // Integer type
@@ -312,13 +581,6 @@ namespace Grid {
  inline void prefetch_HINT_T0(const char *ptr){};


-  // Gpermute function
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-
-
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
@@ -326,16 +588,19 @@ namespace Grid {
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

- 
+


  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;

-}
+}
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -374,6 +374,84 @@ namespace Optimization {
    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };
+#define USE_FP16
+  struct PrecisionChange {
+    static inline vech StoH (const vector4float &a, const vector4float &b) {
+      vech ret;
+      std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
+      assert(0);
+      return ret;
+    }
+    static inline void  HtoS (vech h, vector4float &sa, vector4float &sb) {
+      std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
+      assert(0);
+    }
+    static inline vector4float DtoS (vector4double a, vector4double b) {
+      vector4float ret;
+      std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
+      assert(0);
+      return ret;
+    }
+    static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
+      std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
+      assert(0);
+    }
+    static inline vech DtoH (vector4double a, vector4double b, 
+                             vector4double c, vector4double d) {
+      vech ret;
+      std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
+      assert(0);
+      return ret;
+    }
+    static inline void HtoD (vech h, vector4double &a, vector4double &b, 
+                                     vector4double &c, vector4double &d) {
+      std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
+      assert(0);
+    }
+  };
+
+  //////////////////////////////////////////////
+  // Exchange support
+#define FLOAT_WRAP_EXCHANGE(fn) \
+  static inline void fn(vector4float &out1, vector4float &out2, \
+                        vector4float in1,  vector4float in2) \
+  { \
+    vector4double out1d, out2d, in1d, in2d; \
+    in1d  = Vset()(in1);   \
+    in2d  = Vset()(in2);   \
+    fn(out1d, out2d, in1d, in2d); \
+    Vstore()(out1d, out1); \
+    Vstore()(out2d, out2); \
+  }
+
+  struct Exchange{
+
+    // double precision
+    static inline void Exchange0(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      out1 = vec_perm(in1, in2, vec_gpci(0145));
+      out2 = vec_perm(in1, in2, vec_gpci(02367));
+    }
+    static inline void Exchange1(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      out1 = vec_perm(in1, in2, vec_gpci(0426));
+      out2 = vec_perm(in1, in2, vec_gpci(01537));
+    }
+    static inline void Exchange2(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      assert(0);
+    }
+    static inline void Exchange3(vector4double &out1, vector4double &out2,
+                                 vector4double in1,  vector4double in2) {
+      assert(0);
+    }
+
+    // single precision
+    FLOAT_WRAP_EXCHANGE(Exchange0);
+    FLOAT_WRAP_EXCHANGE(Exchange1);
+    FLOAT_WRAP_EXCHANGE(Exchange2);
+    FLOAT_WRAP_EXCHANGE(Exchange3);
+  };

  struct Permute{
    //Complex double
@@ -497,15 +575,19 @@ namespace Optimization {
  
  //Integer Reduce
  template<>
-  inline Integer Reduce<Integer, int>::operator()(int in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+  inline Integer Reduce<Integer, veci>::operator()(veci in){
+    Integer a = 0;
+    for (unsigned int i = 0; i < W<Integer>::r; ++i)
+    {
+        a += in.v[i];
+    }
+    return a;
  }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Here assign types
+typedef Optimization::vech         SIMD_Htype;  // Half precision type
 typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
 typedef vector4double              SIMD_Dtype; // Double precision type
 typedef Optimization::veci         SIMD_Itype; // Integer type
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -570,9 +570,9 @@ namespace Optimization {
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
-    // FIXME unimplemented
-   printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i v1 = _mm_hadd_epi32(in, in);
+    __m128i v2 = _mm_hadd_epi32(v1, v1);
+    return _mm_cvtsi128_si32(v2);
  }
 }

--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -53,7 +53,7 @@ directory
 #if defined IMCI
 #include "Grid_imci.h"
 #endif
-#ifdef NEONv8
+#ifdef NEONV8
 #include "Grid_neon.h"
 #endif
 #if defined QPX
@@ -751,8 +751,8 @@ inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {

  conv.v = in.v;
  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    assert(conv.s[i + 1] ==
-           conv.s[i]);  // trap any cases where real was not duplicated
+    assert(conv.s[i + 1] == conv.s[i]);  
+    // trap any cases where real was not duplicated
    // indicating the SIMD grids of real and imag assignment did not correctly
    // match
    conv.s[i + 1] = 0.0;  // zero imaginary parts
--- a/lib/stencil/Lebesgue.cc
+++ b/lib/stencil/Lebesgue.cc
@@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

 int LebesgueOrder::UseLebesgueOrder;
+#ifdef KNL
 std::vector<int> LebesgueOrder::Block({8,2,2,2});
-
+#else
+std::vector<int> LebesgueOrder::Block({2,2,2,2});
+#endif
 LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
  n--;           // 1000 0011 --> 1000 0010
  n |= n >> 1;   // 1000 0010 | 0100 0001 = 1100 0011
@@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
  if ( Block[0]==0) ZGraph();
  else if ( Block[1]==0) NoBlocking();
  else CartesianBlocking();
-}

+  if (0) {
+    std::cout << "Thread Interleaving"<<std::endl;
+    ThreadInterleave();
+  } 
+}
+void LebesgueOrder::ThreadInterleave(void)
+{
+  std::vector<IndexInteger> reorder = _LebesgueReorder;
+  std::vector<IndexInteger> throrder;
+  int vol = _LebesgueReorder.size();
+  int threads = GridThread::GetThreads();
+  int blockbits=3;
+  int blocklen = 8;
+  int msk      = 0x7;
+
+  for(int t=0;t<threads;t++){
+    for(int ss=0;ss<vol;ss++){
+       if ( ( ss >> blockbits) % threads == t ) { 
+         throrder.push_back(reorder[ss]);
+       }
+    }
+  }
+  _LebesgueReorder = throrder;
+}
 void LebesgueOrder::NoBlocking(void) 
 {
  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
--- a/lib/stencil/Lebesgue.h
+++ b/lib/stencil/Lebesgue.h
@@ -70,6 +70,8 @@ namespace Grid {
 		  std::vector<IndexInteger> & xi,
 		  std::vector<IndexInteger> &dims);

+    void ThreadInterleave(void);
+
  private:
    std::vector<IndexInteger> _LebesgueReorder;

--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -285,7 +285,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  {
    int dimension    = _directions[point];
    int displacement = _distances[point];
-
+    
    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
    
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -156,11 +156,18 @@ class iScalar {

  // convert from a something to a scalar via constructor of something arg
  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
-    strong_inline iScalar<vtype> operator=(T arg) {
+  strong_inline iScalar<vtype> operator=(T arg) {
    _internal = arg;
    return *this;
  }

+  // Convert elements
+  template <class ttype>
+  strong_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
+    _internal = arg._internal;
+    return *this;
+  }
+
  friend std::ostream &operator<<(std::ostream &stream,const iScalar<vtype> &o) {
    stream << "S {" << o._internal << "}";
    return stream;
--- a/lib/tensors/Tensor_exp.h
+++ b/lib/tensors/Tensor_exp.h
@@ -80,8 +80,11 @@ template<class vtype, int N> inline iVector<vtype, N> Exponentiate(const iVector
      mat iQ2 = arg*arg*alpha*alpha;
      mat iQ3 = arg*iQ2*alpha;   
      // sign in c0 from the conventions on the Ta
-      c0 = -imag( trace(iQ3) ) * one_over_three;  
-      c1 = -real( trace(iQ2) ) * one_over_two;
+      scalar imQ3, reQ2;
+      imQ3 = imag( trace(iQ3) );
+      reQ2 = real( trace(iQ2) );
+      c0 = -imQ3 * one_over_three;  
+      c1 = -reQ2 * one_over_two;

      // Cayley Hamilton checks to machine precision, tested
      tmp = c1 * one_over_three;