From e3d53194704ea052968379660c1e7589440e5a05 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 6 Jul 2016 14:16:03 +0100
Subject: [PATCH] Debugged the real() and imag() functions and added tests to
 Test_Simd

---
 lib/Simd.h                       |   45 +-
 lib/lattice/Lattice_reduction.h  |    2 +-
 lib/qcd/smearing/StoutSmearing.h |  247 ++++---
 lib/simd/Grid_vector_types.h     | 1080 +++++++++++++++++-------------
 lib/simd/Grid_vector_unops.h     |  415 ++++++------
 tests/Test_simd.cc               |   77 ++-
 6 files changed, 999 insertions(+), 867 deletions(-)

diff --git a/lib/Simd.h b/lib/Simd.h
index de49cca7..6a812e5e 100644
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
 
-    Source file: ./lib/Simd.h
+Source file: ./lib/Simd.h
 
-    Copyright (C) 2015
+Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H
 
@@ -118,6 +119,14 @@ namespace Grid {
   inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
   inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
   inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+
+  // define projections to real and imaginay parts
+  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
+  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
+  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
+  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
+
+  // define auxiliary functions for complex computations
   inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
   inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
   inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 4599f2f9..2615af48 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -40,7 +40,7 @@ namespace Grid {
     ////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
     ComplexD nrm = innerProduct(arg,arg);
-    return real(nrm); 
+    return std::real(nrm); 
   }
 
     template<class vobj>
diff --git a/lib/qcd/smearing/StoutSmearing.h b/lib/qcd/smearing/StoutSmearing.h
index 511a5c29..50a09972 100644
--- a/lib/qcd/smearing/StoutSmearing.h
+++ b/lib/qcd/smearing/StoutSmearing.h
@@ -5,163 +5,156 @@
 #ifndef STOUT_SMEAR_
 #define STOUT_SMEAR_
 
-  namespace Grid {
-  	namespace QCD {
+namespace Grid {
+namespace QCD {
 
-    /*!  @brief Stout smearing of link variable. */
-    template <class Gimpl> 
-  		class Smear_Stout: public Smear<Gimpl> {
-  		private:
-  			const Smear < Gimpl > * SmearBase;
+/*!  @brief Stout smearing of link variable. */
+template <class Gimpl>
+class Smear_Stout : public Smear<Gimpl> {
+ private:
+  const Smear<Gimpl>* SmearBase;
 
-  		public:
-  			INHERIT_GIMPL_TYPES(Gimpl)
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl)
 
-  			Smear_Stout(Smear < Gimpl >* base):SmearBase(base){
-  				static_assert(Nc==3, "Stout smearing currently implemented only for Nc==3");
-  			}
+  Smear_Stout(Smear<Gimpl>* base) : SmearBase(base) {
+    static_assert(Nc == 3,
+                  "Stout smearing currently implemented only for Nc==3");
+  }
 
-			/*! Default constructor */
-  			Smear_Stout(double rho = 1.0):SmearBase(new Smear_APE < Gimpl > (rho)){
-  				static_assert(Nc==3, "Stout smearing currently implemented only for Nc==3");
-  			}
+  /*! Default constructor */
+  Smear_Stout(double rho = 1.0) : SmearBase(new Smear_APE<Gimpl>(rho)) {
+    static_assert(Nc == 3,
+                  "Stout smearing currently implemented only for Nc==3");
+  }
 
-  			~Smear_Stout(){} //delete SmearBase...
+  ~Smear_Stout() {}  // delete SmearBase...
 
-  			void smear(GaugeField& u_smr,const GaugeField& U) const{
-  				GaugeField C(U._grid);
-  				GaugeLinkField tmp(U._grid), iq_mu(U._grid), Umu(U._grid);
+  void smear(GaugeField& u_smr, const GaugeField& U) const {
+    GaugeField C(U._grid);
+    GaugeLinkField tmp(U._grid), iq_mu(U._grid), Umu(U._grid);
 
-  				std::cout<< GridLogDebug << "Stout smearing started\n";
+    std::cout << GridLogDebug << "Stout smearing started\n";
 
-				//Smear the configurations
-  				SmearBase->smear(C, U);
+    // Smear the configurations
+    SmearBase->smear(C, U);
 
-  				for (int mu = 0; mu<Nd; mu++)
-  				{
-  					tmp = peekLorentz(C,mu);
-  					Umu = peekLorentz(U,mu);
-		  			iq_mu = Ta(tmp * adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper
-		  			exponentiate_iQ(tmp, iq_mu);  
-					pokeLorentz(u_smr, tmp*Umu, mu);// u_smr = exp(iQ_mu)*U_mu
-				}
-				std::cout<< GridLogDebug << "Stout smearing completed\n";
-			};
+    for (int mu = 0; mu < Nd; mu++) {
+      tmp = peekLorentz(C, mu);
+      Umu = peekLorentz(U, mu);
+      iq_mu = Ta(
+          tmp *
+          adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
+      exponentiate_iQ(tmp, iq_mu);
+      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
+    }
+    std::cout << GridLogDebug << "Stout smearing completed\n";
+  };
 
+  void derivative(GaugeField& SigmaTerm, const GaugeField& iLambda,
+                  const GaugeField& Gauge) const {
+    SmearBase->derivative(SigmaTerm, iLambda, Gauge);
+  };
 
-			void derivative(GaugeField& SigmaTerm,
-				const GaugeField& iLambda,
-				const GaugeField& Gauge) const{
-				SmearBase->derivative(SigmaTerm, iLambda, Gauge);
-			};
+  void BaseSmear(GaugeField& C, const GaugeField& U) const {
+    SmearBase->smear(C, U);
+  };
 
+  void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const {
+    // Put this outside
+    // only valid for SU(3) matrices
 
-			void BaseSmear(GaugeField& C,
-				const GaugeField& U) const{
-				SmearBase->smear(C, U);
-			};
+    // only one Lorentz direction at a time
 
-			void exponentiate_iQ(GaugeLinkField& e_iQ,
-				const GaugeLinkField& iQ) const{
-		// Put this outside 
-		// only valid for SU(3) matrices
+    // notice that it actually computes
+    // exp ( input matrix )
+    // the i sign is coming from outside
+    // input matrix is anti-hermitian NOT hermitian
 
-		// only one Lorentz direction at a time 
+    GridBase* grid = iQ._grid;
+    GaugeLinkField unity(grid);
+    unity = 1.0;
 
-		// notice that it actually computes
-		// exp ( input matrix )  
-		// the i sign is coming from outside
-		// input matrix is anti-hermitian NOT hermitian
+    GaugeLinkField iQ2(grid), iQ3(grid);
+    LatticeComplex u(grid), w(grid);
+    LatticeComplex f0(grid), f1(grid), f2(grid);
 
-				GridBase *grid = iQ._grid;
-				GaugeLinkField unity(grid);
-				unity=1.0;
+    iQ2 = iQ * iQ;
+    iQ3 = iQ * iQ2;
 
-				GaugeLinkField iQ2(grid), iQ3(grid);
-				LatticeComplex u(grid), w(grid);
-				LatticeComplex f0(grid), f1(grid), f2(grid);
+    set_uw(u, w, iQ2, iQ3);
+    set_fj(f0, f1, f2, u, w);
 
-				iQ2 = iQ * iQ;
-				iQ3 = iQ * iQ2;
+    e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
+  };
 
-				set_uw(u, w, iQ2, iQ3);
-				set_fj(f0, f1, f2, u, w);
+  void set_uw(LatticeComplex& u, LatticeComplex& w, GaugeLinkField& iQ2,
+              GaugeLinkField& iQ3) const {
+    Complex one_over_three = 1.0 / 3.0;
+    Complex one_over_two = 1.0 / 2.0;
 
-				e_iQ = f0*unity + timesMinusI(f1) * iQ - f2 * iQ2;
+    GridBase* grid = u._grid;
+    LatticeComplex c0(grid), c1(grid), tmp(grid), c0max(grid), theta(grid);
 
+    // sign in c0 from the conventions on the Ta
+    c0 = -imag(trace(iQ3)) * one_over_three;  
+    c1 = -real(trace(iQ2)) * one_over_two;
 
-			};
+    // Cayley Hamilton checks to machine precision, tested
+    tmp = c1 * one_over_three;
+    c0max = 2.0 * pow(tmp, 1.5);
 
+    theta = acos(c0 / c0max) *
+            one_over_three;  // divide by three here, now leave as it is
+    u = sqrt(tmp) * cos(theta);
+    w = sqrt(c1) * sin(theta);
+  }
 
-			void set_uw(LatticeComplex& u, LatticeComplex& w,
-				GaugeLinkField& iQ2, GaugeLinkField& iQ3) const{
-				Complex one_over_three = 1.0/3.0;
-				Complex one_over_two = 1.0/2.0;
+  void set_fj(LatticeComplex& f0, LatticeComplex& f1, LatticeComplex& f2,
+              const LatticeComplex& u, const LatticeComplex& w) const {
+    GridBase* grid = u._grid;
+    LatticeComplex xi0(grid), u2(grid), w2(grid), cosw(grid);
+    LatticeComplex fden(grid);
+    LatticeComplex h0(grid), h1(grid), h2(grid);
+    LatticeComplex e2iu(grid), emiu(grid), ixi0(grid), qt(grid);
+    LatticeComplex unity(grid);
+    unity = 1.0;
 
-				GridBase *grid = u._grid;
-				LatticeComplex c0(grid), c1(grid), tmp(grid), c0max(grid), theta(grid);
+    xi0 = func_xi0(w);
+    u2 = u * u;
+    w2 = w * w;
+    cosw = cos(w);
 
-				// sign in c0 from the conventions on the Ta
-				c0    = - real(timesMinusI(trace(iQ3))) * one_over_three; //temporary hack 
-				c1    = - real(trace(iQ2)) * one_over_two;
+    ixi0 = timesI(xi0);
+    emiu = cos(u) - timesI(sin(u));
+    e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
 
-				//Cayley Hamilton checks to machine precision, tested
-				tmp   = c1 * one_over_three;
-				c0max = 2.0 * pow(tmp, 1.5);
+    h0 = e2iu * (u2 - w2) +
+         emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
+    h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
+    h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
 
-				theta = acos(c0/c0max)*one_over_three; // divide by three here, now leave as it is
-				u = sqrt(tmp) * cos( theta );
-				w = sqrt(c1)  * sin( theta );
-			}
+    fden = unity / (9.0 * u2 - w2);  // reals
+    f0 = h0 * fden;
+    f1 = h1 * fden;
+    f2 = h2 * fden;
+  }
 
-			void set_fj(LatticeComplex& f0, LatticeComplex& f1, LatticeComplex& f2,
-				const LatticeComplex& u, const LatticeComplex& w) const{
+  LatticeComplex func_xi0(const LatticeComplex& w) const {
+    // Define a function to do the check
+    // if( w < 1e-4 ) std::cout << GridLogWarning<< "[Smear_stout] w too small:
+    // "<< w <<"\n";
+    return sin(w) / w;
+  }
 
-				GridBase *grid = u._grid;
-				LatticeComplex xi0(grid), u2(grid), w2(grid), cosw(grid);
-				LatticeComplex fden(grid);
-				LatticeComplex h0(grid), h1(grid), h2(grid);
-				LatticeComplex e2iu(grid), emiu(grid), ixi0(grid), qt(grid);
-				LatticeComplex unity(grid);
-				unity = 1.0;
-
-				xi0   = func_xi0(w);
-				u2    = u * u;
-				w2    = w * w;
-				cosw  = cos(w);
-
-				ixi0  = timesI(xi0);
-				emiu  = cos(u)     - timesI(sin(u));
-				e2iu  = cos(2.0*u) + timesI(sin(2.0*u));
-
-				h0    = e2iu * (u2 - w2) + emiu * ( (8.0*u2*cosw) + (2.0*u*(3.0*u2 + w2)*ixi0));
-				h1    = e2iu * (2.0 * u) - emiu * ( (2.0*u*cosw) - (3.0*u2-w2)*ixi0);
-				h2    = e2iu             - emiu * ( cosw + (3.0*u)*ixi0);
-
-				fden   = unity/(9.0*u2 - w2);// reals
-				f0    = h0 * fden;
-				f1    = h1 * fden;
-				f2    = h2 * fden;	
-			}
-
-
-
-
-			LatticeComplex func_xi0(const LatticeComplex& w) const{
-	// Define a function to do the check
-	//if( w < 1e-4 ) std::cout << GridLogWarning<< "[Smear_stout] w too small: "<< w <<"\n";
-				return  sin(w)/w;
-			}
-
-			LatticeComplex func_xi1(const LatticeComplex& w) const{
-	// Define a function to do the check
-	//if( w < 1e-4 ) std::cout << GridLogWarning << "[Smear_stout] w too small: "<< w <<"\n";
-				return  cos(w)/(w*w) - sin(w)/(w*w*w);
-			}
-
-		};
-
-	}
+  LatticeComplex func_xi1(const LatticeComplex& w) const {
+    // Define a function to do the check
+    // if( w < 1e-4 ) std::cout << GridLogWarning << "[Smear_stout] w too small:
+    // "<< w <<"\n";
+    return cos(w) / (w * w) - sin(w) / (w * w * w);
+  }
+};
+}
 }
 
-#endif  
+#endif
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index b9f7169e..0b2b6728 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -1,33 +1,34 @@
-    /*************************************************************************************
+/*************************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
 
-    Source file: ./lib/simd/Grid_vector_types.h
+Source file: ./lib/simd/Grid_vector_types.h
 
-    Copyright (C) 2015
+Copyright (C) 2015
 
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 //---------------------------------------------------------------------------
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
@@ -43,7 +44,7 @@ Author: neo <cossu@post.kek.jp>
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined (AVX1)|| defined (AVX2) || defined (AVXFMA4)
+#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
@@ -61,518 +62,635 @@ Author: neo <cossu@post.kek.jp>
 
 namespace Grid {
 
-  //////////////////////////////////////
-  // To take the floating point type of real/complex type
-  //////////////////////////////////////
-  template <typename T> struct RealPart {
-    typedef T type;
-  };
-  template <typename T> struct RealPart< std::complex<T> >{
-    typedef T type;
-  };
-  
-  //////////////////////////////////////
-  // demote a vector to real type
-  //////////////////////////////////////
+//////////////////////////////////////
+// To take the floating point type of real/complex type
+//////////////////////////////////////
+template <typename T>
+struct RealPart {
+  typedef T type;
+};
+template <typename T>
+struct RealPart<std::complex<T> > {
+  typedef T type;
+};
 
-  // type alias used to simplify the syntax of std::enable_if
-  template <typename T> using Invoke                                  =  typename T::type;
-  template <typename Condition, typename ReturnType> using EnableIf   =  Invoke<std::enable_if<Condition::value, ReturnType> >;
-  template <typename Condition, typename ReturnType> using NotEnableIf=  Invoke<std::enable_if<!Condition::value, ReturnType> >;
+//////////////////////////////////////
+// demote a vector to real type
+//////////////////////////////////////
 
+// type alias used to simplify the syntax of std::enable_if
+template <typename T>
+using Invoke = typename T::type;
+template <typename Condition, typename ReturnType>
+using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType> >;
+template <typename Condition, typename ReturnType>
+using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
 
-  ////////////////////////////////////////////////////////
-  // Check for complexity with type traits
-  template <typename T>   struct is_complex : public std::false_type {};
-  template <> struct is_complex<std::complex<double> >: public std::true_type {};
-  template <> struct is_complex<std::complex<float> > : public std::true_type {};
+////////////////////////////////////////////////////////
+// Check for complexity with type traits
+template <typename T>
+struct is_complex : public std::false_type {};
+template <>
+struct is_complex<std::complex<double> > : public std::true_type {};
+template <>
+struct is_complex<std::complex<float> > : public std::true_type {};
 
-  template <typename T> using IfReal    = Invoke<std::enable_if<std::is_floating_point<T>::value,int> > ;
-  template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value,int> > ;
-  template <typename T> using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value,int> > ;
+template <typename T>
+using IfReal = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
+template <typename T>
+using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
+template <typename T>
+using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
 
-  template <typename T> using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value,int> > ;
-  template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value,int> > ;
-  template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value,int> > ;
+template <typename T>
+using IfNotReal =
+    Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
+template <typename T>
+using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
+template <typename T>
+using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
 
-  ////////////////////////////////////////////////////////
-  // Define the operation templates functors
-  // general forms to allow for vsplat syntax
-  // need explicit declaration of types when used since
-  // clang cannot automatically determine the output type sometimes
-  template < class Out, class Input1, class Input2, class Operation > 
-    Out binary(Input1 src_1, Input2 src_2, Operation op){
-    return op(src_1, src_2);
-  } 
+////////////////////////////////////////////////////////
+// Define the operation templates functors
+// general forms to allow for vsplat syntax
+// need explicit declaration of types when used since
+// clang cannot automatically determine the output type sometimes
+template <class Out, class Input1, class Input2, class Operation>
+Out binary(Input1 src_1, Input2 src_2, Operation op) {
+  return op(src_1, src_2);
+}
 
-  template < class Out, class Input, class Operation > 
-    Out unary(Input src, Operation op){
-    return op(src);
-  } 
-  ///////////////////////////////////////////////
+template <class Out, class Input, class Operation>
+Out unary(Input src, Operation op) {
+  return op(src);
+}
+///////////////////////////////////////////////
 
+/*
+  @brief Grid_simd class for the SIMD vector type operations
+ */
+template <class Scalar_type, class Vector_type>
+class Grid_simd {
+ public:
+  typedef typename RealPart<Scalar_type>::type Real;
+  typedef Vector_type vector_type;
+  typedef Scalar_type scalar_type;
 
-
-  /*
-    @brief Grid_simd class for the SIMD vector type operations
-   */
-  template < class Scalar_type, class Vector_type > 
-    class Grid_simd {
-    
-  public:
-    typedef typename RealPart < Scalar_type >::type Real; 
-    typedef Vector_type     vector_type;
-    typedef Scalar_type     scalar_type;
-
-
-    typedef union conv_t_union {
-	Vector_type v;
-	Scalar_type s[sizeof(Vector_type)/sizeof(Scalar_type)];
-      conv_t_union(){};
-    } conv_t;
-    
-   
+  typedef union conv_t_union {
     Vector_type v;
-    
-    static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type);}
-    
-    Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
-    Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
-    Grid_simd()=default; 
-    Grid_simd(const Grid_simd& rhs) :v(rhs.v){};    //compiles in movaps
-    Grid_simd(const Grid_simd&& rhs):v(rhs.v){};  
+    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
+    conv_t_union(){};
+  } conv_t;
 
-    /////////////////////////////
-    // Constructors
-    /////////////////////////////
-    Grid_simd & operator = ( Zero & z){
-      vzero(*this);
-      return (*this);
-    }
-  
-    //Enable if complex type
-    template < typename S = Scalar_type > 
-    Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){
-      vsplat(*this,a);
-    };
+  Vector_type v;
 
-    Grid_simd(const Real a){
-      vsplat(*this,Scalar_type(a));
-    };
-       
-    ///////////////////////////////////////////////
-    // mac, mult, sub, add, adj
-    ///////////////////////////////////////////////
-
-    // FIXME -- alias this to an inline MAC struct.
-    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-
-
-    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
-    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
-    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
-
-    friend inline void mac (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ a,const Grid_simd   *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-    friend inline void mult(Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd   *__restrict__ r){ *y = (*l) * (*r); }
-    friend inline void sub (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd   *__restrict__ r){ *y = (*l) - (*r); }
-    friend inline void add (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd   *__restrict__ r){ *y = (*l) + (*r); }
-
-    friend inline void mac (Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ a,const Scalar_type *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-    friend inline void mult(Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) * (*r); }
-    friend inline void sub (Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) - (*r); }
-    friend inline void add (Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) + (*r); }
-
-    ////////////////////////////////////////////////////////////////////////
-    // FIXME:  gonna remove these load/store, get, set, prefetch
-    ////////////////////////////////////////////////////////////////////////
-    friend inline void vset(Grid_simd &ret, Scalar_type *a){
-      ret.v = unary<Vector_type>(a, VsetSIMD());
-    }
-        
-    ///////////////////////
-    // Vstore
-    ///////////////////////
-    friend inline void vstore(const Grid_simd &ret, Scalar_type *a){
-      binary<void>(ret.v, (Real*)a, VstoreSIMD());
-    }
-
-    ///////////////////////
-    // Vprefetch
-    ///////////////////////
-    friend inline void vprefetch(const Grid_simd &v)
-    {
-      prefetch_HINT_T0((const char*)&v.v);
-    }
-
-    ///////////////////////
-    // Reduce
-    ///////////////////////
-    friend inline Scalar_type Reduce(const Grid_simd & in)
-    {
-      return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
-    }
-
-    ////////////////////////////
-    // opreator scalar * simd
-    ////////////////////////////
-    friend inline Grid_simd operator * (const Scalar_type &a, Grid_simd b){
-      Grid_simd va;
-      vsplat(va,a);
-      return va*b;
-    }
-    friend inline Grid_simd operator * (Grid_simd b,const Scalar_type &a){
-      return a*b;
-    }
-
-    ///////////////////////
-    // Unary negation
-    ///////////////////////
-    friend inline Grid_simd operator -(const Grid_simd &r) {
-      Grid_simd ret;
-      vzero(ret);
-      ret = ret - r;
-      return ret;
-    }
-    // *=,+=,-= operators
-    inline Grid_simd &operator *=(const Grid_simd &r) {
-      *this = (*this)*r;
-      return *this;
-      // return (*this)*r; ?
-    }
-    inline Grid_simd &operator +=(const Grid_simd &r) {
-      *this = *this+r;
-      return *this;
-    }
-    inline Grid_simd &operator -=(const Grid_simd &r) {
-      *this = *this-r;
-      return *this;
-    }
-
-    ///////////////////////////////////////
-    // Not all functions are supported
-    // through SIMD and must breakout to 
-    // scalar type and back again. This
-    // provides support
-    ///////////////////////////////////////
-
-    template<class functor> friend inline Grid_simd SimdApply (const functor &func,const Grid_simd &v) {
-      Grid_simd ret;
-      Grid_simd::conv_t conv;
-
-      conv.v = v.v;
-      for(int i=0;i<Nsimd();i++){
-	conv.s[i]=func(conv.s[i]);
-      }
-      ret.v = conv.v;
-      return ret;
-    }
-    template<class functor> friend inline Grid_simd SimdApplyBinop (const functor &func,const Grid_simd &x,const Grid_simd &y) {
-      Grid_simd ret;
-      Grid_simd::conv_t cx;
-      Grid_simd::conv_t cy;
-
-      cx.v = x.v;
-      cy.v = y.v;
-      for(int i=0;i<Nsimd();i++){
-	cx.s[i]=func(cx.s[i],cy.s[i]);
-      }
-      ret.v = cx.v;
-      return ret;
-    }
-
-    ////////////////////////////////////////////////////////////////////
-    // General permute; assumes vector length is same across 
-    // all subtypes; may not be a good assumption, but could
-    // add the vector width as a template param for BG/Q for example
-    ////////////////////////////////////////////////////////////////////
-    friend inline void permute0(Grid_simd &y,Grid_simd b){
-      y.v = Optimization::Permute::Permute0(b.v);
-    }
-    friend inline void permute1(Grid_simd &y,Grid_simd b){
-      y.v = Optimization::Permute::Permute1(b.v);
-    }
-    friend inline void permute2(Grid_simd &y,Grid_simd b){
-      y.v = Optimization::Permute::Permute2(b.v);
-    }
-    friend inline void permute3(Grid_simd &y,Grid_simd b){
-      y.v = Optimization::Permute::Permute3(b.v);
-    }
-    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
-    {
-      if ( perm & RotateBit ) {
-	int dist = perm&0xF;
-        y=rotate(b,dist);
-	return;
-      }
-      switch(perm){
-      case 3: permute3(y,b); break;
-      case 2: permute2(y,b); break;
-      case 1: permute1(y,b); break;
-      case 0: permute0(y,b); break;
-      default: assert(0);
-      }
-    }
-    
-  };// end of Grid_simd class definition 
-
-  ////////////////////////////////////////////////////////////////////
-  // General rotate
-  ////////////////////////////////////////////////////////////////////
-  template <class S, class V, IfNotComplex<S> =0> 
-  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
-  {
-    nrot = nrot % Grid_simd<S,V>::Nsimd();
-    Grid_simd<S,V> ret;
-    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
-    ret.v = Optimization::Rotate::rotate(b.v,nrot);
-    return ret;
-  }
-  template <class S, class V, IfComplex<S> =0> 
-  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
-  {
-    nrot = nrot % Grid_simd<S,V>::Nsimd();
-    Grid_simd<S,V> ret;
-    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
-    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
-    return ret;
+  static inline int Nsimd(void) {
+    return sizeof(Vector_type) / sizeof(Scalar_type);
   }
 
-  ///////////////////////
-  // Splat
-  ///////////////////////
-  
-  // this is only for the complex version
-  template <class S, class V, IfComplex<S> =0, class ABtype> 
-  inline void vsplat(Grid_simd<S,V> &ret,ABtype a, ABtype b){
-    ret.v = binary<V>(a, b, VsplatSIMD());
-  }    
+  Grid_simd &operator=(const Grid_simd &&rhs) {
+    v = rhs.v;
+    return *this;
+  };
+  Grid_simd &operator=(const Grid_simd &rhs) {
+    v = rhs.v;
+    return *this;
+  };  // faster than not declaring it and leaving to the compiler
+  Grid_simd() = default;
+  Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
+  Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
 
-  // overload if complex
-  template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
-    vsplat(ret,real(c),imag(c));
+  /////////////////////////////
+  // Constructors
+  /////////////////////////////
+  Grid_simd &operator=(Zero &z) {
+    vzero(*this);
+    return (*this);
   }
 
-  //if real fill with a, if complex fill with a in the real part (first function above)
-  template <class S,class V>
-    inline void vsplat(Grid_simd<S,V> &ret,NotEnableIf<is_complex< S>,S> a){
-    ret.v = unary<V>(a, VsplatSIMD());
-  }    
-  //////////////////////////
+  // Enable if complex type
+  template <typename S = Scalar_type>
+  Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
+    vsplat(*this, a);
+  };
+
+  Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
 
   ///////////////////////////////////////////////
-  // Initialise to 1,0,i for the correct types
+  // mac, mult, sub, add, adj
   ///////////////////////////////////////////////
-  // For complex types
-  template <class S,class V, IfComplex<S> = 0 > inline void vone(Grid_simd<S,V>  &ret)     { vsplat(ret,S(1.0,0.0)); }
-  template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)     { vsplat(ret,S(0.0,0.0)); }// use xor?
-  template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} 
 
-  template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));} 
-  template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));} 
-
-  // if not complex overload here 
-  template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
-  template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
-   
-  // For integral types
-  template <class S,class V,IfInteger<S> = 0 > inline void vone(Grid_simd<S,V> &ret)  {vsplat(ret,1); }
-  template <class S,class V,IfInteger<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) {vsplat(ret,0); }
-  template <class S,class V,IfInteger<S> = 0 > inline void vtrue (Grid_simd<S,V> &ret){vsplat(ret,0xFFFFFFFF);}
-  template <class S,class V,IfInteger<S> = 0 > inline void vfalse(Grid_simd<S,V> &ret){vsplat(ret,0);}
-
-  template<class S,class V> inline void zeroit(Grid_simd<S,V> &z){ vzero(z);}
-
-  ///////////////////////
-  // Vstream
-  ///////////////////////
-  template <class S,class V, IfReal<S> = 0 > 
-  inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
-    binary<void>((S *)&out.v, in.v, VstreamSIMD());
-  }
-  template <class S,class V, IfComplex<S> = 0 > 
-  inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
-    typedef typename S::value_type T;
-    binary<void>((T *)&out.v, in.v, VstreamSIMD());
-  }
-
-  template <class S,class V, IfInteger<S> = 0 > 
-  inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
-    out=in;
-  }
-
-  ////////////////////////////////////
-  // Arithmetic operator overloads +,-,*
-  ////////////////////////////////////
-  template<class S,class V> inline Grid_simd<S,V> operator + (Grid_simd<S,V> a, Grid_simd<S,V> b) {
-    Grid_simd<S,V> ret;
-    ret.v = binary<V>(a.v, b.v, SumSIMD());
-    return ret;
-  };
-        
-  template<class S,class V> inline Grid_simd<S,V> operator - (Grid_simd<S,V> a, Grid_simd<S,V> b) {
-    Grid_simd<S,V> ret;
-    ret.v = binary<V>(a.v, b.v, SubSIMD());
-    return ret;
-  };
-        
-  // Distinguish between complex types and others
-  template<class S,class V, IfComplex<S> = 0 > inline Grid_simd<S,V> operator * (Grid_simd<S,V> a, Grid_simd<S,V> b) {
-    Grid_simd<S,V> ret;
-    ret.v = binary<V>(a.v,b.v, MultComplexSIMD());
-    return ret;
+  // FIXME -- alias this to an inline MAC struct.
+  friend inline void mac(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ a,
+                         const Grid_simd *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
   };
 
-    // Real/Integer types
-  template<class S,class V, IfNotComplex<S> = 0 > inline Grid_simd<S,V> operator * (Grid_simd<S,V> a, Grid_simd<S,V> b) {
-    Grid_simd<S,V> ret;
-    ret.v = binary<V>(a.v,b.v, MultSIMD());
-    return ret;
+  friend inline void mult(Grid_simd *__restrict__ y,
+                          const Grid_simd *__restrict__ l,
+                          const Grid_simd *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+  friend inline void sub(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend inline void add(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  friend inline void mac(Grid_simd *__restrict__ y,
+                         const Scalar_type *__restrict__ a,
+                         const Grid_simd *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
   };
-  
-
-    ///////////////////////
-    // Conjugate
-    ///////////////////////
-  template <class S,class V, IfComplex<S> = 0 > 
-    inline Grid_simd<S,V> conjugate(const Grid_simd<S,V>  &in){
-    Grid_simd<S,V>  ret ; 
-    ret.v = unary<V>(in.v, ConjSIMD());
-    return ret;
+  friend inline void mult(Grid_simd *__restrict__ y,
+                          const Scalar_type *__restrict__ l,
+                          const Grid_simd *__restrict__ r) {
+    *y = (*l) * (*r);
   }
-  template <class S,class V, IfNotComplex<S> = 0 > inline Grid_simd<S,V> conjugate(const Grid_simd<S,V>  &in){
-    return in; // for real objects
+  friend inline void sub(Grid_simd *__restrict__ y,
+                         const Scalar_type *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend inline void add(Grid_simd *__restrict__ y,
+                         const Scalar_type *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  friend inline void mac(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ a,
+                         const Scalar_type *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+  friend inline void mult(Grid_simd *__restrict__ y,
+                          const Grid_simd *__restrict__ l,
+                          const Scalar_type *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+  friend inline void sub(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Scalar_type *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend inline void add(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Scalar_type *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // FIXME:  gonna remove these load/store, get, set, prefetch
+  ////////////////////////////////////////////////////////////////////////
+  friend inline void vset(Grid_simd &ret, Scalar_type *a) {
+    ret.v = unary<Vector_type>(a, VsetSIMD());
   }
 
-  //Suppress adj for integer types... // odd; why conjugate above but not adj??
-  template < class S, class V, IfNotInteger<S> = 0 > 
-    inline Grid_simd<S,V> adj(const Grid_simd<S,V> &in){ return conjugate(in); }
-  
   ///////////////////////
-  // timesMinusI
+  // Vstore
   ///////////////////////
-  template<class S,class V,IfComplex<S> = 0 > 
-    inline void timesMinusI( Grid_simd<S,V> &ret,const Grid_simd<S,V> &in){
-    ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
+  friend inline void vstore(const Grid_simd &ret, Scalar_type *a) {
+    binary<void>(ret.v, (Real *)a, VstoreSIMD());
   }
 
-  template<class S,class V,IfComplex<S> = 0 > 
-    inline Grid_simd<S,V> timesMinusI(const Grid_simd<S,V> &in){
-    Grid_simd<S,V> ret; 
-    timesMinusI(ret,in);
+  ///////////////////////
+  // Vprefetch
+  ///////////////////////
+  friend inline void vprefetch(const Grid_simd &v) {
+    prefetch_HINT_T0((const char *)&v.v);
+  }
+
+  ///////////////////////
+  // Reduce
+  ///////////////////////
+  friend inline Scalar_type Reduce(const Grid_simd &in) {
+    return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
+  }
+
+  ////////////////////////////
+  // opreator scalar * simd
+  ////////////////////////////
+  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
+    Grid_simd va;
+    vsplat(va, a);
+    return va * b;
+  }
+  friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
+    return a * b;
+  }
+
+  ///////////////////////
+  // Unary negation
+  ///////////////////////
+  friend inline Grid_simd operator-(const Grid_simd &r) {
+    Grid_simd ret;
+    vzero(ret);
+    ret = ret - r;
     return ret;
   }
-
-  template<class S,class V,IfNotComplex<S> = 0 > 
-    inline Grid_simd<S,V> timesMinusI(const Grid_simd<S,V> &in){
-    return in;
+  // *=,+=,-= operators
+  inline Grid_simd &operator*=(const Grid_simd &r) {
+    *this = (*this) * r;
+    return *this;
+    // return (*this)*r; ?
+  }
+  inline Grid_simd &operator+=(const Grid_simd &r) {
+    *this = *this + r;
+    return *this;
+  }
+  inline Grid_simd &operator-=(const Grid_simd &r) {
+    *this = *this - r;
+    return *this;
   }
 
-    ///////////////////////
-    // timesI
-    ///////////////////////
-  template<class S,class V,IfComplex<S> = 0 > 
-    inline void timesI(Grid_simd<S,V> &ret,const Grid_simd<S,V> &in){
-    ret.v =   binary<V>(in.v, ret.v, TimesISIMD());     
-  }
-        
-  template<class S,class V,IfComplex<S> = 0 > 
-    inline Grid_simd<S,V> timesI(const Grid_simd<S,V> &in){
-    Grid_simd<S,V> ret; 
-    timesI(ret,in);
-    return ret;
-  }
+  ///////////////////////////////////////
+  // Not all functions are supported
+  // through SIMD and must breakout to
+  // scalar type and back again. This
+  // provides support
+  ///////////////////////////////////////
 
-  template<class S,class V,IfNotComplex<S> = 0 > 
-    inline Grid_simd<S,V> timesI(const Grid_simd<S,V> &in){
-    return in;
-  }
+  template <class functor>
+  friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
+    Grid_simd ret;
+    Grid_simd::conv_t conv;
 
-  /////////////////////
-  // Inner, outer
-  /////////////////////
-
-  template<class S, class V > 
-    inline Grid_simd< S, V>  innerProduct(const Grid_simd< S, V> & l, const Grid_simd< S, V> & r) 
-  {
-    return conjugate(l)*r; 
-  }
-
-  template<class S, class V >
-  inline Grid_simd< S, V> outerProduct(const Grid_simd< S, V> &l, const Grid_simd< S, V> & r)
-  {
-    return l*conjugate(r);
-  }
-
-  template<class S, class V >
-  inline Grid_simd< S, V> trace(const Grid_simd< S, V> &arg){
-    return arg;
-  }
-
-
-  ////////////////////////////////////////////////////////////
-  // copy/splat complex real parts into real;
-  // insert real into complex and zero imag;
-  ////////////////////////////////////////////////////////////
-
-  //real = toReal( complex )
-  template<class S,class V,IfReal<S>  = 0>	
-  inline Grid_simd<S,V> toReal(const Grid_simd<std::complex<S>,V> &in)
-  {
-    typedef Grid_simd<S,V> simd;
-    simd ret;
-    typename simd::conv_t conv;
-    conv.v = in.v; // copy the vector content (bytewise)
-    for(int i=0;i<simd::Nsimd();i+=2){
-      conv.s[i+1]=conv.s[i];    // duplicate (r,r);(r,r);(r,r); etc...
+    conv.v = v.v;
+    for (int i = 0; i < Nsimd(); i++) {
+      conv.s[i] = func(conv.s[i]);
     }
     ret.v = conv.v;
     return ret;
   }
-  
-  //complex = toComplex( real )
-  template<class R,class V,IfReal<R> = 0 >	// must be a real arg
-  inline Grid_simd<std::complex<R>,V> toComplex (const Grid_simd<R,V> &in)
-  {
-    typedef Grid_simd<R,V> Rsimd;
-    typedef Grid_simd<std::complex<R>,V> Csimd;
-    typename Rsimd::conv_t conv;// address as real
-    
-    conv.v = in.v;
-    for(int i=0;i<Rsimd::Nsimd();i+=2){
-      assert(conv.s[i+1]==conv.s[i]); // trap any cases where real was not duplicated 
-      // indicating the SIMD grids of real and imag assignment did not correctly match
-      conv.s[i+1]=0.0;                // zero imaginary parts
+  template <class functor>
+  friend inline Grid_simd SimdApplyBinop(const functor &func,
+                                         const Grid_simd &x,
+                                         const Grid_simd &y) {
+    Grid_simd ret;
+    Grid_simd::conv_t cx;
+    Grid_simd::conv_t cy;
+
+    cx.v = x.v;
+    cy.v = y.v;
+    for (int i = 0; i < Nsimd(); i++) {
+      cx.s[i] = func(cx.s[i], cy.s[i]);
     }
-    Csimd ret;
-    ret.v = conv.v;
+    ret.v = cx.v;
     return ret;
   }
 
+  ////////////////////////////////////////////////////////////////////
+  // General permute; assumes vector length is same across
+  // all subtypes; may not be a good assumption, but could
+  // add the vector width as a template param for BG/Q for example
+  ////////////////////////////////////////////////////////////////////
+  friend inline void permute0(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute0(b.v);
+  }
+  friend inline void permute1(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute1(b.v);
+  }
+  friend inline void permute2(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute2(b.v);
+  }
+  friend inline void permute3(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute3(b.v);
+  }
+  friend inline void permute(Grid_simd &y, Grid_simd b, int perm) {
+    if (perm & RotateBit) {
+      int dist = perm & 0xF;
+      y = rotate(b, dist);
+      return;
+    }
+    switch (perm) {
+      case 3:
+        permute3(y, b);
+        break;
+      case 2:
+        permute2(y, b);
+        break;
+      case 1:
+        permute1(y, b);
+        break;
+      case 0:
+        permute0(y, b);
+        break;
+      default:
+        assert(0);
+    }
+  }
 
+};  // end of Grid_simd class definition
 
-  
-  ///////////////////////////////
-  // Define available types
-  ///////////////////////////////
-  typedef Grid_simd< float                 , SIMD_Ftype > vRealF;
-  typedef Grid_simd< double                , SIMD_Dtype > vRealD;
-  typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
-  typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
-  typedef Grid_simd< Integer               , SIMD_Itype > vInteger;
+////////////////////////////////////////////////////////////////////
+// General rotate
+////////////////////////////////////////////////////////////////////
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
+  nrot = nrot % Grid_simd<S, V>::Nsimd();
+  Grid_simd<S, V> ret;
+  //    std::cout << "Rotate Real by "<<nrot<<std::endl;
+  ret.v = Optimization::Rotate::rotate(b.v, nrot);
+  return ret;
+}
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
+  nrot = nrot % Grid_simd<S, V>::Nsimd();
+  Grid_simd<S, V> ret;
+  //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
+  ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
+  return ret;
+}
 
-  /////////////////////////////////////////
-  // Some traits to recognise the types
-  /////////////////////////////////////////
-  template <typename T> struct is_simd : public std::false_type{};
-  template <> struct is_simd<vRealF>   : public std::true_type {};
-  template <> struct is_simd<vRealD>   : public std::true_type {};
-  template <> struct is_simd<vComplexF>: public std::true_type {};
-  template <> struct is_simd<vComplexD>: public std::true_type {};
-  template <> struct is_simd<vInteger> : public std::true_type {};
+///////////////////////
+// Splat
+///////////////////////
 
-  template <typename T> using IfSimd     = Invoke<std::enable_if< is_simd<T>::value,int> > ;
-  template <typename T> using IfNotSimd  = Invoke<std::enable_if<!is_simd<T>::value,unsigned> > ;
+// this is only for the complex version
+template <class S, class V, IfComplex<S> = 0, class ABtype>
+inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
+  ret.v = binary<V>(a, b, VsplatSIMD());
+}
 
+// overload if complex
+template <class S, class V>
+inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), imag(c));
+}
+
+// if real fill with a, if complex fill with a in the real part (first function
+// above)
+template <class S, class V>
+inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>, S> a) {
+  ret.v = unary<V>(a, VsplatSIMD());
+}
+//////////////////////////
+
+///////////////////////////////////////////////
+// Initialise to 1,0,i for the correct types
+///////////////////////////////////////////////
+// For complex types
+template <class S, class V, IfComplex<S> = 0>
+inline void vone(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(1.0, 0.0));
+}
+template <class S, class V, IfComplex<S> = 0>
+inline void vzero(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(0.0, 0.0));
+}  // use xor?
+template <class S, class V, IfComplex<S> = 0>
+inline void vcomplex_i(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(0.0, 1.0));
+}
+
+template <class S, class V, IfComplex<S> = 0>
+inline void visign(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(1.0, -1.0));
+}
+template <class S, class V, IfComplex<S> = 0>
+inline void vrsign(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(-1.0, 1.0));
+}
+
+// if not complex overload here
+template <class S, class V, IfReal<S> = 0>
+inline void vone(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(1.0));
+}
+template <class S, class V, IfReal<S> = 0>
+inline void vzero(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(0.0));
+}
+
+// For integral types
+template <class S, class V, IfInteger<S> = 0>
+inline void vone(Grid_simd<S, V> &ret) {
+  vsplat(ret, 1);
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vzero(Grid_simd<S, V> &ret) {
+  vsplat(ret, 0);
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vtrue(Grid_simd<S, V> &ret) {
+  vsplat(ret, 0xFFFFFFFF);
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vfalse(Grid_simd<S, V> &ret) {
+  vsplat(ret, 0);
+}
+
+template <class S, class V>
+inline void zeroit(Grid_simd<S, V> &z) {
+  vzero(z);
+}
+
+///////////////////////
+// Vstream
+///////////////////////
+template <class S, class V, IfReal<S> = 0>
+inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
+  binary<void>((S *)&out.v, in.v, VstreamSIMD());
+}
+template <class S, class V, IfComplex<S> = 0>
+inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
+  typedef typename S::value_type T;
+  binary<void>((T *)&out.v, in.v, VstreamSIMD());
+}
+
+template <class S, class V, IfInteger<S> = 0>
+inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
+  out = in;
+}
+
+////////////////////////////////////
+// Arithmetic operator overloads +,-,*
+////////////////////////////////////
+template <class S, class V>
+inline Grid_simd<S, V> operator+(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, SumSIMD());
+  return ret;
+};
+
+template <class S, class V>
+inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, SubSIMD());
+  return ret;
+};
+
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultComplexSIMD());
+  return ret;
+};
+
+// Real/Integer types
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultSIMD());
+  return ret;
+};
+
+///////////////////////
+// Conjugate
+///////////////////////
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
+  Grid_simd<S, V> ret;
+  ret.v = unary<V>(in.v, ConjSIMD());
+  return ret;
+}
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
+  return in;  // for real objects
+}
+
+// Suppress adj for integer types... // odd; why conjugate above but not adj??
+template <class S, class V, IfNotInteger<S> = 0>
+inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
+  return conjugate(in);
+}
+
+///////////////////////
+// timesMinusI
+///////////////////////
+template <class S, class V, IfComplex<S> = 0>
+inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
+  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
+}
+
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
+  Grid_simd<S, V> ret;
+  timesMinusI(ret, in);
+  return ret;
+}
+
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
+  return in;
+}
+
+///////////////////////
+// timesI
+///////////////////////
+template <class S, class V, IfComplex<S> = 0>
+inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
+  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
+}
+
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
+  Grid_simd<S, V> ret;
+  timesI(ret, in);
+  return ret;
+}
+
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
+  return in;
+}
+
+/////////////////////
+// Inner, outer
+/////////////////////
+
+template <class S, class V>
+inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
+                                    const Grid_simd<S, V> &r) {
+  return conjugate(l) * r;
+}
+
+template <class S, class V>
+inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
+                                    const Grid_simd<S, V> &r) {
+  return l * conjugate(r);
+}
+
+template <class S, class V>
+inline Grid_simd<S, V> trace(const Grid_simd<S, V> &arg) {
+  return arg;
+}
+
+////////////////////////////////////////////////////////////
+// copy/splat complex real parts into real;
+// insert real into complex and zero imag;
+////////////////////////////////////////////////////////////
+
+// real = toReal( complex )
+template <class S, class V, IfReal<S> = 0>
+inline Grid_simd<S, V> toReal(const Grid_simd<std::complex<S>, V> &in) {
+  typedef Grid_simd<S, V> simd;
+  simd ret;
+  typename simd::conv_t conv;
+  conv.v = in.v;  // copy the vector content (bytewise)
+  for (int i = 0; i < simd::Nsimd(); i += 2) {
+    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
+  }
+  ret.v = conv.v;
+  return ret;
+}
+
+// complex = toComplex( real )
+template <class R, class V, IfReal<R> = 0>  // must be a real arg
+inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
+  typedef Grid_simd<R, V> Rsimd;
+  typedef Grid_simd<std::complex<R>, V> Csimd;
+  typename Rsimd::conv_t conv;  // address as real
+
+  conv.v = in.v;
+  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
+    assert(conv.s[i + 1] ==
+           conv.s[i]);  // trap any cases where real was not duplicated
+    // indicating the SIMD grids of real and imag assignment did not correctly
+    // match
+    conv.s[i + 1] = 0.0;  // zero imaginary parts
+  }
+  Csimd ret;
+  ret.v = conv.v;
+  return ret;
+}
+
+///////////////////////////////
+// Define available types
+///////////////////////////////
+typedef Grid_simd<float, SIMD_Ftype> vRealF;
+typedef Grid_simd<double, SIMD_Dtype> vRealD;
+typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
+typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
+typedef Grid_simd<Integer, SIMD_Itype> vInteger;
+
+/////////////////////////////////////////
+// Some traits to recognise the types
+/////////////////////////////////////////
+template <typename T>
+struct is_simd : public std::false_type {};
+template <>
+struct is_simd<vRealF> : public std::true_type {};
+template <>
+struct is_simd<vRealD> : public std::true_type {};
+template <>
+struct is_simd<vComplexF> : public std::true_type {};
+template <>
+struct is_simd<vComplexD> : public std::true_type {};
+template <>
+struct is_simd<vInteger> : public std::true_type {};
+
+template <typename T>
+using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
+template <typename T>
+using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
 }
 
 #endif
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index 2ce8590d..a67f4e7d 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -1,247 +1,234 @@
-    /*************************************************************************************
+/*************************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
 
-    Source file: ./lib/simd/Grid_vector_unops.h
+Source file: ./lib/simd/Grid_vector_unops.h
 
-    Copyright (C) 2015
+Copyright (C) 2015
 
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_VECTOR_UNOPS
 #define GRID_VECTOR_UNOPS
 
 #include <cmath>
 
-namespace Grid { 
+namespace Grid {
 
-  template<class scalar> struct SqrtRealFunctor {
-    scalar operator()(const scalar &a) const {
-      return sqrt(real(a));
-    }
-  };
+template <class scalar>
+struct SqrtRealFunctor {
+  scalar operator()(const scalar &a) const { return sqrt(real(a)); }
+};
 
-  template<class scalar> struct RSqrtRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return scalar(1.0/sqrt(real(a)));
-    }
-  };
+template <class scalar>
+struct RSqrtRealFunctor {
+  scalar operator()(const scalar &a) const {
+    return scalar(1.0 / sqrt(real(a)));
+  }
+};
 
-  template<class scalar> struct CosRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return cos(real(a));
-    }
-  };
+template <class scalar>
+struct CosRealFunctor {
+  scalar operator()(const scalar &a) const { return cos(real(a)); }
+};
 
-  template<class scalar> struct SinRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return sin(real(a));
-    }
-  };
+template <class scalar>
+struct SinRealFunctor {
+  scalar operator()(const scalar &a) const { return sin(real(a)); }
+};
 
-  template<class scalar> struct AcosRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return acos(real(a));
-    }
-  };
+template <class scalar>
+struct AcosRealFunctor {
+  scalar operator()(const scalar &a) const { return acos(real(a)); }
+};
 
-  template<class scalar> struct AsinRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return asin(real(a));
-    }
-  };
+template <class scalar>
+struct AsinRealFunctor {
+  scalar operator()(const scalar &a) const { return asin(real(a)); }
+};
 
-  template<class scalar> struct LogRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return log(real(a));
-    }
-  };
+template <class scalar>
+struct LogRealFunctor {
+  scalar operator()(const scalar &a) const { return log(real(a)); }
+};
 
-  template<class scalar> struct ExpRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return exp(real(a));
-    }
-  };
-  template<class scalar> struct NotFunctor {
-    scalar operator()(const scalar &a)  const {
-      return (!a);
-    }
-  };
-  template<class scalar> struct AbsRealFunctor {
-    scalar operator()(const scalar &a)  const {
-      return std::abs(real(a));
-    }
-  };
+template <class scalar>
+struct ExpRealFunctor {
+  scalar operator()(const scalar &a) const { return exp(real(a)); }
+};
+template <class scalar>
+struct NotFunctor {
+  scalar operator()(const scalar &a) const { return (!a); }
+};
+template <class scalar>
+struct AbsRealFunctor {
+  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
+};
 
-  template<class scalar> struct PowRealFunctor {
-    double y;
-  PowRealFunctor(double _y) : y(_y) {};
-    scalar operator()(const scalar &a)  const {
-      return pow(real(a),y);
-    }
-  };
+template <class scalar>
+struct PowRealFunctor {
+  double y;
+  PowRealFunctor(double _y) : y(_y){};
+  scalar operator()(const scalar &a) const { return pow(real(a), y); }
+};
 
-  template<class scalar> struct ModIntFunctor {
-    Integer y;
-  ModIntFunctor(Integer _y) : y(_y) {};
-    scalar operator()(const scalar &a)  const {
-      return Integer(a)%y;
-    }
-  };
+template <class scalar>
+struct ModIntFunctor {
+  Integer y;
+  ModIntFunctor(Integer _y) : y(_y){};
+  scalar operator()(const scalar &a) const { return Integer(a) % y; }
+};
 
-  template<class scalar> struct DivIntFunctor {
-    Integer y;
-  DivIntFunctor(Integer _y) : y(_y) {};
-    scalar operator()(const scalar &a)  const {
-      return Integer(a)/y;
-    }
-  };
+template <class scalar>
+struct DivIntFunctor {
+  Integer y;
+  DivIntFunctor(Integer _y) : y(_y){};
+  scalar operator()(const scalar &a) const { return Integer(a) / y; }
+};
 
-  template<class scalar> struct RealFunctor {
-    scalar operator()(const std::complex<scalar> &a)  const {
-      return real(a);
-    }
-  };
-  template<class scalar> struct ImagFunctor {
-    scalar operator()(const std::complex<scalar> &a)  const {
-      return imag(a);
-    }
-  };
-  template < class S, class V > 
-    inline Grid_simd<S,V> real(const Grid_simd<S,V> &r) {
-    return SimdApply(RealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-    inline Grid_simd<S,V> imag(const Grid_simd<S,V> &r) {
-    return SimdApply(ImagFunctor<S>(),r);
-  }
+template <class scalar>
+struct RealFunctor {
+  scalar operator()(const scalar &a) const { return std::real(a); }
+};
+template <class scalar>
+struct ImagFunctor {
+  scalar operator()(const scalar &a) const { return std::imag(a); }
+};
+template <class S, class V>
+inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
+  return SimdApply(RealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> imag(const Grid_simd<S, V> &r) {
+  return SimdApply(ImagFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
+  return SimdApply(SqrtRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
+  return SimdApply(RSqrtRealFunctor<S>(), r);
+}
+template <class Scalar>
+inline Scalar rsqrt(const Scalar &r) {
+  return (RSqrtRealFunctor<Scalar>(), r);
+}
 
-  template < class S, class V > 
-  inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
-    return SimdApply(SqrtRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> rsqrt(const Grid_simd<S,V> &r) {
-    return SimdApply(RSqrtRealFunctor<S>(),r);
-  }
-  template < class Scalar > 
-  inline Scalar rsqrt(const Scalar &r) {
-    return (RSqrtRealFunctor<Scalar>(),r);
-  }
-
-  template < class S, class V > 
-  inline Grid_simd<S,V> cos(const Grid_simd<S,V> &r) {
-    return SimdApply(CosRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> sin(const Grid_simd<S,V> &r) {
-    return SimdApply(SinRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> acos(const Grid_simd<S,V> &r) {
-    return SimdApply(AcosRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> asin(const Grid_simd<S,V> &r) {
-    return SimdApply(AsinRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
-    return SimdApply(LogRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> abs(const Grid_simd<S,V> &r) {
-    return SimdApply(AbsRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> exp(const Grid_simd<S,V> &r) {
-    return SimdApply(ExpRealFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> Not(const Grid_simd<S,V> &r) {
-    return SimdApply(NotFunctor<S>(),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
-    return SimdApply(PowRealFunctor<S>(y),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
-    return SimdApply(ModIntFunctor<S>(y),r);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) {
-    return SimdApply(DivIntFunctor<S>(y),r);
-  }
-  ////////////////////////////////////////////////////////////////////////////
-  // Allows us to assign into **conformable** real vectors from complex
-  ////////////////////////////////////////////////////////////////////////////
-  //  template < class S, class V > 
-  //  inline auto ComplexRemove(const Grid_simd<S,V> &c) -> Grid_simd<Grid_simd<S,V>::Real,V> {
-  //    Grid_simd<Grid_simd<S,V>::Real,V> ret;
-  //    ret.v = c.v;
-  //    return ret;
-  //  }
-  template<class scalar> struct AndFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x & y;
-    }
-  };
-  template<class scalar> struct OrFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x | y;
-    }
-  };
-  template<class scalar> struct AndAndFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x && y;
-    }
-  };
-  template<class scalar> struct OrOrFunctor {
-    scalar operator()(const scalar &x, const scalar &y)  const {
-      return x || y;
-    }
-  };
-
-  ////////////////////////////////
-  // Calls to simd binop functors
-  ////////////////////////////////
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator &(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(AndFunctor<S>(),x,y);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator &&(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(AndAndFunctor<S>(),x,y);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator |(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(OrFunctor<S>(),x,y);
-  }
-  template < class S, class V > 
-  inline Grid_simd<S,V> operator ||(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
-    return SimdApplyBinop(OrOrFunctor<S>(),x,y);
-  }
+template <class S, class V>
+inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
+  return SimdApply(CosRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> sin(const Grid_simd<S, V> &r) {
+  return SimdApply(SinRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> acos(const Grid_simd<S, V> &r) {
+  return SimdApply(AcosRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> asin(const Grid_simd<S, V> &r) {
+  return SimdApply(AsinRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> log(const Grid_simd<S, V> &r) {
+  return SimdApply(LogRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
+  return SimdApply(AbsRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
+  return SimdApply(ExpRealFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
+  return SimdApply(NotFunctor<S>(), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> pow(const Grid_simd<S, V> &r, double y) {
+  return SimdApply(PowRealFunctor<S>(y), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> mod(const Grid_simd<S, V> &r, Integer y) {
+  return SimdApply(ModIntFunctor<S>(y), r);
+}
+template <class S, class V>
+inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
+  return SimdApply(DivIntFunctor<S>(y), r);
+}
+////////////////////////////////////////////////////////////////////////////
+// Allows us to assign into **conformable** real vectors from complex
+////////////////////////////////////////////////////////////////////////////
+//  template < class S, class V >
+//  inline auto ComplexRemove(const Grid_simd<S,V> &c) ->
+//  Grid_simd<Grid_simd<S,V>::Real,V> {
+//    Grid_simd<Grid_simd<S,V>::Real,V> ret;
+//    ret.v = c.v;
+//    return ret;
+//  }
+template <class scalar>
+struct AndFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
+};
+template <class scalar>
+struct OrFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x | y; }
+};
+template <class scalar>
+struct AndAndFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x && y; }
+};
+template <class scalar>
+struct OrOrFunctor {
+  scalar operator()(const scalar &x, const scalar &y) const { return x || y; }
+};
 
+////////////////////////////////
+// Calls to simd binop functors
+////////////////////////////////
+template <class S, class V>
+inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
+                                 const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(AndFunctor<S>(), x, y);
+}
+template <class S, class V>
+inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
+                                  const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(AndAndFunctor<S>(), x, y);
+}
+template <class S, class V>
+inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
+                                 const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(OrFunctor<S>(), x, y);
+}
+template <class S, class V>
+inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
+                                  const Grid_simd<S, V> &y) {
+  return SimdApplyBinop(OrOrFunctor<S>(), x, y);
+}
 }
 #endif
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 2255c598..e5957db7 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -1,31 +1,32 @@
-    /*************************************************************************************
+/*************************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid
 
-    Source file: ./tests/Test_simd.cc
+Source file: ./tests/Test_simd.cc
 
-    Copyright (C) 2015
+Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid.h>
 
 using namespace std;
@@ -62,6 +63,18 @@ public:
   template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = adj(i1);}
   std::string name(void) const { return std::string("Adj"); }
 };
+class funcImag {
+public:
+  funcImag() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = imag(i1);}
+  std::string name(void) const { return std::string("imag"); }
+};
+class funcReal {
+public:
+  funcReal() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = real(i1);}
+  std::string name(void) const { return std::string("real"); }
+};
 
 class funcTimesI {
 public:
@@ -141,7 +154,13 @@ void Tester(const functor &func)
   }
 
   extract<vec,scal>(v_result,result);
-  std::cout<<GridLogMessage << " " << func.name()<<std::endl;
+
+  std::cout << GridLogMessage << " " << func.name() << std::endl;
+
+  std::cout << GridLogDebug << v_input1 << std::endl;
+  std::cout << GridLogDebug << v_result << std::endl;
+
+
 
   int ok=0;
   for(int i=0;i<Nsimd;i++){
@@ -389,6 +408,8 @@ int main (int argc, char ** argv)
   Tester<ComplexF,vComplexF>(funcTimes());
   Tester<ComplexF,vComplexF>(funcConj());
   Tester<ComplexF,vComplexF>(funcAdj());
+  Tester<ComplexF,vComplexF>(funcReal());
+  Tester<ComplexF,vComplexF>(funcImag());
   Tester<ComplexF,vComplexF>(funcInnerProduct());
   ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());
 
@@ -421,17 +442,21 @@ int main (int argc, char ** argv)
   Tester<ComplexD,vComplexD>(funcTimes());
   Tester<ComplexD,vComplexD>(funcConj());
   Tester<ComplexD,vComplexD>(funcAdj());
-  Tester<ComplexD,vComplexD>(funcInnerProduct());
-  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
+  Tester<ComplexD, vComplexD>(funcReal());
+  Tester<ComplexD, vComplexD>(funcImag());
 
+  Tester<ComplexD, vComplexD>(funcInnerProduct());
+  ReductionTester<ComplexD, ComplexD, vComplexD>(funcReduce());
 
-  std::cout<<GridLogMessage << "==================================="<<  std::endl;
-  std::cout<<GridLogMessage << "Testing vComplexD permutes "<<std::endl;
-  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout << GridLogMessage
+            << "===================================" << std::endl;
+  std::cout << GridLogMessage << "Testing vComplexD permutes " << std::endl;
+  std::cout << GridLogMessage
+            << "===================================" << std::endl;
 
   // Log2 iteration
-  for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
-    PermTester<ComplexD,vComplexD>(funcPermute(i));
+  for (int i = 0; (1 << i) < vComplexD::Nsimd(); i++) {
+    PermTester<ComplexD, vComplexD>(funcPermute(i));
   }