From 1a1474b32339d9873fb46772032256ffe3330b72 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 4 Mar 2015 05:31:44 +0000
Subject: [PATCH] Better organisation

---
 Grid.h                   | 1122 +--------------------------------------------
 Grid_Lattice.h           |   98 +---
 Grid_QCD.h               |   94 ++++
 Grid_aligned_allocator.h |   56 ++
 Grid_config.h            |  101 ++++
 Grid_math_types.h        |  819 ++++++++++++++++++++++++++++
 Grid_signal.cc           |   16 +-
 Grid_simd.h              |   68 ++-
 8 files changed, 1161 insertions(+), 1213 deletions(-)
 create mode 100644 Grid_QCD.h
 create mode 100644 Grid_aligned_allocator.h
 create mode 100644 Grid_config.h
 create mode 100644 Grid_math_types.h

diff --git a/Grid.h b/Grid.h
index b3669ace..042835ca 100644
--- a/Grid.h
+++ b/Grid.h
@@ -40,1123 +40,21 @@
 #endif
 
-////////////////////////////////////////////////////////////
-// SIMD Alignment controls
-////////////////////////////////////////////////////////////
-#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
-#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
-#else
-#define ALIGN_DIRECTIVE(A) __declspec(align(A))
-#endif
-
-#ifdef SSE2
-#include
-#define SIMDalign ALIGN_DIRECTIVE(16)
-#endif
-
-#if defined(AVX1) || defined (AVX2)
-#include
-#define SIMDalign ALIGN_DIRECTIVE(32)
-#endif
-
-#ifdef AVX512
-#include
-#define SIMDalign ALIGN_DIRECTIVE(64)
-#endif
-
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 
 namespace dpo {
 
 void Grid_init(void);
-
-inline double usecond(void)
-{
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}
-
- typedef float  RealF;
- typedef double RealD;
- typedef RealF  Real;
-
- typedef std::complex<RealF> ComplexF;
- typedef std::complex<RealD> ComplexD;
- typedef std::complex<Real>  Complex;
-
-
- class Zero{};
- static Zero zero;
- template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
- template<> inline void ZeroIt(ComplexF &arg){ arg=0; };
- template<> inline void ZeroIt(ComplexD &arg){ arg=0; };
- template<> inline void ZeroIt(RealF &arg){ arg=0; };
- template<> inline void ZeroIt(RealD &arg){ arg=0; };
-
- // TODO
- //
- // Base class to share common code between vRealF, VComplexF etc...
- //
- // lattice broadcast assignment
- //
- // where() support
- // implement with masks, and/or? Type of the mask & boolean support?
- //
- // Unary functions
- // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg
- // exp, log, sqrt, fabs
- //
- // transposeColor, transposeSpin,
- // adjColor, adjSpin,
- // traceColor, traceSpin.
- // peekColor, peekSpin + pokeColor PokeSpin
- //
- // copyMask.
- //
- // localMaxAbs
- //
- // norm2,
- // sumMulti equivalent.
- // Fourier transform equivalent.
- //
-
- ////////////////////////////////////////////////////////////////////////////////
- //Provide support functions for basic real and complex data types required by dpo
- //Single and double precision versions. Should be able to template this once only.
- ////////////////////////////////////////////////////////////////////////////////
-
- inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
- inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
- inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
- inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
- inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
- // conj already supported for complex
-
- inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
- inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
- inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
- inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
- inline Complex adj(const Complex& r ){ return(conj(r)); }
- //conj already supported for complex
-
- inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
- inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
- inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
- inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
- inline RealD adj(const RealD & r){ return r; } // No-op for real
- inline RealD conj(const RealD & r){ return r; }
-
- inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
- inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
- inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
- inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
- inline RealF adj(const RealF & r){ return r; }
- inline RealF conj(const RealF & r){ return r; }
-
- ////////////////////////////////////////////////////////////////////////
- // Vector types are arch dependent /////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////
-#if defined (SSE2)
- typedef __m128  fvec;
- typedef __m128d dvec;
- typedef __m128  cvec;
- typedef __m128d zvec;
-#endif
-#if defined (AVX1) || defined (AVX2)
- typedef __m256  fvec;
- typedef __m256d dvec;
- typedef __m256  cvec;
- typedef __m256d zvec;
-#endif
-#if defined (AVX512)
- typedef __m512  fvec;
- typedef __m512d dvec;
- typedef __m512  cvec;
- typedef __m512d zvec;
-#endif
-#if defined (QPX)
- typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
- typedef float cvec __attribute__ ((vector_size (16)));
-
- typedef vector4double dvec;
- typedef vector4double zvec;
-#endif
-#if defined (AVX1) || defined (AVX2) || defined (AVX512)
- inline void v_prefetch0(int size, const char *ptr){
-   for(int i=0;i<size;i+=64){
-     _mm_prefetch(ptr+i,_MM_HINT_T0);
-   }
- }
-#endif
-
-///////////////////////////////////////////////////
-// Scalar, Vector, Matrix objects.
-// These can be composed to form tensor products of internal indices.
-///////////////////////////////////////////////////
-template<class vtype> class iScalar
-{
-public:
-  SIMDalign vtype _internal;
-  iScalar(){};
-  iScalar(Zero &z){ *this = zero; };
-  iScalar & operator= (const Zero &hero){
-    zeroit(*this);
-    return *this;
-  }
-  friend void zeroit(iScalar &that){
-    zeroit(that._internal);
-  }
-  // Unary negation
-  friend inline iScalar operator -(const iScalar &r) {
-    iScalar ret;
-    ret._internal= -r._internal;
-    return ret;
-  }
-  // *=,+=,-= operators
-  inline iScalar &operator *=(const iScalar &r) {
-    *this = (*this)*r;
-    return *this;
-  }
-  inline iScalar &operator -=(const iScalar &r) {
-    *this = (*this)-r;
-    return *this;
-  }
-  inline iScalar &operator +=(const iScalar &r) {
-    *this = (*this)+r;
-    return *this;
-  }
-
+  double usecond(void);
+  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+  void Grid_debug_handler_init(void);
 };
-
-template<class vtype,int N> class iVector
-{
-public:
-  SIMDalign vtype _internal[N];
-  iVector(Zero &z){ *this = zero; };
-  iVector() {};
-  iVector & operator= (Zero &hero){
-    zeroit(*this);
-    return *this;
-  }
-  friend void zeroit(iVector &that){
-    for(int i=0;i<N;i++){
-      zeroit(that._internal[i]);
-    }
-  }
-  // Unary negation
-  friend inline iVector<vtype,N> operator -(const iVector &r) {
-    iVector ret;
-    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
-    return ret;
-  }
-  // *=,+=,-= operators
-  inline iVector &operator *=(const iScalar<vtype> &r) {
-    *this = (*this)*r;
-    return *this;
-  }
-  inline iVector &operator -=(const iVector &r) {
-    *this = (*this)-r;
-    return *this;
-  }
-  inline iVector &operator +=(const iVector &r) {
-    *this = (*this)+r;
-    return *this;
-  }
-
-};
-
-
-template<class vtype,int N> class iMatrix
-{
-public:
-  SIMDalign vtype _internal[N][N];
-  iMatrix(Zero &z){ *this = zero; };
-  iMatrix() {};
-  iMatrix & operator= (Zero &hero){
-    zeroit(*this);
-    return *this;
-  }
-  friend void zeroit(iMatrix &that){
-    for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      zeroit(that._internal[i][j]);
-    }}
-  }
-  // Unary negation
-  friend inline iMatrix<vtype,N> operator -(const iMatrix &r) {
-    iMatrix ret;
-    for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      ret._internal[i][j]= -r._internal[i][j];
-    }}
-    return ret;
-  }
-  // *=,+=,-= operators
-  template<class T>
-  inline iMatrix &operator *=(const T &r) {
-    *this = (*this)*r;
-    return *this;
-  }
-  template<class T>
-  inline iMatrix &operator -=(const T &r) {
-    *this = (*this)-r;
-    return *this;
-  }
-  template<class T>
-  inline iMatrix &operator +=(const T &r) {
-    *this = (*this)+r;
-    return *this;
-  }
-
-};
-/*
- inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
- inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
- inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
- inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
-*/
- inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
- inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
- inline RealD localInnerProduct(const RealD & l, const RealD & r) { return conj(l)*r; }
- inline RealF localInnerProduct(const RealF & l, const RealF & r) { return conj(l)*r; }
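
[The three tensor classes removed here (and re-added below in Grid_math_types.h) are designed to nest: any of iScalar/iVector/iMatrix can serve as the vtype of another, with zeroit() and the arithmetic recursing one index level per call. A minimal stand-in sketch of why that works, using hypothetical *Sketch names rather than the patch's own classes:

#include <complex>
#include <cstdio>

typedef std::complex<double> ComplexD;
inline void zeroit(ComplexD &c){ c=0; }                 // bottom of the recursion

template<class vtype> struct iScalarSketch {
  vtype _internal;
  friend void zeroit(iScalarSketch &s){ zeroit(s._internal); }
};
template<class vtype,int N> struct iMatrixSketch {
  vtype _internal[N][N];
  friend void zeroit(iMatrixSketch &m){
    for(int i=0;i<N;i++) for(int j=0;j<N;j++) zeroit(m._internal[i][j]);
  }
};

int main(void){
  // Compose: a "colour matrix" = scalar wrapper around a 3x3 matrix,
  // as the QCD typedefs later in this patch do.
  iScalarSketch<iMatrixSketch<ComplexD,3> > cm;
  zeroit(cm);                                           // recurses through both levels
  printf("%f\n", cm._internal._internal[0][0].real());  // prints 0.000000
  return 0;
}
]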
-
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  ADD  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// ADD is simple for now; cannot mix types and straightforward template
-// Scalar +/- Scalar
-// Vector +/- Vector
-// Matrix +/- Matrix
-template<class vtype,class ltype,class rtype> inline void add(iScalar<vtype> * __restrict__ ret,
-                                                              const iScalar<ltype> * __restrict__ lhs,
-                                                              const iScalar<rtype> * __restrict__ rhs)
-{
-  add(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iVector<vtype,N> * __restrict__ ret,
-                                                                    const iVector<ltype,N> * __restrict__ lhs,
-                                                                    const iVector<rtype,N> * __restrict__ rhs)
-{
-  for(int c=0;c<N;c++){
-    ret->_internal[c]=lhs->_internal[c]+rhs->_internal[c];
-  }
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
-{
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iScalar<ltype> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
-{
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iScalar<rtype> * __restrict__ rhs)
-{
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    if ( c1==c2)
-      add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-    else
-      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
-  }}
-  return;
-}
-// Need to figure multi-precision.
-template<class Mytype> Mytype timesI(Mytype &r)
-{
-  iScalar<Complex> i;
-  i._internal = Complex(0,1);
-  return r*i;
-}
-
- // + operator for scalar, vector, matrix
-template<class ltype,class rtype>
-//inline auto operator + (iScalar<ltype>& lhs,iScalar<rtype>&& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
-inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
-{
-  typedef iScalar<decltype(lhs._internal+rhs._internal)> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
-{
-  typedef iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> ret_t;
-  ret_t ret;
-  add(&ret,&lhs,&rhs);
-  return ret;
-}
-
-
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  SUB  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-// SUB is simple for now; cannot mix types and straightforward template
-// Scalar +/- Scalar
-// Vector +/- Vector
-// Matrix +/- Matrix
-// Matrix +/- Scalar
-template<class vtype,class ltype,class rtype> inline void sub(iScalar<vtype> * __restrict__ ret,
-                                                              const iScalar<ltype> * __restrict__ lhs,
-                                                              const iScalar<rtype> * __restrict__ rhs)
-{
-  sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-
-template<class vtype,class ltype,class rtype,int N> inline void sub(iVector<vtype,N> * __restrict__ ret,
-                                                                    const iVector<ltype,N> * __restrict__ lhs,
-                                                                    const iVector<rtype,N> * __restrict__ rhs)
-{
-  for(int c=0;c<N;c++){
-    ret->_internal[c]=lhs->_internal[c]-rhs->_internal[c];
-  }
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iScalar<ltype> * __restrict__ lhs,
-                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    if ( c1==c2) {
-      sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-    } else {
-      // Fails -- need unary minus. Catalogue other unops?
-      ret->_internal[c1][c2]=zero;
-      ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2];
-
-    }
-  }}
-  return;
-}
-template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
-                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
-                                                                    const iScalar<rtype> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    if ( c1==c2)
-      sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-    else
-      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
-  }}
-  return;
-}
-
-template<class v> void vprefetch(const iScalar<v> &vv)
-{
-  vprefetch(vv._internal);
-}
-template<class v,int N> void vprefetch(const iVector<v,N> &vv)
-{
-  for(int i=0;i<N;i++){
-    vprefetch(vv._internal[i]);
-  }
-}
-template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
-{
-  for(int i=0;i<N;i++){
-  for(int j=0;j<N;j++){
-    vprefetch(vv._internal[i][j]);
-  }}
-}
-
- // - operator for scalar, vector, matrix
-template<class ltype,class rtype> inline auto
-operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal-rhs._internal)>
-{
-  typedef iScalar<decltype(lhs._internal-rhs._internal)> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
-{
-  typedef iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
-{
-  typedef iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class ltype,class rtype,int N>
-inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
-{
-  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> ret_t;
-  ret_t ret;
-  sub(&ret,&lhs,&rhs);
-  return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////  MAC  ///////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
- ///////////////////////////
- // Legal multiplication table
- ///////////////////////////
- // scal x scal = scal
- // mat x  mat  = mat
- // mat  x scal = mat
- // scal x mat  = mat
- // mat  x vec  = vec
- // vec  x scal = vec
- // scal x vec  = vec
- ///////////////////////////
-template<class rtype,class vtype,class mtype>
-inline void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
-{
-  mac(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-  for(int c3=0;c3<N;c3++){
-    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
-  }}}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iVector<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-    mac(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
-  }
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mac(iVector<rtype,N> * __restrict__ ret,const iVector<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-    mac(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
-  }
-  return;
-}
-
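
[The legal multiplication table above is what the mult/mac overloads below implement; the return kind follows the table, not the operand kinds. A hedged usage sketch, assuming the umbrella header compiles after this reorganisation (the include name is taken from this patch, the demo itself is not):

#include <Grid.h>
using namespace dpo;

void demo(void){
  iScalar<ComplexD>   s;  zeroit(s);
  iMatrix<ComplexD,3> m;  zeroit(m);
  auto ss = s*s;   // scal x scal -> scal
  auto mm = m*m;   // mat  x mat  -> mat
  auto ms = m*s;   // mat  x scal -> mat
  auto sm = s*m;   // scal x mat  -> mat
  // vec x vec is deliberately absent from the table: two vectors are
  // combined with localInnerProduct (contract) or outerProduct (expand).
}
]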
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  MUL  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template<class rtype,class vtype,class mtype>
-inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
-  mult(&ret->_internal,&lhs->_internal,&rhs->_internal);
-}
-
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
-    for(int c3=1;c3<N;c3++){
-      mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
-    }
-  }}
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
-  }}
-  return;
-}
-
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
-  for(int c2=0;c2<N;c2++){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
-  }}
-  return;
-}
-// Matrix left multiplies vector
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
-{
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]);
-    for(int c2=1;c2<N;c2++){
-      mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
-    }
-  }
-  return;
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iVector<rtype,N> * __restrict__ ret,
-                 const iScalar<vtype> * __restrict__ lhs,
-                 const iVector<mtype,N> * __restrict__ rhs){
-  for(int c1=0;c1<N;c1++){
-    mult(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
-  }
-}
-template<class rtype,class vtype,class mtype,int N>
-inline void mult(iVector<rtype,N> * __restrict__ ret,
-                 const iVector<vtype,N> * __restrict__ rhs,
-                 const iScalar<mtype> * __restrict__ lhs){
-  mult(ret,lhs,rhs);
-}
-
-
-
-template<class rtype,class vtype,int N> inline
-iVector<rtype,N> operator * (const iMatrix<vtype,N>& lhs,const iVector<rtype,N>& rhs)
-{
-  iVector<rtype,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-
-template<class rtype,class vtype,int N> inline
-iVector<rtype,N> operator * (const iScalar<vtype>& lhs,const iVector<rtype,N>& rhs)
-{
-  iVector<rtype,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-
-template<class rtype,class vtype,int N> inline
-iVector<rtype,N> operator * (const iVector<rtype,N>& lhs,const iScalar<vtype>& rhs)
-{
-  iVector<rtype,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-
- //////////////////////////////////////////////////////////////////
- // Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
- // since nesting matrix<scalar> x matrix<scalar> -> matrix<scalar>
- // while   matrix<vector> x matrix<vector> -> matrix<vector>
- // so return type depends on argument types in nasty way.
- //////////////////////////////////////////////////////////////////
- // scal x scal = scal
- // mat x  mat  = mat
- // mat  x scal = mat
- // scal x mat  = mat
- // mat  x vec  = vec
- // vec  x scal = vec
- // scal x vec  = vec
-
-template<class l,class r>
-inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal*rhs._internal)>
-{
-  typedef iScalar<decltype(lhs._internal*rhs._internal)> ret_t;
-  ret_t ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
-{
-  typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t;
-  iMatrix<ret_t,N> ret;
-  mult(&ret,&lhs,&rhs);
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iMatrix<l,N>& lhs,const iScalar<r>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
-{
-  typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t;
-
-  iMatrix<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mult(&ret._internal[c1][c2],&lhs._internal[c1][c2],&rhs._internal);
-  }}
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
-{
-  typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t;
-  iMatrix<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    mult(&ret._internal[c1][c2],&lhs._internal,&rhs._internal[c1][c2]);
-  }}
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
-{
-  typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t;
-  iVector<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-    mult(&ret._internal[c1],&lhs._internal[c1][0],&rhs._internal[0]);
-    for(int c2=1;c2<N;c2++){
-      mac(&ret._internal[c1],&lhs._internal[c1][c2],&rhs._internal[c2]);
-    }
-  }
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
-{
-  typedef decltype(lhs._internal*rhs._internal[0]) ret_t;
-  iVector<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-    mult(&ret._internal[c1],&lhs._internal,&rhs._internal[c1]);
-  }
-  return ret;
-}
-template<class l,class r,int N> inline
-auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
-{
-  typedef decltype(lhs._internal[0]*rhs._internal) ret_t;
-  iVector<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-    mult(&ret._internal[c1],&lhs._internal[c1],&rhs._internal);
-  }
-  return ret;
-}
-
- ///////////////////////////////////////////////////////////////////////////////////////
- // localInnerProduct Scalar x Scalar -> Scalar
- // localInnerProduct Vector x Vector -> Scalar
- // localInnerProduct Matrix x Matrix -> Scalar
- ///////////////////////////////////////////////////////////////////////////////////////
- template<class l,class r,int N> inline
- auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
- {
-   typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
-   iScalar<ret_t> ret=zero;
-   for(int c1=0;c1<N;c1++){
-     ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
-   }
-   return ret;
- }
- template<class l,class r,int N> inline
- auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
- {
-   typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
-   iScalar<ret_t> ret=zero;
-   for(int c1=0;c1<N;c1++){
-   for(int c2=0;c2<N;c2++){
-     ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
-   }}
-   return ret;
- }
- template<class l,class r> inline
- auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
- {
-   typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
-   iScalar<ret_t> ret;
-   ret._internal = localInnerProduct(lhs._internal,rhs._internal);
-   return ret;
- }
-
- ///////////////////////////////////////////////////////////////////////////////////////
- // outerProduct Scalar x Scalar -> Scalar
- //              Vector x Vector -> Matrix
- ///////////////////////////////////////////////////////////////////////////////////////
-
-template<class l,class r,int N> inline
-auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
-{
-  typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
-  iMatrix<ret_t,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
-  }}
-  return ret;
-}
-template<class l,class r> inline
-auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
-{
-  typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
-  iScalar<ret_t> ret;
-  ret._internal = outerProduct(lhs._internal,rhs._internal);
-  return ret;
-}
-/*
- inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
- {
-   return l*r;
- }
- inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r)
- {
-   return l*r;
- }
- inline vRealF outerProduct(const vRealF &l, const vRealF& r)
- {
-   return l*r;
- }
- inline vRealD outerProduct(const vRealD &l, const vRealD& r)
- {
-   return l*r;
- }
-*/
- inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
- {
-   return l*r;
- }
- inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
- {
-   return l*r;
- }
- inline RealF outerProduct(const RealF &l, const RealF& r)
- {
-   return l*r;
- }
- inline RealD outerProduct(const RealD &l, const RealD& r)
- {
-   return l*r;
- }
- ///////////////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////  CONJ  ///////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conj function for scalar, vector, matrix
-template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
-{
-  iScalar<vtype> ret;
-  ret._internal = conj(r._internal);
-  return ret;
-}
-
-// Adj function for scalar, vector, matrix
-template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
-{
-  iScalar<vtype> ret;
-  ret._internal = adj(r._internal);
-  return ret;
-}
-template<class vtype,int N> inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
-{
-  iVector<vtype,N> ret;
-  for(int i=0;i<N;i++){
-    ret._internal[i] = adj(r._internal[i]);
-  }
-  return ret;
-}
-template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
-{
-  iMatrix<vtype,N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2]=adj(arg._internal[c2][c1]);
-  }}
-  return ret;
-}
-
-template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
-{
-  iScalar<decltype(real(z._internal))> ret;
-  ret._internal = real(z._internal);
-  return ret;
-}
-template<class itype,int N> inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
-{
-  iMatrix<decltype(real(z._internal[0][0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2] = real(z._internal[c1][c2]);
-  }}
-  return ret;
-}
-template<class itype,int N> inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
-{
-  iVector<decltype(real(z._internal[0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-    ret._internal[c1] = real(z._internal[c1]);
-  }
-  return ret;
-}
-
-template<class itype> inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
-{
-  iScalar<decltype(imag(z._internal))> ret;
-  ret._internal = imag(z._internal);
-  return ret;
-}
-template<class itype,int N> inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
-{
-  iMatrix<decltype(imag(z._internal[0][0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-  for(int c2=0;c2<N;c2++){
-    ret._internal[c1][c2] = imag(z._internal[c1][c2]);
-  }}
-  return ret;
-}
-template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
-{
-  iVector<decltype(imag(z._internal[0])),N> ret;
-  for(int c1=0;c1<N;c1++){
-    ret._internal[c1] = imag(z._internal[c1]);
-  }
-  return ret;
-}
-
-template<class vtype,int N>
-inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
-{
-  iScalar<decltype(trace(arg._internal[0][0]))> ret;
-  ZeroIt(ret._internal);
-  for(int i=0;i<N;i++){
-    ret._internal=ret._internal+trace(arg._internal[i][i]);
-  }
-  return ret;
-}
-template<class vtype>
-inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
-{
-  iScalar<decltype(trace(arg._internal))> ret;
-  ret._internal=trace(arg._internal);
-  return ret;
-}
-
-/////////////////////////////////////////////////////////////////////////
-// Generic routine to promote object -> object
-// Supports the array reordering transformation that gives me SIMD utilisation
-/////////////////////////////////////////////////////////////////////////
-/*
-template<template<class> class object>
-inline object<vComplex> splat(object<Complex >s){
-  object<vComplex> ret;
-  vComplex * v_ptr = (vComplex *)& ret;
-  Complex * s_ptr = (Complex *) &s;
-  for(int i=0;i<sizeof(ret)/sizeof(vComplex);i++){
-    vsplat(v_ptr[i],s_ptr[i]);
-  }
-  return ret;
-}
-*/
-
-class Grid {
-public:
-  // Give Lattice access
-  template<class object> friend class Lattice;
-
-//protected:
-
-  // Lattice wide random support. not yet fully implemented. Need seed strategy
-  // and one generator per site.
-  //std::default_random_engine generator;
-  // static std::mt19937  generator( 9 );
-
-
-  // Grid information.
-  unsigned long _ndimension;
-  std::vector<int> _layout;      // Which dimensions get relayed out over simd lanes.
-  std::vector<int> _dimensions;  // Dimensions of array
-  std::vector<int> _rdimensions; // Reduced dimensions with simd lane images removed
-  std::vector<int> _ostride;     // Outer stride for each dimension
-  std::vector<int> _istride;     // Inner stride i.e. within simd lane
-  int _osites;                   // _isites*_osites = product(dimensions).
-  int _isites;
-
-  // subslice information
-  std::vector<int> _slice_block;
-  std::vector<int> _slice_stride;
-  std::vector<int> _slice_nblock;
-public:
-
-  // These routines are key. Subdivide the linearised cartesian index into
-  //      "inner" index identifying which simd lane of object is associated with coord
-  //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
-  // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
-  // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
-  // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
-  // lanes are operated upon simultaneously.
-
-  inline int oIndexReduced(std::vector<int> &rcoor)
-  {
-    int idx=0;
-    for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*rcoor[d];
-    return idx;
-  }
-  virtual int oIndex(std::vector<int> &coor)
-  {
-    int idx=0;
-    for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
-    return idx;
-  }
-  inline int iIndex(std::vector<int> &rcoor)
-  {
-    int idx=0;
-    for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
-    return idx;
-  }
-
-  inline int oSites(void) { return _osites; };
-  inline int iSites(void) { return _isites; };
-  virtual int CheckerBoard(std::vector<int> site)=0;
-  virtual int CheckerBoardDestination(int source_cb,int shift)=0;
-  virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0;
-};
-
-////////////////////////////////////////////////////////////////////
-// A lattice of something, but assume the something is SIMDized.
-////////////////////////////////////////////////////////////////////
-template<typename _Tp>
-class myallocator {
-public:
-  typedef std::size_t     size_type;
-  typedef std::ptrdiff_t  difference_type;
-  typedef _Tp*       pointer;
-  typedef const _Tp* const_pointer;
-  typedef _Tp&       reference;
-  typedef const _Tp& const_reference;
-  typedef _Tp        value_type;
-
-  template<typename _Tp1> struct rebind { typedef myallocator<_Tp1> other; };
-  myallocator() throw() { }
-  myallocator(const myallocator&) throw() { }
-  template<typename _Tp1> myallocator(const myallocator<_Tp1>&) throw() { }
-  ~myallocator() throw() { }
-  pointer address(reference __x) const { return &__x; }
-  const_pointer address(const_reference __x) const { return &__x; }
-  size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-  // Should override allocate and deallocate
-  pointer allocate(size_type __n, const void* = 0)
-  {
-    //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
-    // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
-#ifdef AVX512
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
-#else
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
-#endif
-
-    return ptr;
-  }
-  void deallocate(pointer __p, size_type) {
-    free(__p);
-  }
-  void construct(pointer __p, const _Tp& __val) { };
-  void construct(pointer __p) { };
-  void destroy(pointer __p) { };
-};
-
-template<typename _Tp> inline bool
-operator==(const myallocator<_Tp>&, const myallocator<_Tp>&){ return true; }
-
-template<typename _Tp> inline bool
-operator!=(const myallocator<_Tp>&, const myallocator<_Tp>&){ return false; }
-
-
-}; // namespace dpo
-
 #endif
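
[The inner/outer index split in class Grid above is easiest to see with numbers. A standalone restatement of oIndex/iIndex on a hypothetical 4x4 grid with two SIMD lanes along dimension 0 (the stride values follow the formulas above, not any code in this patch):

#include <cstdio>
#include <vector>

int main(void){
  std::vector<int> dims   = {4,4};
  std::vector<int> layout = {2,1};   // simd lanes per dimension
  std::vector<int> rdims  = {2,4};   // reduced dims: dims[d]/layout[d]
  std::vector<int> ostride= {1,2};   // outer strides over rdims
  std::vector<int> istride= {1,2};   // inner (lane) strides
  std::vector<int> coor   = {3,1};   // one lattice site
  int o=0,i=0;
  for(int d=0;d<2;d++) o += ostride[d]*(coor[d]%rdims[d]);   // oIndex
  for(int d=0;d<2;d++) i += istride[d]*(coor[d]/rdims[d]);   // iIndex
  printf("site (3,1) -> _odata[%d], simd lane %d\n",o,i);    // _odata[3], lane 1
  return 0;
}
]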

diff --git a/Grid_Lattice.h b/Grid_Lattice.h
index f18784fb..47b85b0f 100644
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -1,5 +1,7 @@
+#ifndef GRID_LATTICE_H
+#define GRID_LATTICE_H
+
 #include "Grid.h"
-#include "Grid_vComplexD.h"
 
 namespace dpo {
 
@@ -9,7 +11,7 @@ class Lattice
 public:
     Grid *_grid;
     int checkerboard;
-    std::vector<vobj,myallocator<vobj> > _odata;
+    std::vector<vobj,alignedAllocator<vobj> > _odata;
 
 public:
 
@@ -554,95 +556,5 @@ public:
         return ret;
     }
 
-
-namespace QCD {
-
-    static const int Nc=3;
-    static const int Ns=4;
-
-    static const int CbRed  =0;
-    static const int CbBlack=1;
-
-    // QCD iMatrix types
-    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
-    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
-    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
-
-    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
-
-    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
-    template<typename vtype> using iColourVector     = iScalar<iVector<vtype, Nc> >;
-    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
-
-    typedef iSinglet<Complex >  TComplex;  // This is painful. Tensor singlet complex type.
-    typedef iSinglet<vComplex > vTComplex;
-    typedef iSinglet<Real >     TReal;     // This is painful. Tensor singlet real type.
-
-    typedef iSpinMatrix<Complex >       SpinMatrix;
-    typedef iColourMatrix<Complex >     ColourMatrix;
-    typedef iSpinColourMatrix<Complex > SpinColourMatrix;
-
-    typedef iSpinVector<Complex >       SpinVector;
-    typedef iColourVector<Complex >     ColourVector;
-    typedef iSpinColourVector<Complex > SpinColourVector;
-
-
-    typedef iSpinMatrix<vComplex >       vSpinMatrix;
-    typedef iColourMatrix<vComplex >     vColourMatrix;
-    typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
-
-    typedef iSpinVector<vComplex >       vSpinVector;
-    typedef iColourVector<vComplex >     vColourVector;
-    typedef iSpinColourVector<vComplex > vSpinColourVector;
-
-
-    typedef Lattice<vTComplex>         LatticeComplex;
-
-    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
-    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
-    typedef Lattice<vSpinColourMatrix> LatticePropagator;
-    typedef LatticePropagator          LatticeSpinColourMatrix;
-
-    typedef Lattice<vSpinColourVector> LatticeFermion;
-    typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
-    typedef Lattice<vSpinVector>       LatticeSpinVector;
-    typedef Lattice<vColourVector>     LatticeColourVector;
-
-    // localNorm2,
-    template<class tt>
-    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=trace(adj(rhs)*rhs);
-        }
-        return ret;
-    }
-    // localInnerProduct
-    template<class ll,class rr>
-    inline LatticeComplex localInnerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
-    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-}   //namespace QCD
-
 }
+#endif
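
[The QCD aliases being moved out of Grid_Lattice.h compose the three tensor classes with spin as the outer index and colour as the inner one; even spin-singlet objects keep a trivial iScalar wrapper so spin and colour traces recurse through the same overloads. A hedged access sketch, assuming the umbrella Grid.h compiles after this reorganisation:

#include <Grid.h>
using namespace dpo;
using namespace dpo::QCD;

int main(void){
  SpinColourMatrix scm;   // iMatrix<iMatrix<Complex,Nc>,Ns>: 4x4 spin, 3x3 colour entries
  zeroit(scm);
  // Spin indices select the outer iMatrix, colour indices the inner one:
  scm._internal[0][1]._internal[2][0] = Complex(1.0,0.0);
  ColourMatrix cm;        // iScalar<iMatrix<Complex,Nc>>: trivial spin wrapper
  zeroit(cm);
  cm._internal._internal[0][0] = Complex(1.0,0.0);
  return 0;
}
]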

diff --git a/Grid_QCD.h b/Grid_QCD.h
new file mode 100644
index 00000000..92a02506
--- /dev/null
+++ b/Grid_QCD.h
@@ -0,0 +1,94 @@
+#ifndef GRID_QCD_H
+#define GRID_QCD_H
+namespace dpo{
+namespace QCD {
+
+    static const int Nc=3;
+    static const int Ns=4;
+
+    static const int CbRed  =0;
+    static const int CbBlack=1;
+
+    // QCD iMatrix types
+    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
+    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
+    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
+
+    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
+
+    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
+    template<typename vtype> using iColourVector     = iScalar<iVector<vtype, Nc> >;
+    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
+
+    typedef iSinglet<Complex >  TComplex;  // This is painful. Tensor singlet complex type.
+    typedef iSinglet<vComplex > vTComplex;
+    typedef iSinglet<Real >     TReal;     // This is painful. Tensor singlet real type.
+
+    typedef iSpinMatrix<Complex >       SpinMatrix;
+    typedef iColourMatrix<Complex >     ColourMatrix;
+    typedef iSpinColourMatrix<Complex > SpinColourMatrix;
+
+    typedef iSpinVector<Complex >       SpinVector;
+    typedef iColourVector<Complex >     ColourVector;
+    typedef iSpinColourVector<Complex > SpinColourVector;
+
+
+    typedef iSpinMatrix<vComplex >       vSpinMatrix;
+    typedef iColourMatrix<vComplex >     vColourMatrix;
+    typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
+
+    typedef iSpinVector<vComplex >       vSpinVector;
+    typedef iColourVector<vComplex >     vColourVector;
+    typedef iSpinColourVector<vComplex > vSpinColourVector;
+
+
+    typedef Lattice<vTComplex>         LatticeComplex;
+
+    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
+    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
+    typedef Lattice<vSpinColourMatrix> LatticePropagator;
+    typedef LatticePropagator          LatticeSpinColourMatrix;
+
+    typedef Lattice<vSpinColourVector> LatticeFermion;
+    typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
+    typedef Lattice<vSpinVector>       LatticeSpinVector;
+    typedef Lattice<vColourVector>     LatticeColourVector;
+
+    // localNorm2,
+    template<class tt>
+    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
+    {
+        LatticeComplex ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=trace(adj(rhs)*rhs);
+        }
+        return ret;
+    }
+    // localInnerProduct
+    template<class ll,class rr>
+    inline LatticeComplex localInnerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs)
+    {
+        LatticeComplex ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+    }
+
+    // outerProduct Scalar x Scalar -> Scalar
+    //              Vector x Vector -> Matrix
+    template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+    {
+        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+    }
+} //namespace QCD
+} // dpo
+#endif
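
[The three lattice routines in Grid_QCD.h all follow the same site-parallel pattern: one OpenMP loop over outer sites, with the SIMD lanes inside each vobj handled implicitly by the vectorised element type. The pattern extends to any site-local function; a hedged sketch (localAdj is an invented name for illustration, not part of this patch):

#include <Grid.h>
using namespace dpo;

template<class vobj>
inline Lattice<vobj> localAdj(const Lattice<vobj> &rhs)
{
    Lattice<vobj> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites();ss++){
        ret._odata[ss] = adj(rhs._odata[ss]);   // site-local, lane-parallel
    }
    return ret;
}
]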

diff --git a/Grid_aligned_allocator.h b/Grid_aligned_allocator.h
new file mode 100644
index 00000000..6040c233
--- /dev/null
+++ b/Grid_aligned_allocator.h
@@ -0,0 +1,56 @@
+#ifndef GRID_ALIGNED_ALLOCATOR_H
+#define GRID_ALIGNED_ALLOCATOR_H
+namespace dpo {
+
+////////////////////////////////////////////////////////////////////
+// A lattice of something, but assume the something is SIMDized.
+////////////////////////////////////////////////////////////////////
+template<typename _Tp>
+class alignedAllocator {
+public:
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
+  alignedAllocator() throw() { }
+  alignedAllocator(const alignedAllocator&) throw() { }
+  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
+  ~alignedAllocator() throw() { }
+  pointer address(reference __x) const { return &__x; }
+  const_pointer address(const_reference __x) const { return &__x; }
+  size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+  // Should override allocate and deallocate
+  pointer allocate(size_type __n, const void* = 0)
+  {
+    //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
+    // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#ifdef AVX512
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#else
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#endif
+
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) {
+    free(__p);
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+
+template<typename _Tp> inline bool
+operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+
+template<typename _Tp> inline bool
+operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+
+
+}; // namespace dpo
+#endif
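
[alignedAllocator slots into the standard containers, which is how Lattice declares _odata above. A usage sketch, assuming Grid_aligned_allocator.h is on the include path; note that deallocate() hands the pointer to free() while the non-AVX512 branch of allocate() uses _mm_malloc(), where a matching _mm_free() would normally be expected (kept here as the patch has it):

#include <vector>
#include <cstdint>
#include <cassert>
#include <Grid_aligned_allocator.h>

int main(void){
  // Any element type works; the backing store is requested at 128 bytes.
  std::vector<double, dpo::alignedAllocator<double> > v(1024);
  assert( (uintptr_t)&v[0] % 128 == 0 );   // storage is 128-byte aligned
  return 0;
}
]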

diff --git a/Grid_config.h b/Grid_config.h
new file mode 100644
index 00000000..b882917d
--- /dev/null
+++ b/Grid_config.h
@@ -0,0 +1,101 @@
+/* Grid_config.h.  Generated from Grid_config.h.in by configure.  */
+/* Grid_config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* AVX */
+#define AVX1 1
+
+/* AVX2 */
+/* #undef AVX2 */
+
+/* AVX512 */
+/* #undef AVX512 */
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <malloc.h> header file. */
+/* #undef HAVE_MALLOC_H */
+
+/* Define to 1 if you have the <malloc/malloc.h> header file. */
+#define HAVE_MALLOC_MALLOC_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the `aligned' variable attribute */
+#define HAVE_VAR_ATTRIBUTE_ALIGNED 1
+
+/* Name of package */
+#define PACKAGE "grid"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "Grid"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "Grid 1.0"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "grid"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "1.0"
+
+/* SSE2 */
+/* #undef SSE2 */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Version number of package */
+#define VERSION "1.0"
+
+/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT32_T */
+
+/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT64_T */
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* Define to the type of an unsigned integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint32_t */
+
+/* Define to the type of an unsigned integer type of width exactly 64 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint64_t */
diff --git a/Grid_math_types.h b/Grid_math_types.h
new file mode 100644
index 00000000..405d5c9a
--- /dev/null
+++ b/Grid_math_types.h
@@ -0,0 +1,819 @@
+#ifndef GRID_MATH_TYPES_H
+#define GRID_MATH_TYPES_H
+namespace dpo {
+///////////////////////////////////////////////////
+// Scalar, Vector, Matrix objects.
+// These can be composed to form tensor products of internal indices.
+///////////////////////////////////////////////////
+
+template<class vtype> class iScalar
+{
+public:
+  SIMDalign vtype _internal;
+  iScalar(){};
+  iScalar(Zero &z){ *this = zero; };
+  iScalar & operator= (const Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend void zeroit(iScalar &that){
+    zeroit(that._internal);
+  }
+  // Unary negation
+  friend inline iScalar operator -(const iScalar &r) {
+    iScalar ret;
+    ret._internal= -r._internal;
+    return ret;
+  }
+  // *=,+=,-= operators
+  inline iScalar &operator *=(const iScalar &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  inline iScalar &operator -=(const iScalar &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  inline iScalar &operator +=(const iScalar &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+
+
+};
+
+template<class vtype,int N> class iVector
+{
+public:
+  SIMDalign vtype _internal[N];
+  iVector(Zero &z){ *this = zero; };
+  iVector() {};
+  iVector & operator= (Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend void zeroit(iVector &that){
+    for(int i=0;i<N;i++){
+      zeroit(that._internal[i]);
+    }
+  }
+  // Unary negation
+  friend inline iVector<vtype,N> operator -(const iVector &r) {
+    iVector ret;
+    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+    return ret;
+  }
+  // *=,+=,-= operators
+  inline iVector &operator *=(const iScalar<vtype> &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  inline iVector &operator -=(const iVector &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  inline iVector &operator +=(const iVector &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+
+};
+
+
+template<class vtype,int N> class iMatrix
+{
+public:
+  SIMDalign vtype _internal[N][N];
+  iMatrix(Zero &z){ *this = zero; };
+  iMatrix() {};
+  iMatrix & operator= (Zero &hero){
+    zeroit(*this);
+    return *this;
+  }
+  friend void zeroit(iMatrix &that){
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      zeroit(that._internal[i][j]);
+    }}
+  }
+  // Unary negation
+  friend inline iMatrix<vtype,N> operator -(const iMatrix &r) {
+    iMatrix ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      ret._internal[i][j]= -r._internal[i][j];
+    }}
+    return ret;
+  }
+  // *=,+=,-= operators
+  template<class T>
+  inline iMatrix &operator *=(const T &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  template<class T>
+  inline iMatrix &operator -=(const T &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  template<class T>
+  inline iMatrix &operator +=(const T &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+
+};
+
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  ADD  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// ADD is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+template<class vtype,class ltype,class rtype> inline void add(iScalar<vtype> * __restrict__ ret,
+                                                              const iScalar<ltype> * __restrict__ lhs,
+                                                              const iScalar<rtype> * __restrict__ rhs)
+{
+  add(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iVector<vtype,N> * __restrict__ ret,
+                                                                    const iVector<ltype,N> * __restrict__ lhs,
+                                                                    const iVector<rtype,N> * __restrict__ rhs)
+{
+  for(int c=0;c<N;c++){
+    ret->_internal[c]=lhs->_internal[c]+rhs->_internal[c];
+  }
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iScalar<ltype> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iScalar<rtype> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    if ( c1==c2)
+      add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+    else
+      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
+  }}
+  return;
+}
+// Need to figure multi-precision.
+template<class Mytype> Mytype timesI(Mytype &r)
+{
+  iScalar<Complex> i;
+  i._internal = Complex(0,1);
+  return r*i;
+}
+
+ // + operator for scalar, vector, matrix
+template<class ltype,class rtype>
+//inline auto operator + (iScalar<ltype>& lhs,iScalar<rtype>&& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
+inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal+rhs._internal)>
+{
+  typedef iScalar<decltype(lhs._internal+rhs._internal)> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
+{
+  typedef iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> ret_t;
+  ret_t ret;
+  add(&ret,&lhs,&rhs);
+  return ret;
+}
+
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  SUB  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// SUB is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+// Matrix +/- Scalar
+template<class vtype,class ltype,class rtype> inline void sub(iScalar<vtype> * __restrict__ ret,
+                                                              const iScalar<ltype> * __restrict__ lhs,
+                                                              const iScalar<rtype> * __restrict__ rhs)
+{
+  sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class vtype,class ltype,class rtype,int N> inline void sub(iVector<vtype,N> * __restrict__ ret,
+                                                                    const iVector<ltype,N> * __restrict__ lhs,
+                                                                    const iVector<rtype,N> * __restrict__ rhs)
+{
+  for(int c=0;c<N;c++){
+    ret->_internal[c]=lhs->_internal[c]-rhs->_internal[c];
+  }
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iScalar<ltype> * __restrict__ lhs,
+                                                                    const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    if ( c1==c2) {
+      sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+    } else {
+      // Fails -- need unary minus. Catalogue other unops?
+      ret->_internal[c1][c2]=zero;
+      ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2];
+
+    }
+  }}
+  return;
+}
+template<class vtype,class ltype,class rtype,int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                    const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                    const iScalar<rtype> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    if ( c1==c2)
+      sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+    else
+      ret->_internal[c1][c2]=lhs->_internal[c1][c2];
+  }}
+  return;
+}
+
+template<class v> void vprefetch(const iScalar<v> &vv)
+{
+  vprefetch(vv._internal);
+}
+template<class v,int N> void vprefetch(const iVector<v,N> &vv)
+{
+  for(int i=0;i<N;i++){
+    vprefetch(vv._internal[i]);
+  }
+}
+template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
+{
+  for(int i=0;i<N;i++){
+  for(int j=0;j<N;j++){
+    vprefetch(vv._internal[i][j]);
+  }}
+}
+
+ // - operator for scalar, vector, matrix
+template<class ltype,class rtype> inline auto
+operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal-rhs._internal)>
+{
+  typedef iScalar<decltype(lhs._internal-rhs._internal)> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
+{
+  typedef iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
+{
+  typedef iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
+{
+  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> ret_t;
+  ret_t ret;
+  sub(&ret,&lhs,&rhs);
+  return ret;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////  MAC  ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+ ///////////////////////////
+ // Legal multiplication table
+ ///////////////////////////
+ // scal x scal = scal
+ // mat x  mat  = mat
+ // mat  x scal = mat
+ // scal x mat  = mat
+ // mat  x vec  = vec
+ // vec  x scal = vec
+ // scal x vec  = vec
+ ///////////////////////////
+template<class rtype,class vtype,class mtype>
+inline void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
+{
+  mac(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+  for(int c3=0;c3<N;c3++){
+    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+  }}}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iVector<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+    mac(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
+  }
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mac(iVector<rtype,N> * __restrict__ ret,const iVector<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+    mac(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
+  }
+  return;
+}
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  MUL  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template<class rtype,class vtype,class mtype>
+inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
+  mult(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
+    for(int c3=1;c3<N;c3++){
+      mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+    }
+  }}
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+  }}
+  return;
+}
+
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iMatrix<rtype,N> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iMatrix<mtype,N> * __restrict__ rhs){
+  for(int c2=0;c2<N;c2++){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+  }}
+  return;
+}
+// Matrix left multiplies vector
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<vtype,N> * __restrict__ lhs,const iVector<mtype,N> * __restrict__ rhs)
+{
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]);
+    for(int c2=1;c2<N;c2++){
+      mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
+    }
+  }
+  return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,
+                 const iScalar<vtype> * __restrict__ lhs,
+                 const iVector<mtype,N> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
+  }
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,
+                 const iVector<vtype,N> * __restrict__ rhs,
+                 const iScalar<mtype> * __restrict__ lhs){
+  mult(ret,lhs,rhs);
+}
+
+
+
+template<class rtype,class vtype,int N> inline
+iVector<rtype,N> operator * (const iMatrix<vtype,N>& lhs,const iVector<rtype,N>& rhs)
+{
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+template<class rtype,class vtype,int N> inline
+iVector<rtype,N> operator * (const iScalar<vtype>& lhs,const iVector<rtype,N>& rhs)
+{
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+template<class rtype,class vtype,int N> inline
+iVector<rtype,N> operator * (const iVector<rtype,N>& lhs,const iScalar<vtype>& rhs)
+{
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+ //////////////////////////////////////////////////////////////////
+ // Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
+ // since nesting matrix<scalar> x matrix<scalar> -> matrix<scalar>
+ // while   matrix<vector> x matrix<vector> -> matrix<vector>
+ // so return type depends on argument types in nasty way.
+ //////////////////////////////////////////////////////////////////
+ // scal x scal = scal
+ // mat x  mat  = mat
+ // mat  x scal = mat
+ // scal x mat  = mat
+ // mat  x vec  = vec
+ // vec  x scal = vec
+ // scal x vec  = vec
+
+template<class l,class r>
+inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal*rhs._internal)>
+{
+  typedef iScalar<decltype(lhs._internal*rhs._internal)> ret_t;
+  ret_t ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
+{
+  typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t;
+  iMatrix<ret_t,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iScalar<r>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
+{
+  typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t;
+
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mult(&ret._internal[c1][c2],&lhs._internal[c1][c2],&rhs._internal);
+  }}
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
+{
+  typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    mult(&ret._internal[c1][c2],&lhs._internal,&rhs._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
+{
+  typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    mult(&ret._internal[c1],&lhs._internal[c1][0],&rhs._internal[0]);
+    for(int c2=1;c2<N;c2++){
+      mac(&ret._internal[c1],&lhs._internal[c1][c2],&rhs._internal[c2]);
+    }
+  }
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
+{
+  typedef decltype(lhs._internal*rhs._internal[0]) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    mult(&ret._internal[c1],&lhs._internal,&rhs._internal[c1]);
+  }
+  return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
+{
+  typedef decltype(lhs._internal[0]*rhs._internal) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    mult(&ret._internal[c1],&lhs._internal[c1],&rhs._internal);
+  }
+  return ret;
+}
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // localInnerProduct Scalar x Scalar -> Scalar
+ // localInnerProduct Vector x Vector -> Scalar
+ // localInnerProduct Matrix x Matrix -> Scalar
+ ///////////////////////////////////////////////////////////////////////////////////////
+ template<class l,class r,int N> inline
+ auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
+ {
+   typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+   iScalar<ret_t> ret=zero;
+   for(int c1=0;c1<N;c1++){
+     ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
+   }
+   return ret;
+ }
+ template<class l,class r,int N> inline
+ auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
+ {
+   typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+   iScalar<ret_t> ret=zero;
+   for(int c1=0;c1<N;c1++){
+   for(int c2=0;c2<N;c2++){
+     ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+   }}
+   return ret;
+ }
+ template<class l,class r> inline
+ auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
+ {
+   typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
+   iScalar<ret_t> ret;
+   ret._internal = localInnerProduct(lhs._internal,rhs._internal);
+   return ret;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // outerProduct Scalar x Scalar -> Scalar
+ //              Vector x Vector -> Matrix
+ ///////////////////////////////////////////////////////////////////////////////////////
+
+template<class l,class r,int N> inline
+auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
+{
+  typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
+  }}
+  return ret;
+}
+template<class l,class r> inline
+auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
+{
+  typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
+  iScalar<ret_t> ret;
+  ret._internal = outerProduct(lhs._internal,rhs._internal);
+  return ret;
+}
+
+inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
+{
+  return l*r;
+}
+inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
+{
+  return l*r;
+}
+inline RealF outerProduct(const RealF &l, const RealF& r)
+{
+  return l*r;
+}
+inline RealD outerProduct(const RealD &l, const RealD& r)
+{
+  return l*r;
+}
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////  CONJ  ///////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conj function for scalar, vector, matrix
+template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
+{
+  iScalar<vtype> ret;
+  ret._internal = conj(r._internal);
+  return ret;
+}
+
+// Adj function for scalar, vector, matrix
+template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
+{
+  iScalar<vtype> ret;
+  ret._internal = adj(r._internal);
+  return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> ret;
+  for(int i=0;i<N;i++){
+    ret._internal[i] = adj(r._internal[i]);
+  }
+  return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2]=adj(arg._internal[c2][c1]);
+  }}
+  return ret;
+}
+
+template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
+{
+  iScalar<decltype(real(z._internal))> ret;
+  ret._internal = real(z._internal);
+  return ret;
+}
+template<class itype,int N> inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
+{
+  iMatrix<decltype(real(z._internal[0][0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = real(z._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class itype,int N> inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
+{
+  iVector<decltype(real(z._internal[0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+    ret._internal[c1] = real(z._internal[c1]);
+  }
+  return ret;
+}
+
+template<class itype> inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
+{
+  iScalar<decltype(imag(z._internal))> ret;
+  ret._internal = imag(z._internal);
+  return ret;
+}
+template<class itype,int N> inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
+{
+  iMatrix<decltype(imag(z._internal[0][0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = imag(z._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
+{
+  iVector<decltype(imag(z._internal[0])),N> ret;
+  for(int c1=0;c1<N;c1++){
+    ret._internal[c1] = imag(z._internal[c1]);
+  }
+  return ret;
+}
+
+template<class vtype,int N>
+inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
+{
+  iScalar<decltype(trace(arg._internal[0][0]))> ret;
+  ZeroIt(ret._internal);
+  for(int i=0;i<N;i++){
+    ret._internal=ret._internal+trace(arg._internal[i][i]);
+  }
+  return ret;
+}
+template<class vtype>
+inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
+{
+  iScalar<decltype(trace(arg._internal))> ret;
+  ret._internal=trace(arg._internal);
+  return ret;
+}
+};
+/////////////////////////////////////////////////////////////////////////
+// Generic routine to promote object -> object
+// Supports the array reordering transformation that gives me SIMD utilisation
+/////////////////////////////////////////////////////////////////////////
+/*
+template<template<class> class object>
+inline object<vComplex> splat(object<Complex >s){
+  object<vComplex> ret;
+  vComplex * v_ptr = (vComplex *)& ret;
+  Complex * s_ptr = (Complex *) &s;
+  for(int i=0;i<sizeof(ret)/sizeof(vComplex);i++){
+    vsplat(v_ptr[i],s_ptr[i]);
+  }
+  return ret;
+}
+*/
+#endif
diff --git a/Grid_signal.cc b/Grid_signal.cc
diff --git a/Grid_simd.h b/Grid_simd.h
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -1,9 +1,36 @@
+// TODO
+//
+// Base class to share common code between vRealF, VComplexF etc...
+//
+// lattice broadcast assignment
+//
+// where() support
+// implement with masks, and/or? Type of the mask & boolean support?
+//
+// Unary functions
+// cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg
+// exp, log, sqrt, fabs
+//
+// transposeColor, transposeSpin,
+// adjColor, adjSpin,
+// traceColor, traceSpin.
+// peekColor, peekSpin + pokeColor PokeSpin
+//
+// copyMask.
+//
+// localMaxAbs
+//
+// norm2,
+// sumMulti equivalent.
+// Fourier transform equivalent.
+//
+
 namespace dpo {
@@ -21,7 +48,46 @@ namespace dpo {
     typedef std::complex<RealD> ComplexD;
     typedef std::complex<Real>  Complex;
 
-
+
+    inline RealF adj(const RealF & r){ return r; }
+    inline RealF conj(const RealF & r){ return r; }
+    inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
+    inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+    inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
+    inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //Provide support functions for basic real and complex data types required by dpo
+    //Single and double precision versions. Should be able to template this once only.
+    ////////////////////////////////////////////////////////////////////////////////
+    inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
+    inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
+    inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
+    inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
+    inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
+    // conj already supported for complex
+
+    inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+    inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
+    inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
+    inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
+    inline Complex adj(const Complex& r ){ return(conj(r)); }
+    //conj already supported for complex
+
+    inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
+    inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
+    inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
+    inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
+    inline RealD adj(const RealD & r){ return r; } // No-op for real
+    inline RealD conj(const RealD & r){ return r; }
+
+    inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+    inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
+    inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
+    inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
+
+
     class Zero{};
     static Zero zero;
     template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
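
[Every tensor operation in this patch ultimately lands on these scalar kernels: mult/mac/sub/add on iScalar/iVector/iMatrix peel one index level per call until they reach the ComplexF/ComplexD/RealF/RealD overloads. A self-contained sketch of the recursion bottoming out, with plain std::complex standing in for the dpo typedefs:

#include <complex>
#include <cstdio>
typedef std::complex<double> ComplexD;

// The leaf kernel, as in the hunk above.
inline void mac(ComplexD *y,const ComplexD *a,const ComplexD *x){ *y=(*a)*(*x)+(*y); }

// One recursive step in the style of mac(iMatrix,iMatrix,iMatrix):
// an N=2 matrix-of-ComplexD multiply-accumulate unrolled to leaf calls.
int main(void){
  ComplexD y[2][2]={}, a[2][2], x[2][2];
  for(int i=0;i<2;i++) for(int j=0;j<2;j++){ a[i][j]=ComplexD(i+1,0); x[i][j]=ComplexD(0,j+1); }
  for(int c1=0;c1<2;c1++)
  for(int c2=0;c2<2;c2++)
  for(int c3=0;c3<2;c3++)
    mac(&y[c1][c2],&a[c1][c3],&x[c3][c2]);   // y += a*x, leaf by leaf
  printf("y[0][0] = (%g,%g)\n", y[0][0].real(), y[0][0].imag());
  return 0;
}
]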