Better organisation

2025-08-10 16:27:05 +01:00 · 2015-03-04 05:31:44 +00:00
parent ad1b9b6ccf
commit 1a1474b323
8 changed files with 1161 additions and 1213 deletions
--- a/Grid.h
+++ b/Grid.h
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -1,5 +1,7 @@
+#ifndef GRID_LATTICE_H
+#define GRID_LATTICE_H
+
 #include "Grid.h"
-#include "Grid_vComplexD.h"

 namespace dpo {

@@ -9,7 +11,7 @@ class Lattice
 public:
    Grid *_grid;
    int checkerboard;
-    std::vector<vobj,myallocator<vobj> > _odata;
+    std::vector<vobj,alignedAllocator<vobj> > _odata;

 public:
    
@@ -554,95 +556,5 @@ public:
        return ret;
    }

-    
-namespace QCD {
-
-    static const int Nc=3;
-    static const int Ns=4;
-
-    static const int CbRed  =0;
-    static const int CbBlack=1;
-
-    // QCD iMatrix types
-    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
-    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
-    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
-
-    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
-
-    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
-    template<typename vtype> using iColourVector     = iScalar<iVector<vtype, Nc> >;
-    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
-
-    typedef iSinglet<Complex >          TComplex;    // This is painful. Tensor singlet complex type.
-    typedef iSinglet<vComplex >         vTComplex;
-    typedef iSinglet<Real >          TReal;    // This is painful. Tensor singlet complex type.
-
-    typedef iSpinMatrix<Complex >       SpinMatrix;
-    typedef iColourMatrix<Complex >     ColourMatrix;
-    typedef iSpinColourMatrix<Complex > SpinColourMatrix;
-
-    typedef iSpinVector<Complex >       SpinVector;
-    typedef iColourVector<Complex >     ColourVector;
-    typedef iSpinColourVector<Complex > SpinColourVector;
-
-    
-    typedef iSpinMatrix<vComplex >       vSpinMatrix;
-    typedef iColourMatrix<vComplex >     vColourMatrix;
-    typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
-    
-    typedef iSpinVector<vComplex >       vSpinVector;
-    typedef iColourVector<vComplex >     vColourVector;
-    typedef iSpinColourVector<vComplex > vSpinColourVector;
-
-    
-    typedef Lattice<vTComplex>         LatticeComplex;
-    
-    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
-    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
-    typedef Lattice<vSpinColourMatrix> LatticePropagator;
-    typedef LatticePropagator LatticeSpinColourMatrix;
-
-    typedef Lattice<vSpinColourVector> LatticeFermion;
-    typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
-    typedef Lattice<vSpinVector>       LatticeSpinVector;
-    typedef Lattice<vColourVector>     LatticeColourVector;
-
-    // localNorm2,
-    template<class tt>
-    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=trace(adj(rhs)*rhs);
-        }
-        return ret;
-    }
-    // localInnerProduct
-    template<class tt>
-    inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-    
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
-    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-     }
-}   //namespace QCD
-
 }
+#endif
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -0,0 +1,94 @@
+#ifndef GRID_QCD_H
+#define GRID_QCD_H
+namespace dpo{
+namespace QCD {
+
+    static const int Nc=3;
+    static const int Ns=4;
+
+    static const int CbRed  =0;
+    static const int CbBlack=1;
+
+    // QCD iMatrix types
+    template<typename vtype> using iSinglet          = iScalar<iScalar<vtype> > ;
+    template<typename vtype> using iSpinMatrix       = iMatrix<iScalar<vtype>, Ns>;
+    template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
+
+    template<typename vtype> using iColourMatrix     = iScalar<iMatrix<vtype, Nc>> ;
+
+    template<typename vtype> using iSpinVector       = iVector<iScalar<vtype>, Ns>;
+    template<typename vtype> using iColourVector     = iScalar<iVector<vtype, Nc> >;
+    template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
+
+    typedef iSinglet<Complex >          TComplex;    // This is painful. Tensor singlet complex type.
+    typedef iSinglet<vComplex >         vTComplex;
+    typedef iSinglet<Real >          TReal;    // This is painful. Tensor singlet complex type.
+
+    typedef iSpinMatrix<Complex >       SpinMatrix;
+    typedef iColourMatrix<Complex >     ColourMatrix;
+    typedef iSpinColourMatrix<Complex > SpinColourMatrix;
+
+    typedef iSpinVector<Complex >       SpinVector;
+    typedef iColourVector<Complex >     ColourVector;
+    typedef iSpinColourVector<Complex > SpinColourVector;
+
+    
+    typedef iSpinMatrix<vComplex >       vSpinMatrix;
+    typedef iColourMatrix<vComplex >     vColourMatrix;
+    typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
+    
+    typedef iSpinVector<vComplex >       vSpinVector;
+    typedef iColourVector<vComplex >     vColourVector;
+    typedef iSpinColourVector<vComplex > vSpinColourVector;
+
+    
+    typedef Lattice<vTComplex>         LatticeComplex;
+    
+    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
+    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
+    typedef Lattice<vSpinColourMatrix> LatticePropagator;
+    typedef LatticePropagator LatticeSpinColourMatrix;
+
+    typedef Lattice<vSpinColourVector> LatticeFermion;
+    typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
+    typedef Lattice<vSpinVector>       LatticeSpinVector;
+    typedef Lattice<vColourVector>     LatticeColourVector;
+
+    // localNorm2,
+    template<class tt>
+    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
+    {
+        LatticeComplex ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=trace(adj(rhs)*rhs);
+        }
+        return ret;
+    }
+    // localInnerProduct
+    template<class tt>
+    inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
+    {
+        LatticeComplex ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+    }
+    
+    // outerProduct Scalar x Scalar -> Scalar
+    //              Vector x Vector -> Matrix
+    template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+    {
+        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+     }
+}   //namespace QCD
+} // dpo
+#endif
--- a/Grid_aligned_allocator.h
+++ b/Grid_aligned_allocator.h
@@ -0,0 +1,56 @@
+#ifndef GRID_ALIGNED_ALLOCATOR_H
+#define GRID_ALIGNED_ALLOCATOR_H
+namespace dpo {
+
+////////////////////////////////////////////////////////////////////
+// A lattice of something, but assume the something is SIMDized.
+////////////////////////////////////////////////////////////////////
+template<typename _Tp>
+class alignedAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
+  alignedAllocator() throw() { }
+  alignedAllocator(const alignedAllocator&) throw() { }
+  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
+  ~alignedAllocator() throw() { }
+  pointer address(reference __x) const { return &__x; }
+  const_pointer address(const_reference __x) const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+  // Should override allocate and deallocate
+  pointer allocate(size_type __n, const void* = 0)
+  { 
+    //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
+    // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#ifdef AVX512
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#else
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#endif
+
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) { 
+    free(__p); 
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+
+template<typename _Tp>  inline bool
+operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+
+template<typename _Tp>  inline bool
+operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+
+    
+}; // namespace dpo
+#endif
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -0,0 +1,101 @@
+/* Grid_config.h.  Generated from Grid_config.h.in by configure.  */
+/* Grid_config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* AVX */
+#define AVX1 1
+
+/* AVX2 */
+/* #undef AVX2 */
+
+/* AVX512 */
+/* #undef AVX512 */
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <malloc.h> header file. */
+/* #undef HAVE_MALLOC_H */
+
+/* Define to 1 if you have the <malloc/malloc.h> header file. */
+#define HAVE_MALLOC_MALLOC_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the `aligned' variable attribute */
+#define HAVE_VAR_ATTRIBUTE_ALIGNED 1
+
+/* Name of package */
+#define PACKAGE "grid"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "Grid"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "Grid 1.0"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "grid"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "1.0"
+
+/* SSE2 */
+/* #undef SSE2 */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Version number of package */
+#define VERSION "1.0"
+
+/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT32_T */
+
+/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT64_T */
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* Define to the type of an unsigned integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint32_t */
+
+/* Define to the type of an unsigned integer type of width exactly 64 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint64_t */
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -0,0 +1,819 @@
+#ifndef GRID_MATH_TYPES_H
+#define GRID_MATH_TYPES_H
+namespace dpo {
+///////////////////////////////////////////////////
+// Scalar, Vector, Matrix objects.
+// These can be composed to form tensor products of internal indices.
+///////////////////////////////////////////////////
+    
+template<class vtype> class iScalar
+{
+public:
+  SIMDalign vtype _internal;
+    iScalar(){};
+    iScalar(Zero &z){ *this = zero; };
+    iScalar<vtype> & operator= (const Zero &hero){
+        zeroit(*this);
+        return *this;
+    }
+    friend void zeroit(iScalar<vtype> &that){
+        zeroit(that._internal);
+    }
+    // Unary negation
+    friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
+        iScalar<vtype> ret;
+        ret._internal= -r._internal;
+        return ret;
+    }
+    // *=,+=,-= operators
+    inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
+        *this = (*this)*r;
+        return *this;
+    }
+    inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
+        *this = (*this)-r;
+        return *this;
+    }
+    inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
+        *this = (*this)+r;
+        return *this;
+    }
+    
+
+};
+    
+template<class vtype,int N> class iVector
+{
+public:
+  SIMDalign vtype _internal[N];
+    iVector(Zero &z){ *this = zero; };
+    iVector() {};
+    iVector<vtype,N> & operator= (Zero &hero){
+        zeroit(*this);
+        return *this;
+    }
+    friend void zeroit(iVector<vtype,N> &that){
+        for(int i=0;i<N;i++){
+            zeroit(that._internal[i]);
+        }
+    }
+    // Unary negation
+    friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
+        iVector<vtype,N> ret;
+        for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
+        return ret;
+    }
+    // *=,+=,-= operators
+    inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
+        *this = (*this)*r;
+        return *this;
+    }
+    inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
+        *this = (*this)-r;
+        return *this;
+    }
+    inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
+        *this = (*this)+r;
+        return *this;
+    }
+
+};
+    
+    
+template<class vtype,int N> class iMatrix
+{
+public:
+  SIMDalign    vtype _internal[N][N];
+    iMatrix(Zero &z){ *this = zero; };
+    iMatrix() {};
+    iMatrix<vtype,N> & operator= (Zero &hero){
+        zeroit(*this);
+        return *this;
+    }
+    friend void zeroit(iMatrix<vtype,N> &that){
+        for(int i=0;i<N;i++){
+        for(int j=0;j<N;j++){
+                zeroit(that._internal[i][j]);
+        }}
+    }
+    // Unary negation
+    friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
+        iMatrix<vtype,N> ret;
+        for(int i=0;i<N;i++){
+        for(int j=0;j<N;j++){
+            ret._internal[i][j]= -r._internal[i][j];
+        }}
+        return ret;
+    }
+    // *=,+=,-= operators
+    template<class T>
+    inline iMatrix<vtype,N> &operator *=(const T &r) {
+        *this = (*this)*r;
+        return *this;
+    }
+    template<class T>
+    inline iMatrix<vtype,N> &operator -=(const T &r) {
+        *this = (*this)-r;
+        return *this;
+    }
+    template<class T>
+    inline iMatrix<vtype,N> &operator +=(const T &r) {
+        *this = (*this)+r;
+        return *this;
+    }
+
+};
+
+    
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////// ADD         ///////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    
+
+// ADD is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+template<class vtype,class ltype,class rtype> inline void add(iScalar<vtype> * __restrict__ ret,
+                                                              const iScalar<ltype> * __restrict__ lhs,
+                                                              const iScalar<rtype> * __restrict__ rhs)
+{
+    add(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+template<class vtype,class ltype,class rtype,int N> inline void add(iVector<vtype,N> * __restrict__ ret,
+                                                                    const iVector<ltype,N> * __restrict__ lhs,
+                                                                    const iVector<rtype,N> * __restrict__ rhs)
+{
+    for(int c=0;c<N;c++){
+        ret->_internal[c]=lhs->_internal[c]+rhs->_internal[c];
+    }
+    return;
+}
+template<class vtype,class ltype,class rtype, int N> inline  void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                      const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                      const iMatrix<rtype,N> * __restrict__ rhs)
+{
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
+    }}
+    return;
+}
+template<class vtype,class ltype,class rtype, int N> inline  void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                      const iScalar<ltype>   * __restrict__ lhs,
+                                                                      const iMatrix<rtype,N> * __restrict__ rhs)
+{
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+    }}
+    return;
+}
+template<class vtype,class ltype,class rtype, int N> inline  void add(iMatrix<vtype,N> * __restrict__ ret,
+                                                                      const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                      const iScalar<rtype>   * __restrict__ rhs)
+{
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        if ( c1==c2)
+            add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+        else
+            ret->_internal[c1][c2]=lhs->_internal[c1][c2];
+    }}
+    return;
+}
+// Need to figure multi-precision.
+template<class Mytype>  Mytype timesI(Mytype &r)
+{
+    iScalar<Complex> i;
+    i._internal = Complex(0,1);
+    return r*i;
+}
+
+                // + operator for scalar, vector, matrix
+template<class ltype,class rtype>
+//inline auto operator + (iScalar<ltype>& lhs,iScalar<rtype>&& rhs) -> iScalar<decltype(lhs._internal + rhs._internal)>
+inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal + rhs._internal)>
+{
+    typedef iScalar<decltype(lhs._internal+rhs._internal)> ret_t;
+    ret_t ret;
+    add(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
+{
+    typedef iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> ret_t;
+    ret_t ret;
+    add(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
+{
+    typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> ret_t;
+    ret_t ret;
+    add(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
+{
+    typedef iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> ret_t;
+    ret_t ret;
+    add(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
+{
+    typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> ret_t;
+    ret_t ret;
+    add(&ret,&lhs,&rhs);
+    return ret;
+}
+
+
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////// SUB         ///////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    
+
+// SUB is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+// Matrix /- scalar
+template<class vtype,class ltype,class rtype> inline void sub(iScalar<vtype> * __restrict__ ret,
+                                                              const iScalar<ltype> * __restrict__ lhs,
+                                                              const iScalar<rtype> * __restrict__ rhs)
+{
+    sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class vtype,class ltype,class rtype,int N> inline void sub(iVector<vtype,N> * __restrict__ ret,
+                                                                    const iVector<ltype,N> * __restrict__ lhs,
+                                                                    const iVector<rtype,N> * __restrict__ rhs)
+{
+    for(int c=0;c<N;c++){
+        ret->_internal[c]=lhs->_internal[c]-rhs->_internal[c];
+    }
+    return;
+}
+template<class vtype,class ltype,class rtype, int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                     const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                     const iMatrix<rtype,N> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
+    }}
+    return;
+}
+template<class vtype,class ltype,class rtype, int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                     const iScalar<ltype> * __restrict__ lhs,
+                                                                     const iMatrix<rtype,N> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        if ( c1!=c2) {
+            sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+        } else {
+            // Fails -- need unary minus. Catalogue other unops?
+            ret->_internal[c1][c2]=zero;
+            ret->_internal[c1][c2]=ret->_internal[c1][c2]-rhs->_internal[c1][c2];
+
+        }
+    }}
+    return;
+}
+template<class vtype,class ltype,class rtype, int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+                                                                     const iMatrix<ltype,N> * __restrict__ lhs,
+                                                                     const iScalar<rtype> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        if ( c1!=c2)
+            sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+        else
+            ret->_internal[c1][c2]=lhs->_internal[c1][c2];
+    }}
+    return;
+}
+
+template<class v> void vprefetch(const iScalar<v> &vv)
+{
+  vprefetch(vv._internal);
+}
+template<class v,int N> void vprefetch(const iVector<v,N> &vv)
+{
+  for(int i=0;i<N;i++){
+    vprefetch(vv._internal[i]);
+  }
+}
+template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
+{
+  for(int i=0;i<N;i++){
+  for(int j=0;j<N;j++){
+    vprefetch(vv._internal[i][j]);
+  }}
+}
+
+    // - operator for scalar, vector, matrix
+template<class ltype,class rtype> inline auto
+operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal - rhs._internal)>
+{
+    typedef iScalar<decltype(lhs._internal-rhs._internal)> ret_t;
+    ret_t ret;
+    sub(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
+{
+    typedef iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> ret_t;
+    ret_t ret;
+    sub(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
+{
+    typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> ret_t;
+    ret_t ret;
+    sub(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
+{
+    typedef iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> ret_t;
+    ret_t ret;
+    sub(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class ltype,class rtype,int N>
+inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
+{
+    typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> ret_t;
+    ret_t ret;
+    sub(&ret,&lhs,&rhs);
+    return ret;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////// MAC         ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+    ///////////////////////////
+    // Legal multiplication table
+    ///////////////////////////
+    // scal x scal = scal
+    // mat x  mat  = mat
+    // mat  x scal = mat
+    // scal x mat  = mat
+    // mat  x vec  = vec
+    // vec  x scal = vec
+    // scal x vec  = vec
+    ///////////////////////////
+template<class rtype,class vtype,class mtype>
+inline  void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
+{
+    mac(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+    for(int c3=0;c3<N;c3++){
+        mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+    }}}
+    return;
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+    }}
+    return;
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+    }}
+    return;
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mac(iVector<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iVector<rtype,N> * __restrict__ rhs)
+{
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
+    }}
+    return;
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mac(iVector<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iVector<rtype,N> * __restrict__ rhs)
+{
+    for(int c1=0;c1<N;c1++){
+        mac(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
+    }
+    return;
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mac(iVector<rrtype,N> * __restrict__ ret,const iVector<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs)
+{
+    for(int c1=0;c1<N;c1++){
+        mac(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
+    }
+    return;
+}
+
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////// MUL         ///////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+    
+template<class rtype,class vtype,class mtype>
+inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){
+    mult(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class rrtype,class ltype,class rtype,int N>
+inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
+        for(int c3=1;c3<N;c3++){
+            mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+        }
+    }}
+    return;
+}
+template<class rrtype,class ltype,class rtype,int N>
+inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
+    }}
+    return;
+}
+
+template<class rrtype,class ltype,class rtype, int N>
+inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iScalar<ltype>   * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+    for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+    }}
+    return;
+}
+// Matrix left multiplies vector
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<mtype,N> * __restrict__ lhs,const iVector<vtype,N> * __restrict__ rhs)
+{
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]);
+        for(int c2=1;c2<N;c2++){
+            mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
+        }
+    }
+    return;
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,
+                 const iScalar<mtype>   * __restrict__ lhs,
+                 const iVector<vtype,N> * __restrict__ rhs){
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
+    }
+}
+template<class rtype,class vtype,class mtype,int N>
+inline void mult(iVector<rtype,N> * __restrict__ ret,
+                 const iVector<vtype,N> * __restrict__ rhs,
+                 const iScalar<mtype> * __restrict__ lhs){
+    mult(ret,lhs,rhs);
+}
+    
+
+
+template<class rtype,class vtype,class mtype,int N> inline
+iVector<rtype,N> operator * (const iMatrix<mtype,N>& lhs,const iVector<vtype,N>& rhs)
+{
+    iVector<rtype,N> ret;
+    mult(&ret,&lhs,&rhs);
+    return ret;
+}
+
+template<class rtype,class vtype,class mtype,int N> inline
+iVector<rtype,N> operator * (const iScalar<mtype>& lhs,const iVector<vtype,N>& rhs)
+{
+    iVector<rtype,N> ret;
+    mult(&ret,&lhs,&rhs);
+    return ret;
+}
+
+template<class rtype,class vtype,class mtype,int N> inline
+iVector<rtype,N> operator * (const iVector<mtype,N>& lhs,const iScalar<vtype>& rhs)
+{
+    iVector<rtype,N> ret;
+    mult(&ret,&lhs,&rhs);
+    return ret;
+}
+    
+    //////////////////////////////////////////////////////////////////
+    // Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
+    // since nesting matrix<scalar> x matrix<matrix>-> matrix<matrix>
+    // while         matrix<scalar> x matrix<scalar>-> matrix<scalar>
+    // so return type depends on argument types in nasty way.
+    //////////////////////////////////////////////////////////////////
+    // scal x scal = scal
+    // mat x  mat  = mat
+    // mat  x scal = mat
+    // scal x mat  = mat
+    // mat  x vec  = vec
+    // vec  x scal = vec
+    // scal x vec  = vec
+    
+template<class l,class r>
+inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal * rhs._internal)>
+{
+    typedef iScalar<decltype(lhs._internal*rhs._internal)> ret_t;
+    ret_t ret;
+    mult(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
+{
+    typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t;
+    iMatrix<ret_t,N> ret;
+    mult(&ret,&lhs,&rhs);
+    return ret;
+}
+template<class l,class r, int N> inline
+auto operator * (const iMatrix<r,N>& lhs,const iScalar<l>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
+{
+    typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t;
+        
+    iMatrix<ret_t,N> ret;
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        mult(&ret._internal[c1][c2],&lhs._internal[c1][c2],&rhs._internal);
+    }}
+    return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
+{
+    typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t;
+    iMatrix<ret_t,N> ret;
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        mult(&ret._internal[c1][c2],&lhs._internal,&rhs._internal[c1][c2]);
+    }}
+    return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
+{
+    typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t;
+    iVector<ret_t,N> ret;
+    for(int c1=0;c1<N;c1++){
+        mult(&ret._internal[c1],&lhs._internal[c1][0],&rhs._internal[0]);
+        for(int c2=1;c2<N;c2++){
+            mac(&ret._internal[c1],&lhs._internal[c1][c2],&rhs._internal[c2]);
+        }
+    }
+    return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
+{
+    typedef decltype(lhs._internal*rhs._internal[0]) ret_t;
+    iVector<ret_t,N> ret;
+    for(int c1=0;c1<N;c1++){
+        mult(&ret._internal[c1],&lhs._internal,&rhs._internal[c1]);
+    }
+    return ret;
+}
+template<class l,class r,int N> inline
+auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
+{
+    typedef decltype(lhs._internal[0]*rhs._internal) ret_t;
+    iVector<ret_t,N> ret;
+    for(int c1=0;c1<N;c1++){
+        mult(&ret._internal[c1],&lhs._internal[c1],&rhs._internal);
+    }
+    return ret;
+}
+    ///////////////////////////////////////////////////////////////////////////////////////
+    // localInnerProduct Scalar x Scalar -> Scalar
+    // localInnerProduct Vector x Vector -> Scalar
+    // localInnerProduct Matrix x Matrix -> Scalar
+    ///////////////////////////////////////////////////////////////////////////////////////
+    template<class l,class r,int N> inline
+    auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
+    {
+        typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+        iScalar<ret_t> ret=zero;
+        for(int c1=0;c1<N;c1++){
+            ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
+        }
+        return ret;
+    }
+    template<class l,class r,int N> inline
+    auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
+    {
+        typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+        iScalar<ret_t> ret=zero;
+        for(int c1=0;c1<N;c1++){
+        for(int c2=0;c2<N;c2++){
+            ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+        }}
+        return ret;
+    }
+    template<class l,class r> inline
+    auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
+    {
+        typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
+        iScalar<ret_t> ret;
+        ret._internal = localInnerProduct(lhs._internal,rhs._internal);
+        return ret;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////////////
+    // outerProduct Scalar x Scalar -> Scalar
+    //              Vector x Vector -> Matrix
+    ///////////////////////////////////////////////////////////////////////////////////////
+
+template<class l,class r,int N> inline
+auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
+{
+    typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+    iMatrix<ret_t,N> ret;
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
+    }}
+    return ret;
+}
+template<class l,class r> inline
+auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
+{
+    typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
+    iScalar<ret_t> ret;
+    ret._internal = outerProduct(lhs._internal,rhs._internal);
+    return ret;
+}
+
+inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
+{
+  return l*r;
+}
+inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
+{
+  return l*r;
+}
+inline RealF outerProduct(const RealF &l, const RealF& r)
+{
+  return l*r;
+}
+inline RealD outerProduct(const RealD &l, const RealD& r)
+{
+  return l*r;
+}
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////// CONJ         ///////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+ 
+// Conj function for scalar, vector, matrix
+template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
+{
+    iScalar<vtype> ret;
+    ret._internal = conj(r._internal);
+    return ret;
+}
+
+// Adj function for scalar, vector, matrix
+template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
+{
+    iScalar<vtype> ret;
+    ret._internal = adj(r._internal);
+    return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
+{
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){
+        ret._internal[i] = adj(r._internal[i]);
+    }
+    return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
+{
+    iMatrix<vtype,N> ret;
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        ret._internal[c1][c2]=adj(arg._internal[c2][c1]);
+    }}
+    return ret;
+}
+
+/////////////////////////////////////////////////////////////////
+// Can only take the real/imag part of scalar objects, since
+// lattice objects of different complexity are non-conformable.
+/////////////////////////////////////////////////////////////////
+template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
+{
+    iScalar<decltype(real(z._internal))> ret;
+    ret._internal = real(z._internal);
+    return ret;
+}
+template<class itype,int N> inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
+{
+    iMatrix<decltype(real(z._internal[0][0])),N> ret;
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        ret._internal[c1][c2] = real(z._internal[c1][c2]);
+    }}
+    return ret;
+}
+template<class itype,int N> inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
+{
+    iVector<decltype(real(z._internal[0])),N> ret;
+    for(int c1=0;c1<N;c1++){
+        ret._internal[c1] = real(z._internal[c1]);
+    }
+    return ret;
+}
+    
+template<class itype> inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
+{
+    iScalar<decltype(imag(z._internal))> ret;
+    ret._internal = imag(z._internal);
+    return ret;
+}
+template<class itype,int N> inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
+{
+    iMatrix<decltype(imag(z._internal[0][0])),N> ret;
+    for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+        ret._internal[c1][c2] = imag(z._internal[c1][c2]);
+    }}
+    return ret;
+}
+template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
+{
+    iVector<decltype(imag(z._internal[0])),N> ret;
+    for(int c1=0;c1<N;c1++){
+        ret._internal[c1] = imag(z._internal[c1]);
+    }
+    return ret;
+}
+
+    /////////////////////////////////
+    // Trace of scalar and matrix
+    /////////////////////////////////
+
+inline Complex trace( const Complex &arg){
+    return arg;
+}
+//inline vComplex trace(const vComplex &arg){
+//    return arg;
+//}
+template<class vtype,int N>
+inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
+{
+    iScalar<decltype( trace(arg._internal[0][0] )) > ret;
+    ZeroIt(ret._internal);
+    for(int i=0;i<N;i++){
+        ret._internal=ret._internal+trace(arg._internal[i][i]);
+    }
+    return ret;
+}
+template<class vtype>
+inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
+{
+    iScalar<decltype(trace(arg._internal))> ret;
+    ret._internal=trace(arg._internal);
+    return ret;
+}
+};
+/////////////////////////////////////////////////////////////////////////
+// Generic routine to promote object<complex> -> object<vcomplex>
+// Supports the array reordering transformation that gives me SIMD utilisation
+/////////////////////////////////////////////////////////////////////////
+/*
+template<template<class> class object>
+inline object<vComplex> splat(object<Complex >s){
+    object<vComplex> ret;
+    vComplex * v_ptr = (vComplex *)& ret;
+    Complex * s_ptr = (Complex *) &s;
+    for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
+        vsplat(*(v_ptr++),*(s_ptr++));
+    }
+    return ret;
+}
+*/
+    
+#endif
--- a/Grid_signal.cc
+++ b/Grid_signal.cc
@@ -16,13 +16,15 @@
 #undef __X86_64
 namespace dpo {

-  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
-  void Grid_debug_handler_init(void);
-
-  void Grid_init(void)
-  {
-    Grid_debug_handler_init();
-  }
+void Grid_init(void)
+{
+  Grid_debug_handler_init();
+}
+double usecond(void) {
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+}

 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -10,6 +10,33 @@
 //
 // Vector types are arch dependent
 ////////////////////////////////////////////////////////////////////////
+    // TODO
+    //
+    // Base class to share common code between vRealF, VComplexF etc...
+    //
+    // lattice Broad cast assignment
+    //
+    // where() support
+    // implement with masks, and/or? Type of the mask & boolean support?
+    //
+    // Unary functions
+    // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
+    // exp, log, sqrt, fabs
+    //
+    // transposeColor, transposeSpin,
+    // adjColor, adjSpin,
+    // traceColor, traceSpin.
+    // peekColor, peekSpin + pokeColor PokeSpin
+    //
+    // copyMask.
+    //
+    // localMaxAbs
+    //
+    // norm2,
+    // sumMulti equivalent.
+    // Fourier transform equivalent.
+    //
+    

 namespace dpo {

@@ -21,7 +48,46 @@ namespace dpo {
  typedef std::complex<RealD> ComplexD;
  typedef std::complex<Real>  Complex;
  
-  
+
+  inline RealF adj(const RealF  & r){ return r; }
+  inline RealF conj(const RealF  & r){ return r; }
+  inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
+  inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+  inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
+  inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //Provide support functions for basic real and complex data types required by dpo
+    //Single and double precision versions. Should be able to template this once only.
+    ////////////////////////////////////////////////////////////////////////////////
+    inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
+    inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
+    inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
+    inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
+    inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
+    // conj already supported for complex
+    
+    inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+    inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
+    inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
+    inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
+    inline Complex  adj(const Complex& r ){ return(conj(r)); }
+    //conj already supported for complex
+    
+    inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){  *y = (*a) * (*x)+(*y);}
+    inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
+    inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
+    inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
+    inline RealD adj(const RealD & r){ return r; }  // No-op for real
+    inline RealD conj(const RealD & r){ return r; }
+    
+    inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){  *y = (*a) * (*x)+(*y); }
+    inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
+    inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
+    inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
+    
+
+
  class Zero{};
  static Zero zero;
  template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};